From ab15b1b82c1cc2ef2d0029db9faf913ce4ef2145 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:40:10 -0700 Subject: [PATCH 001/109] Add CUDA stream and event API for concurrent kernel execution Introduces qd.create_stream() and qd.create_event() for launching kernels on separate CUDA streams with event-based synchronization. The qd_stream kwarg on kernel calls routes the launch to a specific stream. Non-CUDA backends return no-op handles (0). Routes kernel launcher memory ops through the active stream. --- python/quadrants/lang/__init__.py | 2 + python/quadrants/lang/kernel.py | 16 +- python/quadrants/lang/stream.py | 96 +++++++++ quadrants/program/program.cpp | 93 +++++++++ quadrants/program/program.h | 10 + quadrants/python/export_lang.cpp | 11 +- .../rhi/cuda/cuda_driver_functions.inc.h | 2 + quadrants/runtime/cuda/kernel_launcher.cpp | 20 +- tests/python/test_api.py | 4 + tests/python/test_cache.py | 8 +- tests/python/test_streams.py | 197 ++++++++++++++++++ 11 files changed, 443 insertions(+), 16 deletions(-) create mode 100644 python/quadrants/lang/stream.py create mode 100644 tests/python/test_streams.py diff --git a/python/quadrants/lang/__init__.py b/python/quadrants/lang/__init__.py index dc4fb2cf19..43a4b44b89 100644 --- a/python/quadrants/lang/__init__.py +++ b/python/quadrants/lang/__init__.py @@ -15,6 +15,7 @@ from quadrants.lang.runtime_ops import * from quadrants.lang.snode import * from quadrants.lang.source_builder import * +from quadrants.lang.stream import * from quadrants.lang.struct import * from quadrants.types.enums import DeviceCapability, Format, Layout # noqa: F401 @@ -45,6 +46,7 @@ "shell", "snode", "source_builder", + "stream", "struct", "util", ] diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index af6dbdacb5..4b1578ac4b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -424,7 +424,9 @@ def materialize(self, key: "CompiledKernelKeyType | None", 
py_args: tuple[Any, . ] runtime._current_global_context = None - def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args) -> Any: + def launch_kernel( + self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args, qd_stream=None + ) -> Any: assert len(args) == len(self.arg_metas), f"{len(self.arg_metas)} arguments needed but {len(args)} provided" callbacks: list[Callable[[], None]] = [] @@ -503,7 +505,14 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled ) self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data - prog.launch_kernel(compiled_kernel_data, launch_ctx) + stream_handle = qd_stream.handle if qd_stream is not None else 0 + if stream_handle: + prog.set_current_cuda_stream(stream_handle) + try: + prog.launch_kernel(compiled_kernel_data, launch_ctx) + finally: + if stream_handle: + prog.set_current_cuda_stream(0) except Exception as e: e = handle_exception_from_cpp(e) if impl.get_runtime().print_full_traceback: @@ -547,6 +556,7 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut # Thus this part needs to be fast. (i.e. 
< 3us on a 4 GHz x64 CPU) @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: + qd_stream = kwargs.pop("qd_stream", None) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() @@ -578,7 +588,7 @@ def __call__(self, *py_args, **kwargs) -> Any: kernel_cpp = self.materialized_kernels[key] compiled_kernel_data = self.compiled_kernel_data_by_key.get(key, None) self.launch_observations.found_kernel_in_materialize_cache = compiled_kernel_data is not None - ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args) + ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args, qd_stream=qd_stream) if compiled_kernel_data is None: assert self._last_compiled_kernel_data is not None self.compiled_kernel_data_by_key[key] = self._last_compiled_kernel_data diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py new file mode 100644 index 0000000000..8530982455 --- /dev/null +++ b/python/quadrants/lang/stream.py @@ -0,0 +1,96 @@ +from quadrants.lang import impl + + +class Stream: + """Wraps a backend-specific GPU stream for concurrent kernel execution. + + On backends without native streams (e.g. CPU), this is a no-op object. + """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def synchronize(self): + """Block until all operations on this stream complete.""" + prog = impl.get_runtime().prog + prog.stream_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the stream. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.stream_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +class Event: + """Wraps a backend-specific GPU event for stream synchronization. + + On backends without native events (e.g. CPU), this is a no-op object. 
+ """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def record(self, stream: Stream | None = None): + """Record this event on a stream. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = stream.handle if stream is not None else 0 + prog.event_record(self._handle, stream_handle) + + def wait(self, qd_stream: Stream | None = None): + """Make a stream wait for this event. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = qd_stream.handle if qd_stream is not None else 0 + prog.stream_wait_event(stream_handle, self._handle) + + def synchronize(self): + """Block the host until this event has been reached.""" + prog = impl.get_runtime().prog + prog.event_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the event. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.event_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +def create_stream() -> Stream: + """Create a new GPU stream for concurrent kernel execution.""" + prog = impl.get_runtime().prog + handle = prog.stream_create() + return Stream(handle) + + +def create_event() -> Event: + """Create a new GPU event for stream synchronization.""" + prog = impl.get_runtime().prog + handle = prog.event_create() + return Event(handle) + + +__all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 7f5dfef2d8..9b2ff0886b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,6 +20,11 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + #ifdef QD_WITH_VULKAN #include 
"quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -481,4 +486,92 @@ void Program::enqueue_compute_op_lambda( program_impl_->enqueue_compute_op_lambda(op, image_refs); } +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDADriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, + 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_record( + reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void 
Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + } // namespace quadrants::lang diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 1fa2c2ac57..9568c371c8 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -300,6 +300,16 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } + uint64 stream_create(); + void stream_destroy(uint64 stream_handle); + void stream_synchronize(uint64 stream_handle); + void set_current_cuda_stream(uint64 stream_handle); + uint64 event_create(); + void event_destroy(uint64 event_handle); + void event_record(uint64 event_handle, uint64 stream_handle); + void event_synchronize(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all // the implementations should fall inside ProgramImpl diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index b3d23c0037..2f5da8b1b4 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -495,7 +495,16 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) - .def("get_device_caps", &Program::get_device_caps); + .def("get_device_caps", &Program::get_device_caps) + .def("stream_create", &Program::stream_create) + 
.def("stream_destroy", &Program::stream_destroy) + .def("stream_synchronize", &Program::stream_synchronize) + .def("set_current_cuda_stream", &Program::set_current_cuda_stream) + .def("event_create", &Program::event_create) + .def("event_destroy", &Program::event_destroy) + .def("event_record", &Program::event_record) + .def("event_synchronize", &Program::event_synchronize) + .def("stream_wait_event", &Program::stream_wait_event); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 25b3c7958e..a9690ca10b 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -20,6 +20,7 @@ PER_CUDA_FUNCTION(context_set_limit, cuCtxSetLimit, int, std::size_t); // Stream management PER_CUDA_FUNCTION(stream_create, cuStreamCreate, void **, uint32); +PER_CUDA_FUNCTION(stream_destroy, cuStreamDestroy_v2, void *); // Memory management PER_CUDA_FUNCTION(memcpy_host_to_device, cuMemcpyHtoD_v2, void *, void *, std::size_t); @@ -52,6 +53,7 @@ PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_a // Stream management PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); +PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); // Event management PER_CUDA_FUNCTION(event_create, cuEventCreate, void **, uint32) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 5eae5e747d..13845d5a9b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" +#include "quadrants/rhi/cuda/cuda_driver.h" namespace quadrants::lang { namespace cuda { @@ -43,10 +44,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, // kernels. 
std::unordered_map device_ptrs; + auto *active_stream = CUDAContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; CUDADriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); ctx.get_context().runtime = executor->get_llvm_runtime(); for (int i = 0; i < (int)parameters.size(); i++) { @@ -120,7 +123,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { @@ -129,10 +132,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().malloc_async((void **)&device_arg_buffer, - ctx.arg_buffer_size, nullptr); + ctx.arg_buffer_size, active_stream); CUDADriver::get_instance().memcpy_host_to_device_async( device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, - nullptr); + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } @@ -144,17 +147,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, {}); } if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { CUDADriver::get_instance().memcpy_device_to_host_async( host_result_buffer, device_result_buffer, ctx.result_buffer_size, - nullptr); + active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_result_buffer, + active_stream); // copy data back to host if (transfers.size() > 0) { - 
CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; CUDADriver::get_instance().memcpy_device_to_host( diff --git a/tests/python/test_api.py b/tests/python/test_api.py index cf12abc393..002014c960 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -59,6 +59,7 @@ def _get_expected_matrix_apis(): "DEBUG", "DeviceCapability", "ERROR", + "Event", "Field", "FieldsBuilder", "Format", @@ -73,6 +74,7 @@ def _get_expected_matrix_apis(): "SNode", "ScalarField", "ScalarNdarray", + "Stream", "Struct", "StructField", "TRACE", @@ -117,6 +119,8 @@ def _get_expected_matrix_apis(): "clock_freq_hz", "cos", "cpu", + "create_event", + "create_stream", "cuda", "data_oriented", "dataclass", diff --git a/tests/python/test_cache.py b/tests/python/test_cache.py index c3821e44c5..e31daf61e7 100644 --- a/tests/python/test_cache.py +++ b/tests/python/test_cache.py @@ -216,11 +216,11 @@ def test_fastcache(tmp_path: pathlib.Path, monkeypatch): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) @@ -242,11 +242,11 @@ def fun(value: qd.types.ndarray(), offset: qd.template()): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, 
compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is not None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py new file mode 100644 index 0000000000..fabc217e96 --- /dev/null +++ b/tests/python/test_streams.py @@ -0,0 +1,197 @@ +"""Tests for GPU stream and event support.""" + +import numpy as np + +import quadrants as qd +from quadrants.lang.stream import Event, Stream + +from tests import test_utils + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_stream(): + s = qd.create_stream() + assert isinstance(s, Stream) + assert s.handle != 0 + s.destroy() + assert s.handle == 0 + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_event(): + e = qd.create_event() + assert isinstance(e, Event) + assert e.handle != 0 + e.destroy() + assert e.handle == 0 + + +@test_utils.test() +def test_kernel_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 42.0 + + s = qd.create_stream() + fill(qd_stream=s) + s.synchronize() + assert np.allclose(x.to_numpy(), 42.0) + s.destroy() + + +@test_utils.test() +def test_two_streams(): + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_a(): + for i in range(N): + a[i] = 1.0 + + @qd.kernel + def fill_b(): + for i in range(N): + b[i] = 2.0 + + s1 = qd.create_stream() + s2 = qd.create_stream() + fill_a(qd_stream=s1) + fill_b(qd_stream=s2) + s1.synchronize() + s2.synchronize() + assert np.allclose(a.to_numpy(), 1.0) + assert np.allclose(b.to_numpy(), 2.0) + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_event_synchronization(): + N = 1024 + x = 
qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 10.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # Default stream waits for s1 to finish fill_x + e.wait() + copy_x_to_y() + qd.sync() + + assert np.allclose(y.to_numpy(), 10.0) + + e.destroy() + s1.destroy() + + +@test_utils.test() +def test_event_wait_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 5.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + s2 = qd.create_stream() + + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # s2 waits for s1's event before running + e.wait(qd_stream=s2) + copy_x_to_y(qd_stream=s2) + s2.synchronize() + + assert np.allclose(y.to_numpy(), 5.0) + + e.destroy() + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_default_stream_kernel(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 7.0 + + fill() + qd.sync() + assert np.allclose(x.to_numpy(), 7.0) + + +@test_utils.test(arch=[qd.cpu]) +def test_stream_noop_on_cpu(): + """Streams should be no-ops on CPU without errors.""" + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 3.0 + + s = qd.create_stream() + assert s.handle == 0 + fill(qd_stream=s) + qd.sync() + assert np.allclose(x.to_numpy(), 3.0) + + e = qd.create_event() + assert e.handle == 0 + e.record(s) + e.wait() + s.destroy() + e.destroy() + + +@test_utils.test() +def test_stream_with_ndarray(): + N = 1024 + + @qd.kernel + def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(N): + arr[i] = 99.0 + + arr = qd.ndarray(qd.f32, shape=(N,)) + s = qd.create_stream() + fill(arr, 
qd_stream=s) + s.synchronize() + assert np.allclose(arr.to_numpy(), 99.0) + s.destroy() From 7bd18ca4e1a9b6e99632af2c7c62076b4195ae3d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:42:22 -0700 Subject: [PATCH 002/109] Add AMDGPU/HIP stream support and async memory operations Mirrors the CUDA stream implementation for HIP: adds stream_ member to AMDGPUContext, stream_destroy/stream_wait_event/malloc_async/ mem_free_async to HIP driver functions, and AMDGPU branches in all Program stream/event methods. Converts AMDGPU kernel launcher to use async memory operations through the active stream. CPU backend returns 0 handles (no-op). --- quadrants/program/program.cpp | 64 ++++++++++++++ quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 +- quadrants/rhi/amdgpu/amdgpu_context.h | 9 ++ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 8 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 51 +++++------ tests/python/test_streams.py | 84 ++++++++++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..f4bb8da35b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -25,6 +25,11 @@ #include "quadrants/rhi/cuda/cuda_context.h" #endif +#ifdef QD_WITH_AMDGPU +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" +#endif + #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -493,6 +498,13 @@ uint64 Program::stream_create() { CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); return reinterpret_cast(stream); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *stream = nullptr; + AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } #endif return 0; } @@ -504,6 +516,12 @@ void Program::stream_destroy(uint64 
stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUDriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif } void Program::stream_synchronize(uint64 stream_handle) { @@ -513,6 +531,12 @@ void Program::stream_synchronize(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUDriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif } void Program::set_current_cuda_stream(uint64 stream_handle) { @@ -522,6 +546,12 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif } uint64 Program::event_create() { @@ -532,6 +562,14 @@ uint64 Program::event_create() { 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *event = nullptr; + AMDGPUDriver::get_instance().event_create(&event, + 0x02 /*hipEventDisableTiming*/); + return reinterpret_cast(event); + } #endif return 0; } @@ -543,6 +581,12 @@ void Program::event_destroy(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif } void Program::event_record(uint64 event_handle, uint64 stream_handle) { @@ -553,6 +597,13 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_record( + 
reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif } void Program::event_synchronize(uint64 event_handle) { @@ -562,6 +613,12 @@ void Program::event_synchronize(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif } void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { @@ -572,6 +629,13 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { reinterpret_cast(event_handle), 0 /*flags*/); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif } } // namespace quadrants::lang diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 22f55339ee..f940ed9a7c 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -188,7 +188,7 @@ void AMDGPUContext::launch(void *func, void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02, (void *)&pack_size, (void *)0x03}; driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, - dynamic_shared_mem_bytes, nullptr, nullptr, + dynamic_shared_mem_bytes, stream_, nullptr, reinterpret_cast(&config)); } std::free(packed_arg); @@ -197,7 +197,7 @@ void AMDGPUContext::launch(void *func, profiler_->stop(task_handle); if (debug_) { - driver_.stream_synchronize(nullptr); + driver_.stream_synchronize(stream_); } } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 9529953bf1..68e7cd7314 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,6 +23,7 @@ class AMDGPUContext { KernelProfilerBase 
*profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; + void *stream_{nullptr}; std::vector kernel_arg_pointer_; public: @@ -116,6 +117,14 @@ class AMDGPUContext { return std::unique_lock(lock_); } + void set_stream(void *stream) { + stream_ = stream; + } + + void *get_stream() const { + return stream_; + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index dbb3612c87..6063d268a9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -26,6 +26,7 @@ PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management PER_AMDGPU_FUNCTION(memcpy_host_to_device, @@ -69,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, @@ -76,6 +78,7 @@ PER_AMDGPU_FUNCTION(malloc_managed, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_free_async, hipFreeAsync, void *, void *); PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); PER_AMDGPU_FUNCTION(mem_get_attribute, hipPointerGetAttribute, @@ -121,6 +124,11 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); +PER_AMDGPU_FUNCTION(stream_wait_event, + hipStreamWaitEvent, + void *, + void *, + uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git 
a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 6ef0b0e0e5..1d8430d35e 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" #include "quadrants/program/launch_context_builder.h" namespace quadrants::lang { @@ -32,18 +33,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, transfers; std::unordered_map device_ptrs; + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; - // Here we have to guarantee the result_result_buffer isn't nullptr - // It is interesting - The code following - // L60: DeviceAllocation devalloc = - // executor->allocate_memory_on_device( call another kernel and it will result - // in - // Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address - // (nil). Reason: Page not present or supervisor privilege. - // if you don't allocate it. - AMDGPUDriver::get_instance().malloc( + // Must always allocate device_result_buffer (even when result_buffer_size + // is 0) to avoid memory access faults from allocate_memory_on_device below. 
+ AMDGPUDriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64))); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -86,27 +83,28 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { - // Malloc_Async and Free_Async are available after ROCm 5.4 ctx.get_context().result_buffer = (uint64 *)device_result_buffer; } char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer, - ctx.arg_buffer_size); - AMDGPUDriver::get_instance().memcpy_host_to_device( - device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); + AMDGPUDriver::get_instance().malloc_async( + (void **)&device_arg_buffer, ctx.arg_buffer_size, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } void *context_pointer; int arg_size = sizeof(RuntimeContext *); - AMDGPUDriver::get_instance().malloc((void **)&context_pointer, - sizeof(RuntimeContext)); - AMDGPUDriver::get_instance().memcpy_host_to_device( - context_pointer, &ctx.get_context(), sizeof(RuntimeContext)); + AMDGPUDriver::get_instance().malloc_async( + (void **)&context_pointer, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -119,13 +117,16 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free(device_arg_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, + active_stream); } if (ctx.result_buffer_size > 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host( - host_result_buffer, device_result_buffer, ctx.result_buffer_size); + AMDGPUDriver::get_instance().memcpy_device_to_host_async( + host_result_buffer, device_result_buffer, ctx.result_buffer_size, + active_stream); } if (transfers.size()) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; auto arg_id = idx.arg_id; @@ -135,8 +136,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - // Since we always allocating above then we should always free - AMDGPUDriver::get_instance().mem_free(device_result_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -8,7 +8,7 @@ from tests import test_utils -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_stream(): s = qd.create_stream() assert isinstance(s, Stream) @@ -17,7 +17,7 @@ def test_create_and_destroy_stream(): assert s.handle == 0 -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_event(): e = qd.create_event() assert isinstance(e, Event) @@ -195,3 +195,83 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def 
test_concurrent_streams_with_events(): + """Two slow kernels on separate streams run concurrently (~1s on GPU), + serial fallback on CPU/Metal.""" + SPIN_ITERS = 5_000_000 + + @qd.kernel + def slow_fill( + a: qd.types.ndarray(dtype=qd.f32, ndim=1), + lcg_state: qd.types.ndarray(dtype=qd.i32, ndim=1), + index: qd.i32, + value: qd.f32, + ): + qd.loop_config(block_dim=1) + for _ in range(1): + x = lcg_state[index] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + lcg_state[index] = x + a[index] = value + + @qd.kernel + def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + qd.loop_config(block_dim=1) + for _ in range(1): + a[2] = a[0] + a[1] + + import time + + # Warm up JIT + a_warmup = qd.ndarray(qd.f32, shape=(3,)) + lcg_warmup = qd.ndarray(qd.i32, shape=(3,)) + slow_fill(a_warmup, lcg_warmup, 0, 0.0) + add_first_two(a_warmup) + qd.sync() + + # Serial baseline + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0) + slow_fill(a, lcg, 1, 7.0) + add_first_two(a) + qd.sync() + serial_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + # Streams + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + s1 = qd.create_stream() + s2 = qd.create_stream() + e1 = qd.create_event() + e2 = qd.create_event() + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0, qd_stream=s1) + slow_fill(a, lcg, 1, 7.0, qd_stream=s2) + e1.record(s1) + e2.record(s2) + e1.wait() + e2.wait() + add_first_two(a) + qd.sync() + stream_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + else: + assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" + + s1.destroy() + s2.destroy() + e1.destroy() + 
e2.destroy() From a40ed4ccd03a1162cf40a5f4fa35ee6ee7979abc Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:47:44 -0700 Subject: [PATCH 003/109] Add qd.stream_parallel() context manager for implicit stream parallelism Introduces stream_parallel() for running top-level for-loop blocks on separate GPU streams. The AST transformer maps 'with qd.stream_parallel()' blocks to stream-parallel group IDs, which propagate through IR lowering and offloading to the CUDA/AMDGPU kernel launchers. Each unique group ID gets its own stream at launch time. Includes validation that all top-level kernel statements must be stream_parallel blocks (no mixing), and offline cache key support. --- python/quadrants/lang/ast/ast_transformer.py | 32 +++- .../function_def_transformer.py | 29 +++ python/quadrants/lang/stream.py | 15 +- quadrants/analysis/gen_offline_cache_key.cpp | 1 + quadrants/codegen/amdgpu/codegen_amdgpu.cpp | 1 + quadrants/codegen/cuda/codegen_cuda.cpp | 1 + quadrants/codegen/llvm/llvm_compiled_data.h | 13 +- quadrants/ir/frontend_ir.cpp | 12 +- quadrants/ir/frontend_ir.h | 12 ++ quadrants/ir/statements.cpp | 3 + quadrants/ir/statements.h | 3 + quadrants/python/export_lang.cpp | 4 +- quadrants/runtime/amdgpu/kernel_launcher.cpp | 52 ++++- quadrants/runtime/cuda/kernel_launcher.cpp | 52 ++++- quadrants/transforms/lower_ast.cpp | 3 + quadrants/transforms/offload.cpp | 3 + tests/python/test_api.py | 1 + tests/python/test_streams.py | 178 ++++++++++++++++-- 18 files changed, 377 insertions(+), 38 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 1b13ead0f9..f5cfbeef1a 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -28,6 +28,7 @@ from quadrants.lang.ast.ast_transformers.function_def_transformer import ( FunctionDefTransformer, ) +from quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception 
import ( QuadrantsIndexError, QuadrantsRuntimeTypeError, @@ -39,6 +40,7 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length +from quadrants.lang.stream import stream_parallel from quadrants.lang.struct import Struct, StructType from quadrants.types import primitive_types from quadrants.types.utils import is_integral @@ -108,7 +110,11 @@ def build_AnnAssign(ctx: ASTTransformerFuncContext, node: ast.AnnAssign): @staticmethod def build_assign_annotated( - ctx: ASTTransformerFuncContext, target: ast.Name, value, is_static_assign: bool, annotation: Type + ctx: ASTTransformerFuncContext, + target: ast.Name, + value, + is_static_assign: bool, + annotation: Type, ): """Build an annotated assignment like this: target: annotation = value. @@ -156,7 +162,10 @@ def build_Assign(ctx: ASTTransformerFuncContext, node: ast.Assign) -> None: @staticmethod def build_assign_unpack( - ctx: ASTTransformerFuncContext, node_target: list | ast.Tuple, values, is_static_assign: bool + ctx: ASTTransformerFuncContext, + node_target: list | ast.Tuple, + values, + is_static_assign: bool, ): """Build the unpack assignments like this: (target1, target2) = (value1, value2). The function should be called only if the node target is a tuple. 
@@ -538,7 +547,8 @@ def build_Return(ctx: ASTTransformerFuncContext, node: ast.Return) -> None: else: raise QuadrantsSyntaxError("The return type is not supported now!") ctx.ast_builder.create_kernel_exprgroup_return( - expr.make_expr_group(return_exprs), _qd_core.DebugInfo(ctx.get_pos_info(node)) + expr.make_expr_group(return_exprs), + _qd_core.DebugInfo(ctx.get_pos_info(node)), ) else: ctx.return_data = node.value.ptr @@ -1381,6 +1391,22 @@ def build_Continue(ctx: ASTTransformerFuncContext, node: ast.Continue) -> None: ctx.ast_builder.insert_continue_stmt(_qd_core.DebugInfo(ctx.get_pos_info(node))) return None + @staticmethod + def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: + if len(node.items) != 1: + raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports a single context manager") + item = node.items[0] + if item.optional_vars is not None: + raise QuadrantsSyntaxError("'with ... as ...' is not supported in Quadrants kernels") + if not isinstance(item.context_expr, ast.Call): + raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") + if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): + raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") + ctx.ast_builder.begin_stream_parallel() + build_stmts(ctx, node.body) + ctx.ast_builder.end_stream_parallel() + return None + @staticmethod def build_Pass(ctx: ASTTransformerFuncContext, node: ast.Pass) -> None: return None diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 6d000b69f5..dacbac4c96 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -21,10 +21,12 @@ from quadrants.lang.ast.ast_transformer_utils import ( ASTTransformerFuncContext, ) +from 
quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception import ( QuadrantsSyntaxError, ) from quadrants.lang.matrix import MatrixType +from quadrants.lang.stream import stream_parallel from quadrants.lang.struct import StructType from quadrants.lang.util import to_quadrants_type from quadrants.types import annotations, ndarray_type, primitive_types @@ -295,7 +297,34 @@ def build_FunctionDef( else: FunctionDefTransformer._transform_as_func(ctx, node, args) + if ctx.is_kernel: + FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) + with ctx.variable_scope_guard(): build_stmts(ctx, node.body) return None + + @staticmethod + def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> bool: + if not isinstance(stmt, ast.With): + return False + if len(stmt.items) != 1: + return False + item = stmt.items[0] + if not isinstance(item.context_expr, ast.Call): + return False + return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + + @staticmethod + def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: + has_sp = any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body) + if not has_sp: + return + for stmt in body: + if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): + raise QuadrantsSyntaxError( + "When using qd.stream_parallel(), all top-level statements " + "in the kernel must be 'with qd.stream_parallel():' blocks. " + "Move non-parallel code to a separate kernel." 
+ ) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8530982455..77979184d4 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager + from quadrants.lang import impl @@ -93,4 +95,15 @@ def create_event() -> Event: return Event(handle) -__all__ = ["Stream", "Event", "create_stream", "create_event"] +@contextmanager +def stream_parallel(): + """Run top-level for loops in this block on separate GPU streams. + + Used inside @qd.kernel. At Python runtime (outside kernels), this is a + no-op. During kernel compilation, the AST transformer calls into the C++ + ASTBuilder to tag loops with a stream-parallel group ID. + """ + yield + + +__all__ = ["Stream", "Event", "create_stream", "create_event", "stream_parallel"] diff --git a/quadrants/analysis/gen_offline_cache_key.cpp b/quadrants/analysis/gen_offline_cache_key.cpp index f9eb5dc324..9a38eb9ac2 100644 --- a/quadrants/analysis/gen_offline_cache_key.cpp +++ b/quadrants/analysis/gen_offline_cache_key.cpp @@ -382,6 +382,7 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(stmt->strictly_serialized); emit(stmt->mem_access_opt); emit(stmt->block_dim); + emit(stmt->stream_parallel_group_id); emit(stmt->body.get()); } diff --git a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp index bba1c87f20..e0fcca575e 100644 --- a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp +++ b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp @@ -396,6 +396,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { current_task->grid_dim = num_SMs * query_max_block_per_sm; } current_task->block_dim = stmt->block_dim; + current_task->stream_parallel_group_id = stmt->stream_parallel_group_id; QD_ASSERT(current_task->grid_dim != 0); QD_ASSERT(current_task->block_dim != 0); offloaded_tasks.push_back(*current_task); diff --git a/quadrants/codegen/cuda/codegen_cuda.cpp 
b/quadrants/codegen/cuda/codegen_cuda.cpp index 8395f7adca..4795db23d2 100644 --- a/quadrants/codegen/cuda/codegen_cuda.cpp +++ b/quadrants/codegen/cuda/codegen_cuda.cpp @@ -692,6 +692,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM { } current_task->block_dim = stmt->block_dim; current_task->dynamic_shared_array_bytes = dynamic_shared_array_bytes; + current_task->stream_parallel_group_id = stmt->stream_parallel_group_id; QD_ASSERT(current_task->grid_dim != 0); QD_ASSERT(current_task->block_dim != 0); offloaded_tasks.push_back(*current_task); diff --git a/quadrants/codegen/llvm/llvm_compiled_data.h b/quadrants/codegen/llvm/llvm_compiled_data.h index 16d4978bd4..f496e6fa3c 100644 --- a/quadrants/codegen/llvm/llvm_compiled_data.h +++ b/quadrants/codegen/llvm/llvm_compiled_data.h @@ -14,16 +14,23 @@ class OffloadedTask { int block_dim{0}; int grid_dim{0}; int dynamic_shared_array_bytes{0}; + int stream_parallel_group_id{0}; explicit OffloadedTask(const std::string &name = "", int block_dim = 0, int grid_dim = 0, - int dynamic_shared_array_bytes = 0) + int dynamic_shared_array_bytes = 0, + int stream_parallel_group_id = 0) : name(name), block_dim(block_dim), grid_dim(grid_dim), - dynamic_shared_array_bytes(dynamic_shared_array_bytes) {}; - QD_IO_DEF(name, block_dim, grid_dim, dynamic_shared_array_bytes); + dynamic_shared_array_bytes(dynamic_shared_array_bytes), + stream_parallel_group_id(stream_parallel_group_id) {}; + QD_IO_DEF(name, + block_dim, + grid_dim, + dynamic_shared_array_bytes, + stream_parallel_group_id); }; struct LLVMCompiledTask { diff --git a/quadrants/ir/frontend_ir.cpp b/quadrants/ir/frontend_ir.cpp index ae2e3ebe7c..6cf3087643 100644 --- a/quadrants/ir/frontend_ir.cpp +++ b/quadrants/ir/frontend_ir.cpp @@ -119,7 +119,8 @@ FrontendForStmt::FrontendForStmt(const FrontendForStmt &o) num_cpu_threads(o.num_cpu_threads), strictly_serialized(o.strictly_serialized), mem_access_opt(o.mem_access_opt), - block_dim(o.block_dim) { + block_dim(o.block_dim), + 
stream_parallel_group_id(o.stream_parallel_group_id) { } void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) { @@ -127,6 +128,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) { strictly_serialized = config.strictly_serialized; mem_access_opt = config.mem_access_opt; block_dim = config.block_dim; + stream_parallel_group_id = config.stream_parallel_group_id; if (arch == Arch::cuda || arch == Arch::amdgpu) { num_cpu_threads = 1; QD_ASSERT(block_dim <= quadrants_max_gpu_block_dim); @@ -1542,6 +1544,8 @@ void ASTBuilder::begin_frontend_range_for(const Expr &i, const Expr &s, const Expr &e, const DebugInfo &dbg_info) { + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( i, s, e, arch_, for_loop_dec_.config, dbg_info); auto stmt = stmt_unique.get(); @@ -1558,6 +1562,8 @@ void ASTBuilder::begin_frontend_struct_for_on_snode(const ExprGroup &loop_vars, for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the struct for. " "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( loop_vars, snode, arch_, for_loop_dec_.config, dbg_info); for_loop_dec_.reset(); @@ -1574,6 +1580,8 @@ void ASTBuilder::begin_frontend_struct_for_on_external_tensor( for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the struct for. " "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( loop_vars, external_tensor, arch_, for_loop_dec_.config, dbg_info); for_loop_dec_.reset(); @@ -1591,6 +1599,8 @@ void ASTBuilder::begin_frontend_mesh_for( for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the mesh for. 
" "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique(ExprGroup(i), mesh_ptr, element_type, arch_, for_loop_dec_.config, dbg_info); diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index bce009f9e7..693a7f461f 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -23,6 +23,7 @@ struct ForLoopConfig { MemoryAccessOptions mem_access_opt; int block_dim{0}; bool uniform{false}; + int stream_parallel_group_id{0}; }; #define QD_DEFINE_CLONE_FOR_FRONTEND_IR \ @@ -207,6 +208,7 @@ class FrontendForStmt : public Stmt { bool strictly_serialized; MemoryAccessOptions mem_access_opt; int block_dim; + int stream_parallel_group_id{0}; FrontendForStmt(const ExprGroup &loop_vars, SNode *snode, @@ -961,6 +963,8 @@ class ASTBuilder { Arch arch_; ForLoopDecoratorRecorder for_loop_dec_; int id_counter_{0}; + int stream_parallel_group_counter_{0}; + int current_stream_parallel_group_id_{0}; public: ASTBuilder(Block *initial, Arch arch, bool is_kernel) @@ -1107,6 +1111,14 @@ class ASTBuilder { for_loop_dec_.reset(); } + void begin_stream_parallel() { + current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; + } + + void end_stream_parallel() { + current_stream_parallel_group_id_ = 0; + } + Identifier get_next_id(const std::string &name = "") { return Identifier(id_counter_++, name); } diff --git a/quadrants/ir/statements.cpp b/quadrants/ir/statements.cpp index 14c55be85e..79b469a22a 100644 --- a/quadrants/ir/statements.cpp +++ b/quadrants/ir/statements.cpp @@ -244,6 +244,7 @@ std::unique_ptr RangeForStmt::clone() const { begin, end, body->clone(), is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized); new_stmt->reversed = reversed; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } @@ -265,6 +266,7 @@ std::unique_ptr StructForStmt::clone() const { auto new_stmt = 
std::make_unique( snode, body->clone(), is_bit_vectorized, num_cpu_threads, block_dim); new_stmt->mem_access_opt = mem_access_opt; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } @@ -439,6 +441,7 @@ std::unique_ptr OffloadedStmt::clone() const { new_stmt->tls_size = tls_size; new_stmt->bls_size = bls_size; new_stmt->mem_access_opt = mem_access_opt; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index e06bb6d4df..3f440fe4e2 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -1016,6 +1016,7 @@ class RangeForStmt : public Stmt { int block_dim; bool strictly_serialized; std::string range_hint; + int stream_parallel_group_id{0}; RangeForStmt(Stmt *begin, Stmt *end, @@ -1061,6 +1062,7 @@ class StructForStmt : public Stmt { int num_cpu_threads; int block_dim; MemoryAccessOptions mem_access_opt; + int stream_parallel_group_id{0}; StructForStmt(SNode *snode, std::unique_ptr &&body, @@ -1443,6 +1445,7 @@ class OffloadedStmt : public Stmt { std::size_t tls_size{1}; // avoid allocating dynamic memory with 0 byte std::size_t bls_size{0}; MemoryAccessOptions mem_access_opt; + int stream_parallel_group_id{0}; OffloadedStmt(TaskType task_type, Arch arch, Kernel *kernel); diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 2f5da8b1b4..d134464d49 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -357,7 +357,9 @@ void export_lang(py::module &m) { .def("strictly_serialize", &ASTBuilder::strictly_serialize) .def("block_dim", &ASTBuilder::block_dim) .def("insert_snode_access_flag", &ASTBuilder::insert_snode_access_flag) - .def("reset_snode_access_flag", &ASTBuilder::reset_snode_access_flag); + .def("reset_snode_access_flag", &ASTBuilder::reset_snode_access_flag) + .def("begin_stream_parallel", &ASTBuilder::begin_stream_parallel) + 
.def("end_stream_parallel", &ASTBuilder::end_stream_parallel); auto device_capability_config = py::class_(m, "DeviceCapabilityConfig") diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..1b82b33459 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,3 +1,5 @@ +#include + #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/amdgpu/amdgpu_driver.h" @@ -108,12 +110,50 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); - for (auto &task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + for (size_t i = 0; i < offloaded_tasks.size();) { + auto &task = offloaded_tasks[i]; + if (task.stream_parallel_group_id == 0) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + i++; + } else { + size_t group_start = i; + while (i < offloaded_tasks.size() && + offloaded_tasks[i].stream_parallel_group_id != 0) { + i++; + } + + std::map stream_by_id; + for (size_t j = group_start; j < i; j++) { + int sid = offloaded_tasks[j].stream_parallel_group_id; + if (stream_by_id.find(sid) == stream_by_id.end()) { + void *s = nullptr; + AMDGPUDriver::get_instance().stream_create(&s, 0); + stream_by_id[sid] = s; + } + } + + for (size_t j = group_start; j < i; j++) { + auto &t = offloaded_tasks[j]; + AMDGPUContext::get_instance().set_stream( + stream_by_id[t.stream_parallel_group_id]); + amdgpu_module->launch(t.name, 
t.grid_dim, t.block_dim, + t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } + + for (auto &[sid, s] : stream_by_id) { + AMDGPUDriver::get_instance().stream_synchronize(s); + } + for (auto &[sid, s] : stream_by_id) { + AMDGPUDriver::get_instance().stream_destroy(s); + } + + AMDGPUContext::get_instance().set_stream(active_stream); + } } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 13845d5a9b..94aa786b56 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,3 +1,5 @@ +#include + #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" #include "quadrants/rhi/cuda/cuda_driver.h" @@ -139,12 +141,50 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.get_context().arg_buffer = device_arg_buffer; } - for (auto task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - cuda_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); + for (size_t i = 0; i < offloaded_tasks.size();) { + auto &task = offloaded_tasks[i]; + if (task.stream_parallel_group_id == 0) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + cuda_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + i++; + } else { + size_t group_start = i; + while (i < offloaded_tasks.size() && + offloaded_tasks[i].stream_parallel_group_id != 0) { + i++; + } + + std::map stream_by_id; + for (size_t j = group_start; j < i; j++) { + int sid = offloaded_tasks[j].stream_parallel_group_id; + if (stream_by_id.find(sid) == stream_by_id.end()) { + void *s = nullptr; + CUDADriver::get_instance().stream_create(&s, 0); + stream_by_id[sid] = s; + } + 
} + + for (size_t j = group_start; j < i; j++) { + auto &t = offloaded_tasks[j]; + CUDAContext::get_instance().set_stream( + stream_by_id[t.stream_parallel_group_id]); + cuda_module->launch(t.name, t.grid_dim, t.block_dim, + t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + } + + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_synchronize(s); + } + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_destroy(s); + } + + CUDAContext::get_instance().set_stream(active_stream); + } } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); diff --git a/quadrants/transforms/lower_ast.cpp b/quadrants/transforms/lower_ast.cpp index 74b698a9e6..ef1bb6f06a 100644 --- a/quadrants/transforms/lower_ast.cpp +++ b/quadrants/transforms/lower_ast.cpp @@ -232,6 +232,7 @@ class LowerAST : public IRVisitor { snode, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim); new_for->index_offsets = offsets; + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; VecStatement new_statements; for (int i = 0; i < (int)stmt->loop_var_ids.size(); i++) { Stmt *loop_index = new_statements.push_back( @@ -270,6 +271,7 @@ class LowerAST : public IRVisitor { begin, end, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim, stmt->strictly_serialized, /*range_hint=*/fmt::format("arg ({})", fmt::join(arg_id, ", "))); + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; VecStatement new_statements; Stmt *loop_index = new_statements.push_back(new_for.get(), 0); @@ -311,6 +313,7 @@ class LowerAST : public IRVisitor { begin_stmt, end_stmt, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim, stmt->strictly_serialized); + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; new_for->body->insert(std::make_unique(new_for.get(), 0), 0); 
new_for->body->local_var_to_stmt[stmt->loop_var_ids[0]] = diff --git a/quadrants/transforms/offload.cpp b/quadrants/transforms/offload.cpp index 2f20247364..f3e254a889 100644 --- a/quadrants/transforms/offload.cpp +++ b/quadrants/transforms/offload.cpp @@ -134,6 +134,7 @@ class Offloader { offloaded->body->insert(std::move(s->body->statements[j])); } offloaded->range_hint = s->range_hint; + offloaded->stream_parallel_group_id = s->stream_parallel_group_id; root_block->insert(std::move(offloaded)); } else if (auto st = stmt->cast()) { assemble_serial_statements(); @@ -257,6 +258,8 @@ class Offloader { offloaded_struct_for->num_cpu_threads = std::min(for_stmt->num_cpu_threads, config.cpu_max_num_threads); offloaded_struct_for->mem_access_opt = mem_access_opt; + offloaded_struct_for->stream_parallel_group_id = + for_stmt->stream_parallel_group_id; root_block->insert(std::move(offloaded_struct_for)); } diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 002014c960..241f3143de 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -218,6 +218,7 @@ def _get_expected_matrix_apis(): "static_assert", "static_print", "stop_grad", + "stream_parallel", "svd", "sym_eig", "sync", diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..4c28b6f581 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -180,23 +180,6 @@ def fill(): e.destroy() -@test_utils.test() -def test_stream_with_ndarray(): - N = 1024 - - @qd.kernel - def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): - for i in range(N): - arr[i] = 99.0 - - arr = qd.ndarray(qd.f32, shape=(N,)) - s = qd.create_stream() - fill(arr, qd_stream=s) - s.synchronize() - assert np.allclose(arr.to_numpy(), 99.0) - s.destroy() - - @test_utils.test() def test_concurrent_streams_with_events(): """Two slow kernels on separate streams run concurrently (~1s on GPU), @@ -275,3 +258,164 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, 
ndim=1)): s2.destroy() e1.destroy() e2.destroy() + + +@test_utils.test() +def test_stream_parallel_basic(): + """Each with qd.stream_parallel() block runs on its own stream (serial fallback on CPU/Metal).""" + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_parallel(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + with qd.stream_parallel(): + for j in range(N): + b[j] = 2.0 + + fill_parallel() + qd.sync() + assert np.allclose(a.to_numpy(), 1.0) + assert np.allclose(b.to_numpy(), 2.0) + + +@test_utils.test() +def test_stream_parallel_multiple_loops_per_stream(): + """Multiple for loops inside one stream_parallel block share a stream (serial fallback on CPU/Metal).""" + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + c = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def parallel_phase(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + for i in range(N): + a[i] = a[i] + 1.0 + with qd.stream_parallel(): + for j in range(N): + b[j] = 10.0 + + @qd.kernel + def combine(): + for i in range(N): + c[i] = a[i] + b[i] + + parallel_phase() + combine() + qd.sync() + assert np.allclose(a.to_numpy(), 2.0) + assert np.allclose(b.to_numpy(), 10.0) + assert np.allclose(c.to_numpy(), 12.0) + + +@test_utils.test() +def test_stream_parallel_timing(): + """stream_parallel achieves speedup on GPU, serial fallback elsewhere.""" + SPIN_ITERS = 5_000_000 + + a = qd.field(qd.i32, shape=(2,)) + b = qd.field(qd.i32, shape=(2,)) + + @qd.kernel + def serial_spin(): + for _ in range(1): + x = a[0] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + a[0] = x + for _ in range(1): + x = a[1] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + a[1] = x + + @qd.kernel + def parallel_spin(): + with qd.stream_parallel(): + for _ in range(1): + x = b[0] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 
+ b[0] = x + with qd.stream_parallel(): + for _ in range(1): + x = b[1] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + b[1] = x + + import time + + # Warm up + serial_spin() + parallel_spin() + qd.sync() + + qd.sync() + t0 = time.perf_counter() + serial_spin() + qd.sync() + serial_time = time.perf_counter() - t0 + + qd.sync() + t0 = time.perf_counter() + parallel_spin() + qd.sync() + stream_time = time.perf_counter() - t0 + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, ( + f"Expected >1.5x speedup, got {speedup:.2f}x " f"(serial={serial_time:.3f}s, stream={stream_time:.3f}s)" + ) + else: + assert speedup > 0.75, ( + f"Expected >=0.75x (serial fallback), got {speedup:.2f}x " + f"(serial={serial_time:.3f}s, stream={stream_time:.3f}s)" + ) + + +@test_utils.test() +def test_stream_parallel_rejects_mixed_top_level(): + """Mixing stream_parallel and non-stream_parallel at top level is an error.""" + import pytest # noqa: I001 + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="all top-level statements"): + + @qd.kernel + def bad_kernel(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + for i in range(N): + a[i] = 2.0 + + bad_kernel() + + +@test_utils.test() +def test_stream_with_ndarray(): + N = 1024 + + @qd.kernel + def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(N): + arr[i] = 99.0 + + arr = qd.ndarray(qd.f32, shape=(N,)) + s = qd.create_stream() + fill(arr, qd_stream=s) + s.synchronize() + assert np.allclose(arr.to_numpy(), 99.0) + s.destroy() From b856b33247dfbb55ca5f781e788fc50d5e32c9e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:25:18 -0700 Subject: [PATCH 004/109] Address review feedback for CUDA streams PR - Make CUDAContext::stream_ thread_local for thread-safety - Convert sync 
memcpy_host_to_device to async on active_stream - Use weakref in Stream/Event __del__ to safely handle interpreter shutdown - Add __enter__/__exit__ context manager support for Stream and Event - Use consistent qd_stream parameter naming in Event.record and Event.wait - Add handle==0 guard to stream_synchronize --- python/quadrants/lang/stream.py | 60 ++++++++++++++++------ quadrants/program/program.cpp | 2 +- quadrants/rhi/cuda/cuda_context.cpp | 6 +-- quadrants/rhi/cuda/cuda_context.h | 2 +- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++-- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8530982455..8f6cfab3d6 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -1,14 +1,22 @@ +import weakref + from quadrants.lang import impl +def _get_prog_weakref(): + return weakref.ref(impl.get_runtime().prog) + + class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. On backends without native streams (e.g. CPU), this is a no-op object. + Call destroy() explicitly or use as a context manager to ensure cleanup. """ - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: @@ -27,30 +35,41 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.stream_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() class Event: """Wraps a backend-specific GPU event for stream synchronization. On backends without native events (e.g. CPU), this is a no-op object. 
+ Call destroy() explicitly or use as a context manager to ensure cleanup. """ - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: return self._handle - def record(self, stream: Stream | None = None): + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" prog = impl.get_runtime().prog - stream_handle = stream.handle if stream is not None else 0 + stream_handle = qd_stream.handle if qd_stream is not None else 0 prog.event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): @@ -72,25 +91,34 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.event_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() def create_stream() -> Stream: """Create a new GPU stream for concurrent kernel execution.""" prog = impl.get_runtime().prog handle = prog.stream_create() - return Stream(handle) + return Stream(handle, _get_prog_weakref()) def create_event() -> Event: """Create a new GPU event for stream synchronization.""" prog = impl.get_runtime().prog handle = prog.event_create() - return Event(handle) + return Event(handle, _get_prog_weakref()) __all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..be152d02da 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -508,7 +508,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if 
(compile_config().arch == Arch::cuda) { + if (compile_config().arch == Arch::cuda && stream_handle != 0) { CUDADriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 89c16135a2..23399649a9 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -11,10 +11,10 @@ namespace quadrants::lang { +thread_local void *CUDAContext::stream_ = nullptr; + CUDAContext::CUDAContext() - : profiler_(nullptr), - driver_(CUDADriver::get_instance_without_context()), - stream_(nullptr) { + : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index c57baa3d92..ba891644a7 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -30,7 +30,7 @@ class CUDAContext { int max_shared_memory_bytes_; bool debug_; bool supports_mem_pool_; - void *stream_; + static thread_local void *stream_; public: CUDAContext(); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 13845d5a9b..9bbf75044e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -85,8 +85,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device( @@ -95,8 +96,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, 
executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, + active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 7555ec5edf0581290df8b902b5a31e6162521fe3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:03 -0700 Subject: [PATCH 005/109] Move AMDGPU mem_free_async before transfers sync to match CUDA ordering Batch the device_result_buffer free into the stream pipeline before the sync barrier, matching the CUDA kernel launcher's ordering for consistency and marginal performance improvement. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..cff0f2b4a1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -125,6 +125,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { @@ -136,8 +138,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( From c12d23e1e1426a0b538382cb5dcab489e4c09b2e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:18 -0700 Subject: [PATCH 006/109] Convert 
AMDGPU sync memcpy_host_to_device to async on active_stream Use memcpy_host_to_device_async for external array transfers so they are properly ordered on the active stream, matching the CUDA launcher. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index cff0f2b4a1..f772fc7b5b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -66,8 +66,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); From 1673a38761b50fb6af4767e569fbf88751bb4788 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:25 -0700 Subject: [PATCH 007/109] Document ROCm >= 5.4 requirement for hipMallocAsync/hipFreeAsync --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6063d268a9..25e33774e7 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -70,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4 PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, From 
60d015bfddac7068d1d1067d8f059e9c3236447e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:35 -0700 Subject: [PATCH 008/109] Relax concurrency test threshold and log timings Lower GPU speedup threshold from 1.5x to 1.3x to reduce flakiness in CI under contention, and print actual timings for diagnostics. --- tests/python/test_streams.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..236578974d 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,8 +266,9 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time + print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From c4be4ffd7c77a68ed6176ce30900d1a2260dec5b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:55 -0700 Subject: [PATCH 009/109] Add handle==0 guard to AMDGPU stream_synchronize and make stream_ thread_local Mirror the CUDA fixes: guard stream_synchronize against handle==0 to avoid unintentional default stream sync, and make AMDGPUContext::stream_ thread_local for thread-safety. 
--- quadrants/program/program.cpp | 2 +- quadrants/rhi/amdgpu/amdgpu_context.cpp | 2 ++ quadrants/rhi/amdgpu/amdgpu_context.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index faac67970c..8bab1d30f7 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -532,7 +532,7 @@ void Program::stream_synchronize(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { AMDGPUDriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index f940ed9a7c..24d924ed0d 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -13,6 +13,8 @@ namespace quadrants { namespace lang { +thread_local void *AMDGPUContext::stream_ = nullptr; + AMDGPUContext::AMDGPUContext() : driver_(AMDGPUDriver::get_instance_without_context()) { dev_count_ = 0; diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 68e7cd7314..4fc7c8328b 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,7 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; - void *stream_{nullptr}; + static thread_local void *stream_; std::vector kernel_arg_pointer_; public: From be7ad924c333a589f13bbbe34f2d9583649007f5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:29:24 -0700 Subject: [PATCH 010/109] Clear stream_parallel_group_id in ForLoopDecoratorRecorder::reset() Prevents stale group IDs from leaking if insert_for is called after a path that set a non-zero stream_parallel_group_id, matching the reset pattern of all other ForLoopConfig fields. 
--- quadrants/ir/frontend_ir.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index 693a7f461f..38226ca1b3 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -954,6 +954,7 @@ class ASTBuilder { config.mem_access_opt.clear(); config.block_dim = 0; config.strictly_serialized = false; + config.stream_parallel_group_id = 0; } }; From ce8328102ae0b18f0b29d661b4dc4026edf3c4a8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:29:36 -0700 Subject: [PATCH 011/109] Reject nested stream_parallel blocks Add an error check in begin_stream_parallel() to prevent nesting, which would produce undefined group ID semantics. --- quadrants/ir/frontend_ir.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index 38226ca1b3..46d7a3ec7a 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -1113,6 +1113,8 @@ class ASTBuilder { } void begin_stream_parallel() { + QD_ERROR_IF(current_stream_parallel_group_id_ != 0, + "Nested stream_parallel blocks are not supported"); current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; } From 880abc7e74cc8be0979d54747ff753929f00221d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:30:08 -0700 Subject: [PATCH 012/109] Document stream_parallel launcher design: per-launch streams, shared context safety Add comments explaining that streams are created/destroyed per launch (stream pooling as future optimization), and that RuntimeContext sharing across concurrent streams is safe because kernels only read from it. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ quadrants/runtime/cuda/kernel_launcher.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index f859bb116c..6abd0778ed 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -127,6 +127,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } + // Create one stream per unique group ID. Streams are created/destroyed + // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; @@ -137,6 +139,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } + // Launch tasks concurrently on their respective streams. The shared + // RuntimeContext is safe here: kernels only read from it (args/runtime + // pointers); result_buffer writes are to disjoint offsets per task. for (size_t j = group_start; j < i; j++) { auto &t = offloaded_tasks[j]; AMDGPUContext::get_instance().set_stream( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 2e10226a13..9cf24915ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -159,6 +159,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } + // Create one stream per unique group ID. Streams are created/destroyed + // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; @@ -169,6 +171,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } + // Launch tasks concurrently on their respective streams. 
The shared + // RuntimeContext is safe here: kernels only read from it (args/runtime + // pointers); result_buffer writes are to disjoint offsets per task. for (size_t j = group_start; j < i; j++) { auto &t = offloaded_tasks[j]; CUDAContext::get_instance().set_stream( From b28e7c60901fdde76ff2b9ea153534f15a0050ac Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:23:15 -0700 Subject: [PATCH 013/109] Revert "Relax concurrency test threshold and log timings" This reverts commit 60d015bfddac7068d1d1067d8f059e9c3236447e. --- tests/python/test_streams.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 236578974d..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,9 +266,8 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time - print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From e9f98c645671a2a9b5ee3cae915c31a852053cf4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:33:27 -0700 Subject: [PATCH 014/109] Add stream pool to reuse GPU streams across kernel launches Replace per-launch stream_create/stream_destroy with acquire_stream/ release_stream on CUDAContext and AMDGPUContext. Streams are cached in a pool and reused across invocations, avoiding the driver-level overhead of stream creation (~5-50us) on every kernel launch in hot loops. 
--- quadrants/rhi/amdgpu/amdgpu_context.h | 18 ++++++++++++++++++ quadrants/rhi/cuda/cuda_context.h | 19 +++++++++++++++++++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 ++------ quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++------ 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 4fc7c8328b..dd99e4fd37 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -24,6 +24,7 @@ class AMDGPUContext { AMDGPUDriver &driver_; bool debug_{false}; static thread_local void *stream_; + std::vector stream_pool_; std::vector kernel_arg_pointer_; public: @@ -125,6 +126,23 @@ class AMDGPUContext { return stream_; } + void *acquire_stream() { + std::lock_guard _(lock_); + if (!stream_pool_.empty()) { + auto s = stream_pool_.back(); + stream_pool_.pop_back(); + return s; + } + void *s = nullptr; + AMDGPUDriver::get_instance().stream_create(&s, 0); + return s; + } + + void release_stream(void *s) { + std::lock_guard _(lock_); + stream_pool_.push_back(s); + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index ba891644a7..b4a4809615 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "quadrants/program/kernel_profiler.h" #include "quadrants/rhi/cuda/cuda_driver.h" @@ -31,6 +32,7 @@ class CUDAContext { bool debug_; bool supports_mem_pool_; static thread_local void *stream_; + std::vector stream_pool_; public: CUDAContext(); @@ -120,6 +122,23 @@ class CUDAContext { void *get_stream() const { return stream_; } + + void *acquire_stream() { + std::lock_guard _(lock_); + if (!stream_pool_.empty()) { + auto s = stream_pool_.back(); + stream_pool_.pop_back(); + return s; + } + void *s = nullptr; + CUDADriver::get_instance().stream_create(&s, 0); + return s; + } + + void 
release_stream(void *s) { + std::lock_guard _(lock_); + stream_pool_.push_back(s); + } }; } // namespace quadrants::lang diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 6abd0778ed..88a3570924 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -127,15 +127,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } - // Create one stream per unique group ID. Streams are created/destroyed - // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { - void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0); - stream_by_id[sid] = s; + stream_by_id[sid] = AMDGPUContext::get_instance().acquire_stream(); } } @@ -155,7 +151,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUDriver::get_instance().stream_synchronize(s); } for (auto &[sid, s] : stream_by_id) { - AMDGPUDriver::get_instance().stream_destroy(s); + AMDGPUContext::get_instance().release_stream(s); } AMDGPUContext::get_instance().set_stream(active_stream); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 9cf24915ab..6743d7c291 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -159,15 +159,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } - // Create one stream per unique group ID. Streams are created/destroyed - // per launch; a stream pool could reduce overhead for hot loops. 
std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { - void *s = nullptr; - CUDADriver::get_instance().stream_create(&s, 0); - stream_by_id[sid] = s; + stream_by_id[sid] = CUDAContext::get_instance().acquire_stream(); } } @@ -187,7 +183,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, CUDADriver::get_instance().stream_synchronize(s); } for (auto &[sid, s] : stream_by_id) { - CUDADriver::get_instance().stream_destroy(s); + CUDAContext::get_instance().release_stream(s); } CUDAContext::get_instance().set_stream(active_stream); From 65a7967ca88aa33f62b3de4411cd3f51f870ed5f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:37:11 -0700 Subject: [PATCH 015/109] Add test for stream pool reuse across repeated kernel launches Calls a stream_parallel kernel 5 times in a loop to verify that pooled streams are correctly reused with correct results each iteration. 
--- tests/python/test_streams.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 4c28b6f581..86568c4e17 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -419,3 +419,31 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def test_stream_pool_reuse(): + """Repeated stream_parallel invocations reuse pooled streams correctly.""" + N = 128 + a = qd.ndarray(qd.f32, shape=(N,)) + b = qd.ndarray(qd.f32, shape=(N,)) + + @qd.kernel + def parallel_fill( + x: qd.types.ndarray(dtype=qd.f32, ndim=1), + y: qd.types.ndarray(dtype=qd.f32, ndim=1), + val: qd.f32, + ): + with qd.stream_parallel(): + for i in range(N): + x[i] = val + with qd.stream_parallel(): + for i in range(N): + y[i] = val * 2.0 + + for iteration in range(5): + v = float(iteration + 1) + parallel_fill(a, b, v) + qd.sync() + assert np.allclose(a.to_numpy(), v), f"iteration {iteration}" + assert np.allclose(b.to_numpy(), v * 2.0), f"iteration {iteration}" From 5393d04c8d210edf8fe7d0301ae6f68e22e56b8f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:51:33 -0700 Subject: [PATCH 016/109] Destroy pooled streams in CUDAContext and AMDGPUContext destructors --- quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 ++++ quadrants/rhi/cuda/cuda_context.cpp | 11 ++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 24d924ed0d..7163431e32 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -204,6 +204,10 @@ void AMDGPUContext::launch(void *func, } AMDGPUContext::~AMDGPUContext() { + for (auto *s : stream_pool_) { + driver_.stream_destroy(s); + } + stream_pool_.clear(); if (context_) { driver_.device_primary_ctx_release(device_); } diff 
--git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 23399649a9..286c4eb3ba 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -180,13 +180,10 @@ void CUDAContext::launch(void *func, } CUDAContext::~CUDAContext() { - // TODO: restore these? - /* - CUDADriver::get_instance().cuMemFree(context_buffer); - for (auto cudaModule: cudaModules) - CUDADriver::get_instance().cuModuleUnload(cudaModule); - CUDADriver::get_instance().cuCtxDestroy(context); - */ + for (auto *s : stream_pool_) { + driver_.stream_destroy(s); + } + stream_pool_.clear(); } CUDAContext &CUDAContext::get_instance() { From 9be110daf54838a2da4a430e254e25afdfb198e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 17:24:53 -0700 Subject: [PATCH 017/109] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 28 ++++++++-------------- quadrants/rhi/cuda/cuda_context.cpp | 3 +-- quadrants/runtime/cuda/kernel_launcher.cpp | 3 +-- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a591fb8dba..ec5a9fa57d 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -474,8 +474,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -483,8 +482,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_synchronize( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif 
} @@ -492,8 +490,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -502,8 +499,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *event = nullptr; - CUDADriver::get_instance().event_create(&event, - 0x02 /*CU_EVENT_DISABLE_TIMING*/); + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } #endif @@ -513,8 +509,7 @@ uint64 Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -522,9 +517,8 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -532,8 +526,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -541,9 +534,8 @@ void Program::event_synchronize(uint64 
event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index a605d06c64..60553da9c7 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -13,8 +13,7 @@ namespace quadrants::lang { thread_local void *CUDAContext::stream_ = nullptr; -CUDAContext::CUDAContext() - : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { +CUDAContext::CUDAContext() : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 34905218f9..0c5d7e9458 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -173,8 +173,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + CUDADriver::get_instance().mem_free_async(device_result_buffer, active_stream); // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(active_stream); From 31fffbf1730e32c200eed37e8b4a4740ddc28b50 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 19:03:53 -0700 Subject: [PATCH 018/109] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 
28 +++++++------------ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 6 +--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 18 ++++++------ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 43e8df1236..648f3291c3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -491,8 +491,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -505,8 +504,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_synchronize( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } @@ -519,8 +517,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { - AMDGPUContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -536,8 +533,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *event = nullptr; - AMDGPUDriver::get_instance().event_create(&event, - 0x02 /*hipEventDisableTiming*/); + AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); } #endif @@ -552,8 +548,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); 
+ AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -567,9 +562,8 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -582,8 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -597,9 +590,8 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6a01c3a87a..6be39db108 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -62,11 +62,7 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, hipOccupancyMaxActiveBlocksPerMultipro // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); -PER_AMDGPU_FUNCTION(stream_wait_event, - hipStreamWaitEvent, - void *, - void *, - uint32); +PER_AMDGPU_FUNCTION(stream_wait_event, hipStreamWaitEvent, void *, void *, uint32); // Event management 
PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1c5c573d85..cace0821ce 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -86,16 +86,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device(arr_sz, (uint64 *)device_result_buffer); device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[grad_ptr_idx], grad_ptr, + arr_sz, active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } @@ -141,8 +141,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *context_pointer; int arg_size = sizeof(RuntimeContext *); AMDGPUDriver::get_instance().malloc_async((void **)&context_pointer, sizeof(RuntimeContext), active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), - sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); 
AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -154,15 +154,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From e9ce144a2302c55b097f61148bae2385808e8d5c Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 20:33:32 -0700 Subject: [PATCH 019/109] Apply clang-format Made-with: Cursor --- quadrants/codegen/llvm/llvm_compiled_data.h | 6 +----- quadrants/ir/frontend_ir.h | 3 +-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/quadrants/codegen/llvm/llvm_compiled_data.h b/quadrants/codegen/llvm/llvm_compiled_data.h index 4ed2e69abc..ba7b74e674 100644 --- a/quadrants/codegen/llvm/llvm_compiled_data.h +++ b/quadrants/codegen/llvm/llvm_compiled_data.h @@ -26,11 +26,7 @@ class OffloadedTask { grid_dim(grid_dim), dynamic_shared_array_bytes(dynamic_shared_array_bytes), stream_parallel_group_id(stream_parallel_group_id) {}; - QD_IO_DEF(name, - block_dim, - grid_dim, - dynamic_shared_array_bytes, - stream_parallel_group_id); + QD_IO_DEF(name, block_dim, grid_dim, dynamic_shared_array_bytes, stream_parallel_group_id); }; struct LLVMCompiledTask { diff --git a/quadrants/ir/frontend_ir.h 
b/quadrants/ir/frontend_ir.h index 0ceed57772..b4ad04a9b5 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -1028,8 +1028,7 @@ class ASTBuilder { } void begin_stream_parallel() { - QD_ERROR_IF(current_stream_parallel_group_id_ != 0, - "Nested stream_parallel blocks are not supported"); + QD_ERROR_IF(current_stream_parallel_group_id_ != 0, "Nested stream_parallel blocks are not supported"); current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; } diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 83df04490f..57659a5cfa 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -44,8 +44,8 @@ void KernelLauncher::launch_offloaded_tasks(JITModule *amdgpu_module, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, + {arg_size}); } for (auto &[sid, s] : stream_by_id) { From d3cae3cbaa1a3ffd832a30320fa59c5af753e595 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 02:18:49 -0700 Subject: [PATCH 020/109] [Test] Exclude flaky test_perf_dispatch_python from Vulkan The pure-Python perf dispatch test is timing-sensitive and unreliable on the Vulkan software renderer in CI. The kernel variant of the same test still covers perf dispatch on Vulkan. 
Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..b533105c42 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From 798f87a18139fb8799d9b1d91135b2f6b8066a8d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 04:55:29 -0700 Subject: [PATCH 021/109] Exclude flaky test_perf_dispatch_python from Metal and Vulkan The pure-Python perf_dispatch timing test is unreliable on Mac Metal and Vulkan (MoltenVK) where timing differences between implementations are too small to consistently pick the fastest one. Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..2de074ed3c 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.metal, qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From cd5b486beab0fc878652d4d2d043f44f1bd58e12 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:35:22 -0700 Subject: [PATCH 022/109] [Doc] Add user guide for streams API --- docs/source/user_guide/index.md | 1 + docs/source/user_guide/streams.md | 145 ++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 docs/source/user_guide/streams.md diff --git a/docs/source/user_guide/index.md 
b/docs/source/user_guide/index.md index 05a5dfc434..7775e56f0e 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -54,6 +54,7 @@ tile16 :titlesonly: graph +streams perf_dispatch ``` diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md new file mode 100644 index 0000000000..0a610fd217 --- /dev/null +++ b/docs/source/user_guide/streams.md @@ -0,0 +1,145 @@ +# Streams + +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default +stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently +and control synchronization with events. + +## Supported platforms + +| Backend | Streams | Events | Notes | +|---------|---------|--------|-------| +| CUDA | Yes | Yes | Full concurrent execution | +| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | + +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle +`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. + +## Creating and using streams + +```python +import quadrants as qd + +qd.init(arch=qd.cuda) + +N = 1024 +a = qd.field(qd.f32, shape=(N,)) +b = qd.field(qd.f32, shape=(N,)) + +@qd.kernel +def fill_a(): + for i in range(N): + a[i] = 1.0 + +@qd.kernel +def fill_b(): + for i in range(N): + b[i] = 2.0 + +s1 = qd.create_stream() +s2 = qd.create_stream() + +fill_a(qd_stream=s1) +fill_b(qd_stream=s2) + +s1.synchronize() +s2.synchronize() + +s1.destroy() +s2.destroy() +``` + +Pass `qd_stream=` to any kernel call to launch it on that stream. 
Kernels on different streams may execute +concurrently. Call `synchronize()` to block until all work on a stream completes. + +## Events + +Events let you express dependencies between streams without full synchronization. + +```python +s1 = qd.create_stream() +s2 = qd.create_stream() + +@qd.kernel +def produce(): + for i in range(N): + a[i] = 10.0 + +@qd.kernel +def consume(): + for i in range(N): + b[i] = a[i] + +produce(qd_stream=s1) + +e = qd.create_event() +e.record(s1) # record when s1 finishes produce() +e.wait(qd_stream=s2) # s2 waits for that event before proceeding + +consume(qd_stream=s2) # safe to read a[] — produce() is guaranteed complete +s2.synchronize() + +e.destroy() +s1.destroy() +s2.destroy() +``` + +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait +until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. + +## Context managers + +Streams and events support `with` blocks for automatic cleanup: + +```python +with qd.create_stream() as s: + fill_a(qd_stream=s) + s.synchronize() +# s.destroy() called automatically +``` + +## PyTorch interop (CUDA) + +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to +avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different +streams with no ordering guarantees, leading to intermittent data corruption. + +### Running Quadrants kernels on PyTorch's stream + +```python +import torch +from quadrants.lang.stream import Stream + +torch_stream_ptr = torch.cuda.current_stream().cuda_stream +stream = Stream(torch_stream_ptr) + +physics_kernel(qd_stream=stream) +observations = compute_obs_tensor() # PyTorch op on the same stream +apply_actions_kernel(qd_stream=stream) +``` + +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this +wrapper — PyTorch owns the underlying stream. 
+ +### Running PyTorch operations on a Quadrants stream + +```python +qd_stream = qd.create_stream() +torch_stream = torch.cuda.ExternalStream(qd_stream.handle) + +with torch.cuda.stream(torch_stream): + physics_kernel(qd_stream=qd_stream) + observations = compute_obs_tensor() + apply_actions_kernel(qd_stream=qd_stream) + +qd_stream.destroy() +``` + +`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly. + +## Limitations + +- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one + stream's output is another stream's input. From 22389690c487e1bc05da15ed213b7e2f7bb0d7ed Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:43:36 -0700 Subject: [PATCH 023/109] [Doc] Update streams doc with AMDGPU support --- docs/source/user_guide/streams.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..cd26e01d20 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -9,6 +9,7 @@ and control synchronization with events. 
| Backend | Streams | Events | Notes | |---------|---------|--------|-------| | CUDA | Yes | Yes | Full concurrent execution | +| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | | CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | From 8cd793c888fec5815aa5b7d04361aad251da5268 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:51:10 -0700 Subject: [PATCH 024/109] [Doc] Add stream_parallel() section to streams user guide --- docs/source/user_guide/streams.md | 78 +++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index cd26e01d20..b9a2f5798e 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -1,23 +1,26 @@ # Streams Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default -stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently -and control synchronization with events. +stream, which serializes everything. With streams, you can run multiple top-level for loops in parallel. 
## Supported platforms -| Backend | Streams | Events | Notes | -|---------|---------|--------|-------| -| CUDA | Yes | Yes | Full concurrent execution | -| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | -| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Backend | Supported | +|---------|-----------| +| CUDA | Yes | +| AMDGPU | Yes | +| CPU | No-op | +| Metal | No-op | +| Vulkan | No-op | -On backends without native stream support, `create_stream()` and `create_event()` return objects with handle -`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. +On backends without native stream support, stream operations are no-ops and for loops run serially. Code using +streams is portable across all backends — it will run without modifications, but serially. -## Creating and using streams +## Stream parallelism + +Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. The runtime +creates temporary streams, launches the for loops, and synchronizes automatically before the next +non-parallel statement. 
```python import quadrants as qd @@ -27,17 +30,43 @@ qd.init(arch=qd.cuda) N = 1024 a = qd.field(qd.f32, shape=(N,)) b = qd.field(qd.f32, shape=(N,)) +c = qd.field(qd.f32, shape=(N,)) @qd.kernel -def fill_a(): - for i in range(N): - a[i] = 1.0 +def compute_ab(): + with qd.stream_parallel(): + for i in range(N): + a[i] = compute_a(i) + with qd.stream_parallel(): + for j in range(N): + b[j] = compute_b(j) @qd.kernel -def fill_b(): +def combine(): for i in range(N): - b[i] = 2.0 + c[i] = a[i] + b[i] + +compute_ab() # the two stream_parallel blocks run concurrently +combine() # runs after compute_ab() returns — a[] and b[] are ready +``` + +Consecutive `with qd.stream_parallel():` blocks run concurrently. Multiple for loops within a single block +share a stream and run serially on it. All streams are synchronized before the kernel returns. + +### Restrictions +- All top-level statements in a kernel must be either all `stream_parallel` blocks or all regular statements. + Mixing the two at the top level is a compile-time error. +- Nesting `stream_parallel` blocks is not supported. + +## Explicit streams + +For cases that require manual control — such as launching separate kernels on different streams or +interoperating with PyTorch — you can create and manage streams directly. + +### Creating and using streams + +```python s1 = qd.create_stream() s2 = qd.create_stream() @@ -54,7 +83,7 @@ s2.destroy() Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. -## Events +### Events Events let you express dependencies between streams without full synchronization. @@ -89,7 +118,7 @@ s2.destroy() `e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. 
-## Context managers +### Context managers Streams and events support `with` blocks for automatic cleanup: @@ -100,13 +129,13 @@ with qd.create_stream() as s: # s.destroy() called automatically ``` -## PyTorch interop (CUDA) +### PyTorch interop (CUDA) When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. -### Running Quadrants kernels on PyTorch's stream +#### Running Quadrants kernels on PyTorch's stream ```python import torch @@ -123,7 +152,7 @@ apply_actions_kernel(qd_stream=stream) Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. -### Running PyTorch operations on a Quadrants stream +#### Running PyTorch operations on a Quadrants stream ```python qd_stream = qd.create_stream() @@ -142,5 +171,6 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one - stream's output is another stream's input. +- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for + inserting events or `synchronize()` calls when one stream's output is another stream's input. + `stream_parallel` handles this automatically. 
From 08b85d5bd8df98d16d337cca55468af82eecb5c4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 09:07:21 -0700 Subject: [PATCH 025/109] [Doc] Note stream pooling in streams user guide --- docs/source/user_guide/streams.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index b9a2f5798e..87d662a045 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -18,9 +18,7 @@ streams is portable across all backends — it will run without modifications, b ## Stream parallelism -Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. The runtime -creates temporary streams, launches the for loops, and synchronizes automatically before the next -non-parallel statement. +Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. ```python import quadrants as qd From f2a2596c577235d796fa810a969d902e5dfe7016 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:11:04 -0700 Subject: [PATCH 026/109] Reflow stream.py docstrings to 120c line width --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8f6cfab3d6..5e54b227cd 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -10,8 +10,8 @@ def _get_prog_weakref(): class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. - On backends without native streams (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. + On backends without native streams (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. 
""" def __init__(self, handle: int, prog_ref: weakref.ref | None = None): @@ -54,8 +54,8 @@ def __exit__(self, *args): class Event: """Wraps a backend-specific GPU event for stream synchronization. - On backends without native events (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. + On backends without native events (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. """ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): From de99f3efb295525d5ef1c80b30dc0b0007c97290 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:15:31 -0700 Subject: [PATCH 027/109] Unwrap prose lines in streams.md to match repo doc style --- docs/source/user_guide/streams.md | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..0fb2627c0c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -1,8 +1,6 @@ # Streams -Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default -stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently -and control synchronization with events. +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently and control synchronization with events. ## Supported platforms @@ -13,8 +11,7 @@ and control synchronization with events. 
| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -On backends without native stream support, `create_stream()` and `create_event()` return objects with handle -`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle `0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. ## Creating and using streams @@ -50,8 +47,7 @@ s1.destroy() s2.destroy() ``` -Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute -concurrently. Call `synchronize()` to block until all work on a stream completes. +Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. ## Events @@ -85,8 +81,7 @@ s1.destroy() s2.destroy() ``` -`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait -until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. ## Context managers @@ -101,9 +96,7 @@ with qd.create_stream() as s: ## PyTorch interop (CUDA) -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to -avoid race conditions. 
Without explicit stream management, Quadrants and PyTorch may launch work on different -streams with no ordering guarantees, leading to intermittent data corruption. +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. ### Running Quadrants kernels on PyTorch's stream @@ -119,8 +112,7 @@ observations = compute_obs_tensor() # PyTorch op on the same stream apply_actions_kernel(qd_stream=stream) ``` -Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this -wrapper — PyTorch owns the underlying stream. +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. ### Running PyTorch operations on a Quadrants stream @@ -141,5 +133,4 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one - stream's output is another stream's input. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. From 401d6f81f0641c73118e1356feb9b87c3480e4f1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:13 -0700 Subject: [PATCH 028/109] Use CU_STREAM_NON_BLOCKING for user-created streams Streams created with CU_STREAM_DEFAULT (flag 0) implicitly synchronize with the legacy NULL stream, defeating concurrent execution when any code path (including the kernel launcher's sizer-context block) posts work on the NULL stream. 
Switch to CU_STREAM_NON_BLOCKING (0x1) to match PyTorch/JAX/CuPy conventions and deliver the concurrency the stream API promises. --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 83adc99627..a38ddd0dbb 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -498,7 +498,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From a3c98f8da17148524f73d6c5faf348337cd7e8a9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:40 -0700 Subject: [PATCH 029/109] Use async DtoH memcpy on active_stream for external array readback The post-kernel readback of host-backed external arrays used synchronous cuMemcpyDtoH which implicitly serializes through the NULL stream, defeating stream isolation. Switch to memcpy_device_to_host_async on active_stream with a scoped stream_synchronize, consistent with the HtoD direction already converted in this branch. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index de6bab83e6..8a33bf0b61 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,8 +253,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id]); + CUDADriver::get_instance().memcpy_device_to_host_async( + itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + CUDADriver::get_instance().stream_synchronize(active_stream); + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } From ca14f6753f35b3feedb4fd2f84e3ca0d3475a1e3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:50:01 -0700 Subject: [PATCH 030/109] Guard destroy()/__exit__ against destroying externally-owned handles Stream.__del__ already checks self._prog_ref is not None to avoid destroying handles wrapping external streams (e.g. PyTorch), but destroy() and __exit__ did not. A user doing `with Stream(torch_stream_ptr): ...` would destroy the PyTorch stream on block exit. Add the same ownership guard to destroy() for both Stream and Event. 
--- python/quadrants/lang/stream.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5e54b227cd..063a2aeafc 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -28,8 +28,11 @@ def synchronize(self): prog.stream_synchronize(self._handle) def destroy(self): - """Explicitly destroy the stream. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the stream. Safe to call multiple times. + + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). + """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.stream_destroy(self._handle) self._handle = 0 @@ -84,8 +87,11 @@ def synchronize(self): prog.event_synchronize(self._handle) def destroy(self): - """Explicitly destroy the event. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the event. Safe to call multiple times. + + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
+ """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.event_destroy(self._handle) self._handle = 0 From b46de06b5c0cc9892cadac4b22812f72d80522d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:58:15 -0700 Subject: [PATCH 031/109] Fix clang-format indentation for memcpy_device_to_host_async --- quadrants/runtime/cuda/kernel_launcher.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 8a33bf0b61..f3f48ab21e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,9 +253,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host_async( - itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id], active_stream); + CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); } CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 8efd51f116d3825d152ee67bfbb2430a5ee25d6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:04:18 -0700 Subject: [PATCH 032/109] Address review comments: fix AMDGPU stream issues - Fix stream_synchronize(nullptr) in do-while loop to sync active stream, mirroring the CUDA path (claude red) - Remove unused kernel_arg_pointer_ member from AMDGPUContext (claude yellow) - Reword misleading ROCm fallback comment to clarify it's per-device, not per-runtime-version (claude yellow) - Fix stream_create ABI: bind to hipStreamCreateWithFlags instead of hipStreamCreate to match the 
two-arg call signature (codex P2) --- quadrants/rhi/amdgpu/amdgpu_context.h | 1 - quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index b9fd5c403c..083406c3f9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -25,7 +25,6 @@ class AMDGPUContext { bool debug_{false}; bool supports_mem_pool_{false}; static thread_local void *stream_; - std::vector kernel_arg_pointer_; public: AMDGPUContext(); diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index b6a4d7ba3e..d91afcac00 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -16,7 +16,7 @@ PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management -PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management @@ -29,7 +29,7 @@ PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, voi PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); // hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// transparently fall back to the synchronous variants when unsupported. +// fall back to the synchronous variants on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 5bb5e70194..d54331f237 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -71,7 +71,8 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & do { launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks, context_pointer, arg_size); counter_val = 0; - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + auto *stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().stream_synchronize(stream); AMDGPUDriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From b9eef6e844a6940848a0f4a52c9f5820ef69e388 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:11 -0700 Subject: [PATCH 033/109] Use async DtoH on active_stream for do-while loop counter readback The do-while loop counter readback in launch_offloaded_tasks_with_do_while used synchronous cuMemcpyDtoH which serializes through the NULL stream, defeating stream isolation on every loop iteration. Switch to async memcpy on the active stream followed by stream_synchronize, matching the pattern used elsewhere in the launcher. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index f3f48ab21e..a1ccc470ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -71,8 +71,9 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); counter_val = 0; auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&counter_val, ctx.graph_do_while_flag_dev_ptr, + sizeof(int32_t), stream); CUDADriver::get_instance().stream_synchronize(stream); - CUDADriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From f0dd7d6acb648aef15f8bb726ac86a0d0bca9d05 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:30 -0700 Subject: [PATCH 034/109] Use active_stream for sizer device context staging The needs_sizer_device_ctx block (malloc_async, memcpy_host_to_device_async, mem_free_async) was using nullptr (NULL stream) while the consuming sizer kernel runs on active_stream. With non-blocking streams (e.g. wrapped PyTorch streams), there is no implicit ordering between them, creating a race where the sizer kernel could read stale or freed memory. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a1ccc470ab..ca27b78dd1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -227,9 +227,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx needs_sizer_device_ctx = needs_sizer_device_ctx && !CUDAContext::get_instance().supports_pageable_memory_access(); void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { - CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), nullptr); + CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), active_stream); CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), - sizeof(RuntimeContext), nullptr); + sizeof(RuntimeContext), active_stream); } if (ctx.graph_do_while_arg_id >= 0) { @@ -239,7 +239,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); } if (needs_sizer_device_ctx) { - CUDADriver::get_instance().mem_free_async(device_context_ptr, nullptr); + CUDADriver::get_instance().mem_free_async(device_context_ptr, active_stream); } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); From 8b3d4ed5f513603e1c3066090576cc0d90742329 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:08:04 -0700 Subject: [PATCH 035/109] Add make_current() to stream/event Program methods All other CUDA entry points (kernel_launcher, jit_cuda, graph_manager) call CUDAContext::get_instance().make_current() to bind the primary context on the calling thread. 
The new stream/event methods skipped this, which would cause CUDA_ERROR_INVALID_CONTEXT if called from a thread other than the qd.init thread. --- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a38ddd0dbb..5abcd255b3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -497,6 +497,7 @@ void Program::enqueue_compute_op_lambda(std::function(stream); @@ -508,6 +509,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -516,6 +518,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -524,6 +527,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -532,6 +536,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); @@ -543,6 +548,7 @@ uint64 Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if 
(compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -551,6 +557,7 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -560,6 +567,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -568,6 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From 34e9fa6aa47672ad4a59d2d2d4e952b1aec66698 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:14:01 -0700 Subject: [PATCH 036/109] Use HIP_STREAM_NON_BLOCKING for AMDGPU stream_create to mirror CUDA path --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 36c27942d0..f3fdeef548 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,7 +510,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *stream = nullptr; - 
AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From 3b0ba294ace8518f70f8cb0516787bc9651ed644 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:18:35 -0700 Subject: [PATCH 037/109] Restore deleted comments, fix docstring wrapping, fix per-task adstack publish in stream-parallel loop - Restore the deleted comments explaining why device_context_ptr is passed to publish_adstack_metadata (CUDA_ERROR_ILLEGAL_ADDRESS / hipErrorIllegalAddress on non-HMM GPUs). - Reflow stream.py docstring to 120-char wrap. - Move publish_adstack_metadata into the inner per-task loop for stream-parallel dispatch so each task gets its own adstack metadata published before launch (fixes latent bug for autodiff kernels). --- python/quadrants/lang/stream.py | 5 ++--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 11 ++++++++--- quadrants/runtime/cuda/kernel_launcher.cpp | 11 +++++++++-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 721d989109..395cc9d25c 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -132,9 +132,8 @@ def create_event() -> Event: def stream_parallel(): """Run top-level for loops in this block on separate GPU streams. - Used inside @qd.kernel. At Python runtime (outside kernels), this is a - no-op. During kernel compilation, the AST transformer calls into the C++ - ASTBuilder to tag loops with a stream-parallel group ID. + Used inside @qd.kernel. At Python runtime (outside kernels), this is a no-op. During kernel compilation, the AST + transformer calls into the C++ ASTBuilder to tag loops with a stream-parallel group ID. 
""" yield diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c71f8fa85..fa053b74b5 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -55,8 +55,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, auto *active_stream = AMDGPUContext::get_instance().get_stream(); for (size_t i = 0; i < offloaded_tasks.size();) { const auto &task = offloaded_tasks[i]; - executor->publish_adstack_metadata(task.ad_stack, resolve_num_threads(task, executor), &ctx, context_pointer); if (task.stream_parallel_group_id == 0) { + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without this the + // sizer launches with a host pointer and the next DtoH sync trips + // `hipErrorIllegalAddress ... memcpy_device_to_host` because HIP has no UVA fallback for the host + // `RuntimeContext` struct. + executor->publish_adstack_metadata(task.ad_stack, resolve_num_threads(task, executor), &ctx, context_pointer); QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, task.block_dim); amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, task.dynamic_shared_array_bytes, {(void *)&context_pointer}, {arg_size}); @@ -79,9 +83,10 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; + executor->publish_adstack_metadata(t.ad_stack, resolve_num_threads(t, executor), &ctx, context_pointer); AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, - {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); } for (auto &[sid, s] : stream_by_id) { diff --git 
a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a3e97e3a26..ac0ccc8896 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -54,9 +54,14 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, auto *active_stream = CUDAContext::get_instance().get_stream(); for (size_t i = 0; i < offloaded_tasks.size();) { const auto &task = offloaded_tasks[i]; - std::size_t n = resolve_num_threads(task.ad_stack, executor); - executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.stream_parallel_group_id == 0) { + std::size_t n = resolve_num_threads(task.ad_stack, executor); + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without it the sizer + // launches with a host pointer and the next DtoH sync trips `CUDA_ERROR_ILLEGAL_ADDRESS ... + // memcpy_device_to_host` on GPUs whose driver + kernel cannot coherently access pageable host memory (the HMM + // capability gated below in `launch_llvm_kernel`). `nullptr` on HMM-capable setups keeps + // `publish_adstack_metadata`'s host-pointer fast path. 
+ executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, task.block_dim); cuda_module->launch(task.name, task.grid_dim, task.block_dim, task.dynamic_shared_array_bytes, {&ctx.get_context()}, {}); @@ -79,6 +84,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; + std::size_t n_t = resolve_num_threads(t.ad_stack, executor); + executor->publish_adstack_metadata(t.ad_stack, n_t, &ctx, device_context_ptr); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); cuda_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, {}); } From 1c62eaecb93bca645a2f80cf40a3ee0ff849dead Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:22:05 -0700 Subject: [PATCH 038/109] Fix clang-format line break in AMDGPU kernel launcher --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index fa053b74b5..43664a68da 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -85,8 +85,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const auto &t = offloaded_tasks[j]; executor->publish_adstack_metadata(t.ad_stack, resolve_num_threads(t, executor), &ctx, context_pointer); AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, + {arg_size}); } for (auto &[sid, s] : stream_by_id) { From 
162239e38cbd9ce3fcd1365181c1f3470be194d8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:35:34 -0700 Subject: [PATCH 039/109] Use active stream for AMDGPU adstack metadata copies in publish_adstack_metadata AMDGPUContext::launch now dispatches on the user stream, so the adstack H2D copies must target the same stream to maintain ordering. Mirrors the CUDA branch. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..bc319f9c38 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -851,11 +851,10 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. the default stream), so the copies match that. + // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. Both CUDA and AMDGPU + // fetch the active stream from their respective context singletons (configurable via `set_stream`, + // defaults to the null stream), matching the stream used by `CUDAContext::launch` / + // `AMDGPUContext::launch`. 
#if defined(QD_WITH_CUDA) if (config_.arch == Arch::cuda) { void *active_stream = CUDAContext::get_instance().get_stream(); @@ -869,7 +868,7 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf #endif #if defined(QD_WITH_AMDGPU) if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. + void *active_stream = AMDGPUContext::get_instance().get_stream(); AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, active_stream); AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, From 216f7d53d91af16c033410a73be767332cf5625b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:42:54 -0700 Subject: [PATCH 040/109] Address Claude review: reject stream_parallel in @qd.func, use non-blocking streams - Reject qd.stream_parallel() inside @qd.func with a clear error; it's only valid in @qd.kernel. - Use CU_STREAM_NON_BLOCKING (0x1) for internal stream-parallel streams, matching the convention in Program::stream_create. Blocking streams (flag 0) serialize with the legacy NULL stream, defeating the purpose of parallel dispatch. 
--- python/quadrants/lang/ast/ast_transformer.py | 2 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 2 +- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 99e10bc4f9..b5b78455c6 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1541,6 +1541,8 @@ def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") + if not ctx.is_kernel: + raise QuadrantsSyntaxError("qd.stream_parallel() can only be used inside @qd.kernel, not @qd.func") ctx.ast_builder.begin_stream_parallel() build_stmts(ctx, node.body) ctx.ast_builder.end_stream_parallel() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 43664a68da..1da2ec5b0a 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -76,7 +76,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0); + AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*HIP_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ac0ccc8896..b11d3a334b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -77,7 +77,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid =
offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - CUDADriver::get_instance().stream_create(&s, 0); + CUDADriver::get_instance().stream_create(&s, 0x1 /*CU_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } From 9334efd4f102def5c5458e7ccd0a99f63e80d63e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:46:42 -0700 Subject: [PATCH 041/109] Add make_current() to all AMDGPU stream/event Program methods Mirrors commit 8b3d4ed from the CUDA path: HIP uses the same primary-context-per-thread model, so calling these methods from a non-init thread requires make_current() to bind the context first. --- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 89972bdf6f..2c9e57e378 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,6 +510,7 @@ uint64 Program::stream_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); @@ -527,6 +528,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -541,6 +543,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -555,6 +558,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if 
(compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -571,6 +575,7 @@ uint64 Program::event_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); @@ -588,6 +593,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -603,6 +609,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -618,6 +625,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -633,6 +641,7 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From aa4a70f91983d26fed7c73a380d5a13646997ed2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:52:32 -0700 Subject: [PATCH 042/109] Use async DtoH on active_stream for 
resolve_num_threads readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve_num_threads reads dynamic range_for begin/end from device temporaries via synchronous cuMemcpyDtoH (NULL stream). With CU_STREAM_NON_BLOCKING user streams, the prep task's store on active_stream has no ordering with the NULL stream, so the readback can return stale values — leading to wrong adstack sizing and either CUDA_ERROR_ILLEGAL_ADDRESS or silent gradient corruption. Switch to async memcpy on active_stream + stream_synchronize, matching the pattern used at all other DtoH sites in the launcher. --- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ca27b78dd1..005ad480e9 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -23,15 +23,17 @@ std::size_t resolve_num_threads(const AdStackSizingInfo &info, LlvmRuntimeExecut std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = CUDAContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + 
CUDADriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()` (`block_idx * block_dim + thread_idx`), so only `static_num_threads = grid_dim * From 1fba4f56f6a0a2a276ffb7bd23c2d8a6374fde6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:57:54 -0700 Subject: [PATCH 043/109] Use async DtoH on active_stream for AMDGPU resolve_num_threads readback Mirrors aa4a70f from the CUDA path: with non-blocking user streams, synchronous DtoH on the NULL stream has no ordering with the prep task's store on active_stream, risking stale begin/end values. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index d54331f237..bb19087586 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -25,15 +25,17 @@ std::size_t resolve_num_threads(const OffloadedTask &task, LlvmRuntimeExecutor * std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = AMDGPUContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + 
info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()`, so only `static_num_threads = grid_dim * block_dim` slices can be touched From 74604f2753aa778a200d65fe1d882f9c32f6f096 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:03:57 -0700 Subject: [PATCH 044/109] Allow docstrings in stream_parallel kernels, merge base branch updates The stream_parallel exclusivity validation now skips docstrings (bare string expressions at body[0]), so kernels with docstrings don't get falsely rejected. Also applied style cleanup from earlier review (use `if not any(...)` pattern). --- .../ast/ast_transformers/function_def_transformer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 135b702d6f..d6b64b5080 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -464,12 +464,17 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return False return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + @staticmethod + def _is_docstring(stmt: ast.stmt, index: int) -> bool: + return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) + @staticmethod def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: - has_sp = any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body) - if not has_sp: + if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): return - for stmt in body: + for i, stmt 
in enumerate(body): + if FunctionDefTransformer._is_docstring(stmt, i): + continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 5901a7fc83e7b17a3f6d580449b14db952ccf5d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:13:32 -0700 Subject: [PATCH 045/109] Sync active_stream at end of launch_llvm_kernel unconditionally The result-buffer DtoH and mem_free_async are queued on active_stream, but stream_synchronize only ran inside the transfers.size() > 0 branch. For the ndarray/CUDA-tensor path (transfers empty), the launcher returned with the DtoH still in flight on a CU_STREAM_NON_BLOCKING stream. The post-launcher cuStreamSynchronize(NULL) in runtime_ops.sync does not drain non-blocking streams, so fetch_ret_impl could read stale bytes. Move the sync to the end of the function unconditionally. --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 005ad480e9..d6931da87a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } + CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From f89bde02c5497856745bc93dd73fd2825ad2d489 Mon Sep 17 00:00:00 2001 From: Hugh Perkins 
Date: Fri, 1 May 2026 05:19:16 -0700 Subject: [PATCH 046/109] Sync active_stream unconditionally at end of AMDGPU launch_llvm_kernel Mirrors 5901a7fc from the CUDA path: when transfers is empty, the result-buffer DtoH and mem_free_async were left in-flight on a non-blocking stream with no sync before return. Also converts transfer DtoH copies to async to match CUDA. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index bb19087586..0c5b4bad05 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -211,13 +211,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - auto arg_id = idx.arg_id; - AMDGPUDriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[arg_id]); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ef3b95b18361dce692b02e4beff5a0a496fb5ff3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:30:20 -0700 Subject: [PATCH 047/109] Use async DtoH on active_stream for sizer stride readback The sizer kernel now runs on the user stream via Context::launch, so the synchronous copy_d2h (NULL 
stream) can read stale stride values with non-blocking streams. Use stream-aware async DtoH + sync for both CUDA and AMDGPU, falling back to copy_d2h for other backends. --- .../runtime/llvm/llvm_runtime_executor.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index bc319f9c38..1fff73575b 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -922,9 +922,9 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - // Invoke the device interpreter. On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. + // Invoke the device interpreter. `JITModule::call` launches this as a single-thread kernel on the active + // stream (CUDA/AMDGPU both dispatch through `{CUDA,AMDGPU}Context::launch` which uses `stream_`), so the + // writes are stream-ordered before the subsequent main-kernel dispatch. // // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of @@ -943,8 +943,27 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_context_ptr_for_sizer, bytecode_dev_ptr); // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // Use async DtoH on active_stream + sync so the readback is ordered after the sizer kernel. 
uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); +#if defined(QD_WITH_AMDGPU) + if (config_.arch == Arch::amdgpu) { + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } else +#endif +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + void *active_stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + CUDADriver::get_instance().stream_synchronize(active_stream); + } else +#endif + { + copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); + } stride = static_cast(stride_u64); } From fc5b710bbc7995d8b92f457359b238ab3125e2e3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:39:31 -0700 Subject: [PATCH 048/109] Add missing #include to amdgpu_context.h for IWYU consistency Mirrors the explicit include already added to cuda_context.h in this PR. The file compiled via transitive inclusion through kernel_profiler.h but should not depend on that. 
--- quadrants/rhi/amdgpu/amdgpu_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index a3515c30cb..9283afa078 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "quadrants/program/kernel_profiler.h" #include "quadrants/rhi/amdgpu/amdgpu_driver.h" From 8550aa012d4759d7e7de0737aabbce86b4c33bf7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:20 -0700 Subject: [PATCH 049/109] Fix end-of-launcher sync: conditional + dealloc race Two fixes in the post-launch cleanup: 1. The unconditional stream_synchronize(active_stream) blocked the host on every kernel launch, defeating stream concurrency for the common case (no return value, no host-backed arrays). Make it conditional: sync only when result_buffer_size > 0 (the stale-bytes path), or when transfers are present (already had its own sync). 2. The transfers branch queued async DtoH on active_stream then immediately deallocated device memory via mem_free_async(NULL stream). With CU_STREAM_NON_BLOCKING streams, the dealloc could race with the in-flight DtoH. Add stream_synchronize(active_stream) between the DtoH loop and the dealloc loop. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index d6931da87a..b0d2da095c 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + CUDADriver::get_instance().stream_synchronize(active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 6374cf3bfe91f700863af6cd510fe7ed00446f34 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:59 -0700 Subject: [PATCH 050/109] Reject qd_stream inside autograd Tape context The Tape replay path (Tape.grad) calls func.grad(*args) with no kwargs, so qd_stream is silently dropped and the backward kernel runs on the default stream with no ordering guarantee relative to the forward on the user's stream. Raise RuntimeError when both are used, matching the existing graph=True incompatibility pattern. Document the limitation. 
--- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/kernel.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0fb2627c0c..85d4e8d12c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,4 +133,5 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 7c68373f34..8a1004c6a8 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -649,6 +649,9 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.runtime.target_tape: + raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream.") if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From 7f0f29958c234668651c864fa999e696f4d3a895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:51:43 -0700 Subject: [PATCH 051/109] Fix end-of-launcher sync: conditional + dealloc race on AMDGPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors 8550aa0 from the CUDA path: 1. 
Make stream_synchronize conditional — only sync when result_buffer or transfers need it, avoiding host-blocking on every launch. 2. Add sync between async DtoH and device memory deallocation to prevent race with non-blocking streams. Also fixes black formatting from base branch merge. --- python/quadrants/lang/kernel.py | 6 ++++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..766689b02d 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -650,8 +650,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." 
+ ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c5b4bad05..b32e0981ea 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -214,12 +214,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); - AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ca8ace3b6be6ed794f0f7619f92d3328a61d1e41 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:57:14 -0700 Subject: [PATCH 052/109] Fix linter formatting; guard graph+stream; sync has_print on stream Three changes: 1. Fix black formatting of the Tape+stream RuntimeError. 2. Raise RuntimeError when qd_stream is passed to a graph=True kernel, enforcing the documented limitation in streams.md rather than silently bypassing the end-of-launcher sync. 3. When a kernel has print statements but no return value, and runs on a qd_stream, sync the user stream before runtime_ops.sync(). The NULL-stream sync in runtime_ops does not drain CU_STREAM_NON_BLOCKING user streams, so CUDA printf buffers would otherwise not be flushed. 
--- python/quadrants/lang/kernel.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..e0cdf945b5 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,6 +561,11 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED + if launch_ctx.use_graph and qd_stream is not None: + raise RuntimeError( + "qd_stream is not compatible with graph=True kernels. " + "See docs/source/user_guide/streams.md for details." + ) if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"): launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id stream_handle = qd_stream.handle if qd_stream is not None else 0 @@ -582,6 +587,8 @@ def launch_kernel( return_type = self.return_type if return_type or self.has_print: + if qd_stream is not None and self.has_print and not return_type: + qd_stream.synchronize() runtime_ops.sync() if not return_type: @@ -650,8 +657,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." 
+ ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From 1f471b37ab7728777cb0cb339ba16c0b3164301e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:06:11 -0700 Subject: [PATCH 053/109] Fix AMDGPU stream flag comment: HIP_STREAM_NON_BLOCKING not CU_STREAM_NON_BLOCKING --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 50b80294ce..e57c8675d7 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -78,7 +78,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*CU_STREAM_NON_BLOCKING*/); + AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*HIP_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } From 84806cfdfdd3b5aa366745872429892fc37c2157 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:14:18 -0700 Subject: [PATCH 054/109] Fix NULL-stream DtoH races in synchronize() and allocate_llvm_runtime_memory_jit synchronize() now drains the active user stream (if any) before the NULL stream, so fetch_result_uint64 callers (lazy field-pointer caches at three sites) read correct values when the runtime-query kernel ran on a non-blocking user stream. allocate_llvm_runtime_memory_jit: use async H2D on active_stream for the zero-stamp and sync the active stream before the DtoH readback, so the allocator kernel result is visible. 
--- quadrants/rhi/amdgpu/amdgpu_device.cpp | 6 ++++-- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index 68c377a73a..d127ce19a0 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -1,4 +1,5 @@ #include "quadrants/rhi/amdgpu/amdgpu_device.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/llvm/device_memory_pool.h" #include "quadrants/jit/jit_module.h" @@ -93,11 +94,12 @@ uint64_t *AmdgpuDevice::allocate_llvm_runtime_memory_jit(const LlvmRuntimeAllocP // the kernel without writing to *result. To detect that here, zero the slot first so a null readback unambiguously // means "allocation failed" and we can surface a helpful host-side message instead of letting the downstream // hipMemset trip on the stale pointer with a cryptic hipErrorInvalidValue. + void *active_stream = AMDGPUContext::get_instance().get_stream(); uint64 zero = 0; - AMDGPUDriver::get_instance().memcpy_host_to_device(params.result_buffer, &zero, sizeof(uint64)); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(params.result_buffer, &zero, sizeof(uint64), active_stream); params.runtime_jit->call("runtime_memory_allocate_aligned", params.runtime, params.size, quadrants_page_size, params.result_buffer); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); uint64 *ret{nullptr}; AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, params.result_buffer, sizeof(uint64)); QD_ERROR_IF(ret == nullptr, diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 1fff73575b..390987768a 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,12 +188,20 @@ void 
LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) + auto *active_stream = CUDAContext::get_instance().get_stream(); + if (active_stream != nullptr) { + CUDADriver::get_instance().stream_synchronize(active_stream); + } CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + if (active_stream != nullptr) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); From b1c6eea4249b29c530debbba122b502da4619592 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:32:06 -0700 Subject: [PATCH 055/109] Sync active_stream before adstack sizer stride readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit publish_adstack_metadata launches the sizer kernel on active_stream but reads the computed stride via synchronous copy_d2h (NULL stream). With CU_STREAM_NON_BLOCKING user streams, the NULL stream does not wait for the sizer kernel to complete, so the readback can return stale stride values — sizing the adstack heap incorrectly. Add stream_synchronize(active_stream) before the D2H. 
--- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..9e2d3c9041 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -943,7 +943,12 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // The sizer kernel runs on active_stream; drain it before reading the stride on the host. +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); + } +#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From 88f1bf7ef578e1043fc7df0b8fe575df7dde5bc7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:47:47 -0700 Subject: [PATCH 056/109] Add stream_parallel_group_id to QD_STMT_DEF_FIELDS for cache key correctness Without this, the offline cache considers two kernels that differ only in stream_parallel_group_id assignments as identical, potentially serving a cached version with wrong group IDs. 
--- quadrants/ir/statements.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index 503a1ed183..c9b0d79841 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,7 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized); + QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1012,7 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt); + QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1393,7 +1393,8 @@ class OffloadedStmt : public Stmt { reversed, num_cpu_threads, index_offsets, - mem_access_opt); + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From ca560b64d6e1f20ec4bcfc68d8081d87c466de10 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:58:28 -0700 Subject: [PATCH 057/109] Fix clang-format: multi-line QD_STMT_DEF_FIELDS for RangeForStmt and StructForStmt --- quadrants/ir/statements.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index c9b0d79841..c29c648995 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,14 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(begin, + end, + reversed, + is_bit_vectorized, + num_cpu_threads, + 
block_dim, + strictly_serialized, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1019,13 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(snode, + index_offsets, + is_bit_vectorized, + num_cpu_threads, + block_dim, + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From 397f29814f3997d284ca026d1c0db2d56fa46406 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:03:15 -0700 Subject: [PATCH 058/109] Fix clang-format: break long QD_STMT_DEF_FIELDS lines in statements.h --- quadrants/ir/statements.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index c9b0d79841..c29c648995 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,14 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(begin, + end, + reversed, + is_bit_vectorized, + num_cpu_threads, + block_dim, + strictly_serialized, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1019,13 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(snode, + index_offsets, + is_bit_vectorized, + num_cpu_threads, + block_dim, + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From ae1c932db2df45bdd0069e5c2a3b748a8b3d2128 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:17:21 -0700 Subject: [PATCH 059/109] Reflow comments and docstring to 120-char line width Co-authored-by: 
Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- tests/python/test_streams.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index d91afcac00..0b789cedf5 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -28,8 +28,8 @@ PER_AMDGPU_FUNCTION(memcpy_async, hipMemcpyAsync, void *, void *, std::size_t, u PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); -// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// fall back to the synchronous variants on devices without memory-pool support. +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers fall back to the synchronous variants +// on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index b32e0981ea..67befa8b66 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -105,8 +105,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx auto *active_stream = AMDGPUContext::get_instance().get_stream(); char *device_result_buffer{nullptr}; - // Must always allocate device_result_buffer (even when result_buffer_size - // is 0) to avoid memory access faults from allocate_memory_on_device below. + // Must always allocate device_result_buffer (even when result_buffer_size is 0) to avoid memory access faults + // from allocate_memory_on_device below. AMDGPUDriver::get_instance().malloc_async((void **)&device_result_buffer, std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..969d18ecf1 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -199,8 +199,7 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): @test_utils.test() def test_concurrent_streams_with_events(): - """Two slow kernels on separate streams run concurrently (~1s on GPU), - serial fallback on CPU/Metal.""" + """Two slow kernels on separate streams run concurrently (~1s on GPU), serial fallback on CPU/Metal.""" SPIN_ITERS = 5_000_000 @qd.kernel From 3ef0340bdbba610abfd400042a9617b7e0542f03 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:40:06 -0700 Subject: [PATCH 060/109] Use context/device synchronize in synchronize() to drain all streams stream_synchronize(nullptr) does not drain non-blocking user streams 
(CU_STREAM_NON_BLOCKING / HIP_STREAM_NON_BLOCKING), so qd.sync() failed to honor its "drain everything" contract. Python's finally block resets stream_ to nullptr before qd.sync() runs, making the previous active-stream check dead code for the user-facing path. Replace with cuCtxSynchronize (CUDA) / hipDeviceSynchronize (AMDGPU) which drain all streams on the device, correctly implementing the documented qd.sync() semantics. Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 3 +++ quadrants/rhi/cuda/cuda_driver_functions.inc.h | 3 +++ quadrants/runtime/llvm/llvm_runtime_executor.cpp | 14 ++++---------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 0b789cedf5..c94a7f14db 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -15,6 +15,9 @@ PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *); PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); +// Device synchronization +PER_AMDGPU_FUNCTION(device_synchronize, hipDeviceSynchronize); + // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 55c5e3e0b8..b4164b7c33 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -53,6 +53,9 @@ PER_CUDA_FUNCTION(kernel_get_occupancy, cuOccupancyMaxActiveBlocksPerMultiproces PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_attribute_enum, int); +// Context management +PER_CUDA_FUNCTION(context_synchronize, cuCtxSynchronize); + // Stream management 
PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 390987768a..6d631cfc2f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,21 +188,15 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - auto *active_stream = CUDAContext::get_instance().get_stream(); - if (active_stream != nullptr) { - CUDADriver::get_instance().stream_synchronize(active_stream); - } - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().context_synchronize(); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - auto *active_stream = AMDGPUContext::get_instance().get_stream(); - if (active_stream != nullptr) { - AMDGPUDriver::get_instance().stream_synchronize(active_stream); - } - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUContext::get_instance().make_current(); + AMDGPUDriver::get_instance().device_synchronize(); #else QD_ERROR("No AMDGPU support"); #endif From 3a81a46abcd5a53eea40df89e7283b4516479667 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:32:15 -0700 Subject: [PATCH 061/109] Use synchronous mem_free in dealloc_memory pool branch mem_free_async on the NULL stream does not sync with non-blocking user streams, so a Python ndarray dropped while a kernel is still in flight could return its slab to the mempool prematurely. Using synchronous mem_free matches pre-stream-rewire behavior and implicitly waits for all pending work on the device. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index d127ce19a0..280cd9f7e1 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -125,7 +125,7 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { } QD_ASSERT(!info.is_imported); if (info.use_memory_pool) { - AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr); + AMDGPUDriver::get_instance().mem_free(info.ptr); } else if (info.use_cached) { DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .release(info.size, (uint64_t *)info.ptr, false); From 3c6b24eb4706574a9bb755c335b0e1cda318b35b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:50:48 -0700 Subject: [PATCH 062/109] Add tests for stream/event context managers, event.synchronize, error paths Cover the gaps flagged by the test-coverage CI check: - Stream and Event used as context managers (__enter__/__exit__) - Event.synchronize() method - RuntimeError when qd_stream is combined with autograd Tape - RuntimeError when qd_stream is combined with graph=True Co-authored-by: Cursor --- tests/python/test_streams.py | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..8a00024220 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -1,6 +1,7 @@ """Tests for GPU stream and event support.""" import numpy as np +import pytest import quadrants as qd from quadrants.lang.stream import Event, Stream @@ -195,3 +196,80 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def test_stream_context_manager(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def 
fill(): + for i in range(N): + x[i] = 11.0 + + with qd.create_stream() as s: + fill(qd_stream=s) + s.synchronize() + assert s.handle == 0 + assert np.allclose(x.to_numpy(), 11.0) + + +@test_utils.test() +def test_event_context_manager(): + with qd.create_event() as e: + assert isinstance(e, Event) + assert e.handle == 0 + + +@test_utils.test() +def test_event_synchronize(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 13.0 + + s = qd.create_stream() + fill(qd_stream=s) + e = qd.create_event() + e.record(s) + e.synchronize() + assert np.allclose(x.to_numpy(), 13.0) + e.destroy() + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_tape_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autograd Tape"): + with qd.ad.Tape(loss): + compute(qd_stream=s) + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_graph_raises(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 1.0 + + fill.use_graph = True + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with graph=True"): + fill(qd_stream=s) + s.destroy() From 3499bbcccef6f174cbc15649b0dcbd00eaf5c990 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:04:44 -0700 Subject: [PATCH 063/109] Thread active_stream through AMDGPU profiler event_record and sync Profiler events were hardcoded to the NULL stream while kernels now run on user streams; with HIP_STREAM_NON_BLOCKING both events signal immediately on the empty NULL stream, yielding ~0 ms timings. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_profiler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp index 731d536bca..e963f7df20 100644 --- a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp @@ -59,8 +59,9 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - AMDGPUDriver::get_instance().event_record(handle, 0); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record(handle, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); // get elapsed time and destroy events auto record = event_toolkit_->get_current_event_record(); @@ -154,7 +155,8 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: AMDGPUDriver::get_instance().event_create(&(record.start_event), HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_create(&(record.stop_event), HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record((record.start_event), 0); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record((record.start_event), active_stream); event_records_.push_back(record); if (!base_event_) { @@ -163,7 +165,7 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: for (int i = 0; i < n_iters; i++) { void *e; AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_record(e, active_stream); AMDGPUDriver::get_instance().event_synchronize(e); auto final_t = Time::get_time(); if (i == n_iters - 1) { From c549e072779fc12f4a33c381d1f76ef6167cd0e7 Mon Sep 17 00:00:00 2001 From: 
Hugh Perkins Date: Fri, 1 May 2026 09:53:18 -0700 Subject: [PATCH 064/109] Fix graph+stream error guard and test Check self.use_graph instead of launch_ctx.use_graph so the error fires even when QD_GRAPH env var is off. Use @qd.kernel(graph=True) in the test instead of manually setting .use_graph attribute. Co-authored-by: Cursor --- python/quadrants/lang/kernel.py | 2 +- tests/python/test_streams.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index e0cdf945b5..cb337d1bc1 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,7 +561,7 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED - if launch_ctx.use_graph and qd_stream is not None: + if self.use_graph and qd_stream is not None: raise RuntimeError( "qd_stream is not compatible with graph=True kernels. " "See docs/source/user_guide/streams.md for details." diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 8a00024220..7f03703dac 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -263,12 +263,11 @@ def test_stream_with_graph_raises(): N = 64 x = qd.field(qd.f32, shape=(N,)) - @qd.kernel + @qd.kernel(graph=True) def fill(): for i in range(N): x[i] = 1.0 - fill.use_graph = True s = qd.create_stream() with pytest.raises(RuntimeError, match="not compatible with graph=True"): fill(qd_stream=s) From 5d284acf162364a7a1c271647388fe1111a09029 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:57:09 -0700 Subject: [PATCH 065/109] Update qd.sync() docstring and streams doc to reflect default-stream-only semantics qd.sync() synchronizes the default (NULL) stream, not explicit non-blocking streams. Update the docstring and add a note to the streams user guide. 
Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/runtime_ops.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 85d4e8d12c..0f9dbf7496 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -134,4 +134,5 @@ qd_stream.destroy() - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. - **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. +- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 0ecd122f56..8b07cfb99a 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -4,8 +4,12 @@ def sync(): - """Blocks the calling thread until all the previously - launched Quadrants kernels have completed. + """Synchronizes the default stream. + + Blocks the calling thread until all work on the default GPU stream + has completed. Kernels launched on explicit streams created via + :func:`quadrants.create_stream` are **not** waited on — call + ``stream.synchronize()`` for those. 
""" impl.get_runtime().sync() From df0b03a6d1505a47b583677b7d6af4bdf040388a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:12:05 -0700 Subject: [PATCH 066/109] Fix stream_parallel identity check failing on dual-import-path builds The _is_stream_parallel_with validation uses ASTResolver.resolve_to which compares objects with `is`. On Linux build runners where quadrants is available from both the source tree and installed location, the stream_parallel function object may differ between import paths. Add a fallback that checks __name__ and __module__ when identity fails, and add ASTResolver.resolve_value for general AST-to-object resolution. Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 4 +-- .../function_def_transformer.py | 10 +++++- python/quadrants/lang/ast/symbol_resolver.py | 32 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index b5b78455c6..a0048ccb61 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -40,7 +40,7 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length -from quadrants.lang.stream import stream_parallel + from quadrants.lang.struct import Struct, StructType from quadrants.lang.util import ( is_from_quadrants_module as _is_from_quadrants_module, @@ -1539,7 +1539,7 @@ def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: raise QuadrantsSyntaxError("'with ... as ...' 
is not supported in Quadrants kernels") if not isinstance(item.context_expr, ast.Call): raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") - if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): + if not FunctionDefTransformer._is_stream_parallel_with(node, ctx.global_vars): raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") if not ctx.is_kernel: raise QuadrantsSyntaxError("qd.stream_parallel() can only be used inside @qd.kernel, not @qd.func") diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index d6b64b5080..12997eba80 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -462,7 +462,15 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo item = stmt.items[0] if not isinstance(item.context_expr, ast.Call): return False - return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + func_node = item.context_expr.func + if ASTResolver.resolve_to(func_node, stream_parallel, global_vars): + return True + resolved = ASTResolver.resolve_value(func_node, global_vars) + return ( + resolved is not None + and getattr(resolved, "__name__", None) == "stream_parallel" + and getattr(resolved, "__module__", None) == "quadrants.lang.stream" + ) @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: diff --git a/python/quadrants/lang/ast/symbol_resolver.py b/python/quadrants/lang/ast/symbol_resolver.py index 81296fcefb..f95373a463 100644 --- a/python/quadrants/lang/ast/symbol_resolver.py +++ b/python/quadrants/lang/ast/symbol_resolver.py @@ -55,3 +55,35 @@ def resolve_to(node, wanted, scope): return False # The name ``scope`` here could be a bit confusing return scope is wanted + + 
@staticmethod + def resolve_value(node, scope): + """Resolve an AST Name/Attribute node to a Python object. + + Same traversal as resolve_to but returns the resolved object (or None) + instead of comparing against a wanted value. + """ + if isinstance(node, ast.Name): + return scope.get(node.id) if isinstance(scope, dict) else None + + if not isinstance(node, ast.Attribute): + return None + + v = node.value + chain = [node.attr] + while isinstance(v, ast.Attribute): + chain.append(v.attr) + v = v.value + if not isinstance(v, ast.Name): + return None + chain.append(v.id) + + for attr in reversed(chain): + try: + if isinstance(scope, dict): + scope = scope[attr] + else: + scope = getattr(scope, attr) + except (KeyError, AttributeError): + return None + return scope From ff8056d34acecbb295d40a3a216e33b0fd0ddab8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:17:39 -0700 Subject: [PATCH 067/109] Reflow sync() docstring to 120-char line width Co-authored-by: Cursor --- python/quadrants/lang/runtime_ops.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 8b07cfb99a..71919e2379 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -6,10 +6,8 @@ def sync(): """Synchronizes the default stream. - Blocks the calling thread until all work on the default GPU stream - has completed. Kernels launched on explicit streams created via - :func:`quadrants.create_stream` are **not** waited on — call - ``stream.synchronize()`` for those. + Blocks the calling thread until all work on the default GPU stream has completed. Kernels launched on explicit + streams created via :func:`quadrants.create_stream` are **not** waited on — call ``stream.synchronize()`` for those. 
""" impl.get_runtime().sync() From acff351a403af45f3cf0b27660ae2033c2544401 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:22:39 -0700 Subject: [PATCH 068/109] Remove unused ASTResolver import from ast_transformer.py Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index a0048ccb61..152a952044 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -28,7 +28,6 @@ from quadrants.lang.ast.ast_transformers.function_def_transformer import ( FunctionDefTransformer, ) -from quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception import ( QuadrantsIndexError, QuadrantsRuntimeTypeError, From 70eb471521763e251588c48e3e607248f8152c64 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:33:14 -0700 Subject: [PATCH 069/109] Fix import sorting in ast_transformer.py Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 152a952044..263a4a11a3 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -39,7 +39,6 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length - from quadrants.lang.struct import Struct, StructType from quadrants.lang.util import ( is_from_quadrants_module as _is_from_quadrants_module, From ebd5e119cf5019e8539e1de5f3a75d1d8c936e22 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:36:40 -0700 Subject: [PATCH 070/109] Add AST-level fallback for stream_parallel detection When object resolution fails (dual import paths), fall back to checking the AST 
node name directly. Inside @qd.kernel the only valid with-context is qd.stream_parallel(), so checking the attribute name is sufficient. Co-authored-by: Cursor --- .../ast_transformers/function_def_transformer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 12997eba80..7a42dfff87 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -466,11 +466,16 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo if ASTResolver.resolve_to(func_node, stream_parallel, global_vars): return True resolved = ASTResolver.resolve_value(func_node, global_vars) - return ( - resolved is not None - and getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", None) == "quadrants.lang.stream" - ) + if resolved is not None: + return ( + getattr(resolved, "__name__", None) == "stream_parallel" + and getattr(resolved, "__module__", "").startswith("quadrants") + ) + if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": + return True + if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": + return True + return False @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: From a6c385200add940d1f7182041e756c7b6748e744 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:37:08 -0700 Subject: [PATCH 071/109] Add diagnostic info to stream_parallel exclusivity error message Include the failing statement type, index, and body length to help debug the persistent Linux build x64 test failures. 
Co-authored-by: Cursor --- .../lang/ast/ast_transformers/function_def_transformer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 7a42dfff87..4ffee5fc2e 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -489,8 +489,14 @@ def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dic if FunctionDefTransformer._is_docstring(stmt, i): continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): + stmt_desc = f"{type(stmt).__name__}" + if isinstance(stmt, ast.With) and stmt.items: + ctx_expr = stmt.items[0].context_expr + if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): + stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " "in the kernel must be 'with qd.stream_parallel():' blocks. " - "Move non-parallel code to a separate kernel." + f"Move non-parallel code to a separate kernel. 
" + f"[stmt {i}: {stmt_desc}, body_len={len(body)}]" ) From 03d2b293908f17fe3f7a8e7ba78720a45f8d620d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:55:46 -0700 Subject: [PATCH 072/109] Fix black formatting in function_def_transformer.py Co-authored-by: Cursor --- .../lang/ast/ast_transformers/function_def_transformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 4ffee5fc2e..23a1f9431a 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -467,10 +467,9 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return True resolved = ASTResolver.resolve_value(func_node, global_vars) if resolved is not None: - return ( - getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", "").startswith("quadrants") - ) + return getattr(resolved, "__name__", None) == "stream_parallel" and getattr( + resolved, "__module__", "" + ).startswith("quadrants") if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": return True if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": From 3af5bc8607784720664d4ef18051da1712b60e1f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 13:49:18 -0700 Subject: [PATCH 073/109] Apply black formatting to function_def_transformer.py Co-authored-by: Cursor --- .../function_def_transformer.py | 121 +++++++++++++----- 1 file changed, 91 insertions(+), 30 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 4ffee5fc2e..123767be55 100644 --- 
a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -56,7 +56,9 @@ def _decl_and_create_variable( assert this_arg_features is not None marker = this_arg_features[0] if marker == _TENSOR_T_NDARRAY_MARKER: - raw_element_type, ndim, needs_grad, boundary, layout = this_arg_features[1:] + raw_element_type, ndim, needs_grad, boundary, layout = ( + this_arg_features[1:] + ) return False, ( kernel_arguments.decl_ndarray_arg, ( @@ -75,7 +77,9 @@ def _decl_and_create_variable( assert ctx.global_vars is not None return True, ctx.global_vars.get(name) raise AssertionError(f"unknown qd.Tensor marker: {marker!r}") - if annotation == annotations.template or isinstance(annotation, annotations.template): + if annotation == annotations.template or isinstance( + annotation, annotations.template + ): if name in ctx.template_vars: return True, ctx.template_vars[name] assert ctx.global_vars is not None @@ -98,8 +102,12 @@ def _decl_and_create_variable( needs_grad, BoundaryMode(boundary), ) - offset = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_offset") - size = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_size") + offset = kernel_arguments.decl_scalar_arg( + primitive_types.i32, full_name + "_offset" + ) + size = kernel_arguments.decl_scalar_arg( + primitive_types.i32, full_name + "_size" + ) return True, BufferView(arr, offset, size) if isinstance(annotation, ndarray_type.NdarrayType): assert this_arg_features is not None @@ -139,7 +147,10 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, argument_type) for field_idx, field in enumerate(dataclasses.fields(argument_type)): flat_name = create_flat_name(argument_name, field.name) - if pruning.enforcing and flat_name not in pruning.used_vars_by_func_id[func_id]: + if ( + pruning.enforcing + and flat_name not in pruning.used_vars_by_func_id[func_id] + ): continue # if a field is a 
dataclass, then feed back into process_kernel_arg recursively if dataclasses.is_dataclass(field.type): @@ -177,7 +188,9 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, obj) @staticmethod - def _transform_as_kernel(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: + def _transform_as_kernel( + ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments + ) -> None: assert ctx.func is not None assert ctx.arg_features is not None if node.returns is not None: @@ -226,7 +239,9 @@ def _walk_obj(obj, arg_idx, path): child = child._unwrap() if isinstance(child, _ndarray.Ndarray): _register_ndarray(child, arg_idx, (*path, field.name)) - elif dataclasses.is_dataclass(child) and not isinstance(child, type): + elif dataclasses.is_dataclass(child) and not isinstance( + child, type + ): _walk_obj(child, arg_idx, (*path, field.name)) else: for attr_name, attr_val in vars(obj).items(): @@ -250,7 +265,9 @@ def _register_ndarray(nd, arg_idx, attr_chain): element_type, ndim, name, needs_grad ) arr = any_array.AnyArray( - _qd_core.make_external_tensor_expr(element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE), + _qd_core.make_external_tensor_expr( + element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE + ), _qd_layout=layout, ) cache[key] = arr @@ -259,7 +276,9 @@ def _register_ndarray(nd, arg_idx, attr_chain): assert ctx.py_args is not None for i, arg_meta in enumerate(ctx.func.arg_metas): anno = arg_meta.annotation - is_template = anno is annotations.template or isinstance(anno, annotations.template) + is_template = anno is annotations.template or isinstance( + anno, annotations.template + ) is_tensor_anno = anno is _TensorClass if not (is_template or is_tensor_anno): continue @@ -297,15 +316,21 @@ def _transform_func_arg( # directly — ndarray and field impls are both valid pass-by-reference arguments. 
if argument_type is _TensorClass: data = FunctionDefTransformer._unwrap_tensor(data) - _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) + _cache = getattr( + getattr(ctx, "global_context", None), "ndarray_to_any_array", None + ) promoted = _cache.get(id(data)) if _cache else None - ctx.create_variable(argument_name, promoted if promoted is not None else data) + ctx.create_variable( + argument_name, promoted if promoted is not None else data + ) return None if dataclasses.is_dataclass(argument_type): for field in dataclasses.fields(argument_type): flat_name = create_flat_name(argument_name, field.name) - data_child = FunctionDefTransformer._unwrap_tensor(getattr(data, field.name)) + data_child = FunctionDefTransformer._unwrap_tensor( + getattr(data, field.name) + ) if isinstance( data_child, ( @@ -317,11 +342,19 @@ def _transform_func_arg( ): # qd.Tensor struct fields skip check_matched (the Tensor class has no such method — it is # polymorphic). - if field.type is not _TensorClass and hasattr(field.type, "check_matched"): + if field.type is not _TensorClass and hasattr( + field.type, "check_matched" + ): field.type.check_matched(data_child.get_type(), field.name) - _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) + _cache = getattr( + getattr(ctx, "global_context", None), + "ndarray_to_any_array", + None, + ) promoted = _cache.get(id(data_child)) if _cache else None - ctx.create_variable(flat_name, promoted if promoted is not None else data_child) + ctx.create_variable( + flat_name, promoted if promoted is not None else data_child + ) elif dataclasses.is_dataclass(data_child): FunctionDefTransformer._transform_func_arg( ctx, @@ -338,9 +371,17 @@ def _transform_func_arg( # Ndarray arguments are passed by reference. 
if isinstance(argument_type, (ndarray_type.NdarrayType)): if not isinstance( - data, (_ndarray.ScalarNdarray, matrix.VectorNdarray, matrix.MatrixNdarray, any_array.AnyArray) + data, + ( + _ndarray.ScalarNdarray, + matrix.VectorNdarray, + matrix.MatrixNdarray, + any_array.AnyArray, + ), ): - raise QuadrantsSyntaxError(f"Argument {argument_name} of type {argument_type} is not recognized.") + raise QuadrantsSyntaxError( + f"Argument {argument_name} of type {argument_type} is not recognized." + ) argument_type.check_matched(data.get_type(), argument_name) ctx.create_variable(argument_name, data) return None @@ -350,7 +391,9 @@ def _transform_func_arg( # not here — data.arr is an Expr node during func compilation, not a real Ndarray. if isinstance(argument_type, buffer_view_type.BufferViewType): if not isinstance(data, BufferView): - raise QuadrantsSyntaxError(f"Argument {argument_name} expects a BufferView, got {type(data).__name__}") + raise QuadrantsSyntaxError( + f"Argument {argument_name} expects a BufferView, got {type(data).__name__}" + ) ctx.create_variable(argument_name, data) return None @@ -389,7 +432,9 @@ def _transform_func_arg( return None if id(argument_type) in primitive_types.type_ids: - ctx.create_variable(argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type))) + ctx.create_variable( + argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type)) + ) return None # Create a copy for non-template arguments, # so that they are passed by value. 
@@ -398,7 +443,9 @@ def _transform_func_arg( return None @staticmethod - def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: + def _transform_as_func( + ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments + ) -> None: # pylint: disable=import-outside-toplevel from quadrants.lang.kernel_impl import Func @@ -406,7 +453,9 @@ def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, ar assert ctx.py_args is not None for py_arg_i, py_arg in enumerate(ctx.py_args): argument = ctx.func.arg_metas_expanded[py_arg_i] - FunctionDefTransformer._transform_func_arg(ctx, argument.name, argument.annotation, py_arg) + FunctionDefTransformer._transform_func_arg( + ctx, argument.name, argument.annotation, py_arg + ) # deal with dataclasses for v in ctx.func.orig_arguments: @@ -446,7 +495,9 @@ def build_FunctionDef( FunctionDefTransformer._transform_as_func(ctx, node, args) if ctx.is_kernel: - FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) + FunctionDefTransformer._validate_stream_parallel_exclusivity( + node.body, ctx.global_vars + ) with ctx.variable_scope_guard(): build_stmts(ctx, node.body) @@ -467,10 +518,9 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return True resolved = ASTResolver.resolve_value(func_node, global_vars) if resolved is not None: - return ( - getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", "").startswith("quadrants") - ) + return getattr(resolved, "__name__", None) == "stream_parallel" and getattr( + resolved, "__module__", "" + ).startswith("quadrants") if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": return True if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": @@ -479,11 +529,20 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo @staticmethod def 
_is_docstring(stmt: ast.stmt, index: int) -> bool: - return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) + return ( + index == 0 + and isinstance(stmt, ast.Expr) + and isinstance(stmt.value, (ast.Constant, ast.Str)) + ) @staticmethod - def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: - if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): + def _validate_stream_parallel_exclusivity( + body: list[ast.stmt], global_vars: dict[str, Any] + ) -> None: + if not any( + FunctionDefTransformer._is_stream_parallel_with(s, global_vars) + for s in body + ): return for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): @@ -492,7 +551,9 @@ def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dic stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: ctx_expr = stmt.items[0].context_expr - if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): + if isinstance(ctx_expr, ast.Call) and isinstance( + ctx_expr.func, ast.Attribute + ): stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 28440602d0796fb0f15dfa176932c68ab499dc57 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 14:08:47 -0700 Subject: [PATCH 074/109] Fix black formatting in function_def_transformer.py (post-merge) Co-authored-by: Cursor --- .../function_def_transformer.py | 23 ++++--------------- 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 123767be55..debbd2efa9 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ 
b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -495,9 +495,7 @@ def build_FunctionDef( FunctionDefTransformer._transform_as_func(ctx, node, args) if ctx.is_kernel: - FunctionDefTransformer._validate_stream_parallel_exclusivity( - node.body, ctx.global_vars - ) + FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) with ctx.variable_scope_guard(): build_stmts(ctx, node.body) @@ -529,20 +527,11 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: - return ( - index == 0 - and isinstance(stmt, ast.Expr) - and isinstance(stmt.value, (ast.Constant, ast.Str)) - ) + return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) @staticmethod - def _validate_stream_parallel_exclusivity( - body: list[ast.stmt], global_vars: dict[str, Any] - ) -> None: - if not any( - FunctionDefTransformer._is_stream_parallel_with(s, global_vars) - for s in body - ): + def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: + if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): return for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): @@ -551,9 +540,7 @@ def _validate_stream_parallel_exclusivity( stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: ctx_expr = stmt.items[0].context_expr - if isinstance(ctx_expr, ast.Call) and isinstance( - ctx_expr.func, ast.Attribute - ): + if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 5903e499b25e83d4fd3930e28e6ab67c21033d1b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 14:21:05 -0700 Subject: [PATCH 075/109] Run black -l 
120 on function_def_transformer.py (post-merge formatting) Co-authored-by: Cursor --- .../function_def_transformer.py | 77 +++++-------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index debbd2efa9..2878921709 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -56,9 +56,7 @@ def _decl_and_create_variable( assert this_arg_features is not None marker = this_arg_features[0] if marker == _TENSOR_T_NDARRAY_MARKER: - raw_element_type, ndim, needs_grad, boundary, layout = ( - this_arg_features[1:] - ) + raw_element_type, ndim, needs_grad, boundary, layout = this_arg_features[1:] return False, ( kernel_arguments.decl_ndarray_arg, ( @@ -77,9 +75,7 @@ def _decl_and_create_variable( assert ctx.global_vars is not None return True, ctx.global_vars.get(name) raise AssertionError(f"unknown qd.Tensor marker: {marker!r}") - if annotation == annotations.template or isinstance( - annotation, annotations.template - ): + if annotation == annotations.template or isinstance(annotation, annotations.template): if name in ctx.template_vars: return True, ctx.template_vars[name] assert ctx.global_vars is not None @@ -102,12 +98,8 @@ def _decl_and_create_variable( needs_grad, BoundaryMode(boundary), ) - offset = kernel_arguments.decl_scalar_arg( - primitive_types.i32, full_name + "_offset" - ) - size = kernel_arguments.decl_scalar_arg( - primitive_types.i32, full_name + "_size" - ) + offset = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_offset") + size = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_size") return True, BufferView(arr, offset, size) if isinstance(annotation, ndarray_type.NdarrayType): assert this_arg_features is not None @@ -147,10 +139,7 @@ def 
_transform_kernel_arg( ctx.create_variable(argument_name, argument_type) for field_idx, field in enumerate(dataclasses.fields(argument_type)): flat_name = create_flat_name(argument_name, field.name) - if ( - pruning.enforcing - and flat_name not in pruning.used_vars_by_func_id[func_id] - ): + if pruning.enforcing and flat_name not in pruning.used_vars_by_func_id[func_id]: continue # if a field is a dataclass, then feed back into process_kernel_arg recursively if dataclasses.is_dataclass(field.type): @@ -188,9 +177,7 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, obj) @staticmethod - def _transform_as_kernel( - ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments - ) -> None: + def _transform_as_kernel(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: assert ctx.func is not None assert ctx.arg_features is not None if node.returns is not None: @@ -239,9 +226,7 @@ def _walk_obj(obj, arg_idx, path): child = child._unwrap() if isinstance(child, _ndarray.Ndarray): _register_ndarray(child, arg_idx, (*path, field.name)) - elif dataclasses.is_dataclass(child) and not isinstance( - child, type - ): + elif dataclasses.is_dataclass(child) and not isinstance(child, type): _walk_obj(child, arg_idx, (*path, field.name)) else: for attr_name, attr_val in vars(obj).items(): @@ -265,9 +250,7 @@ def _register_ndarray(nd, arg_idx, attr_chain): element_type, ndim, name, needs_grad ) arr = any_array.AnyArray( - _qd_core.make_external_tensor_expr( - element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE - ), + _qd_core.make_external_tensor_expr(element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE), _qd_layout=layout, ) cache[key] = arr @@ -276,9 +259,7 @@ def _register_ndarray(nd, arg_idx, attr_chain): assert ctx.py_args is not None for i, arg_meta in enumerate(ctx.func.arg_metas): anno = arg_meta.annotation - is_template = anno is annotations.template or isinstance( - anno, annotations.template 
- ) + is_template = anno is annotations.template or isinstance(anno, annotations.template) is_tensor_anno = anno is _TensorClass if not (is_template or is_tensor_anno): continue @@ -316,21 +297,15 @@ def _transform_func_arg( # directly — ndarray and field impls are both valid pass-by-reference arguments. if argument_type is _TensorClass: data = FunctionDefTransformer._unwrap_tensor(data) - _cache = getattr( - getattr(ctx, "global_context", None), "ndarray_to_any_array", None - ) + _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) promoted = _cache.get(id(data)) if _cache else None - ctx.create_variable( - argument_name, promoted if promoted is not None else data - ) + ctx.create_variable(argument_name, promoted if promoted is not None else data) return None if dataclasses.is_dataclass(argument_type): for field in dataclasses.fields(argument_type): flat_name = create_flat_name(argument_name, field.name) - data_child = FunctionDefTransformer._unwrap_tensor( - getattr(data, field.name) - ) + data_child = FunctionDefTransformer._unwrap_tensor(getattr(data, field.name)) if isinstance( data_child, ( @@ -342,9 +317,7 @@ def _transform_func_arg( ): # qd.Tensor struct fields skip check_matched (the Tensor class has no such method — it is # polymorphic). 
- if field.type is not _TensorClass and hasattr( - field.type, "check_matched" - ): + if field.type is not _TensorClass and hasattr(field.type, "check_matched"): field.type.check_matched(data_child.get_type(), field.name) _cache = getattr( getattr(ctx, "global_context", None), @@ -352,9 +325,7 @@ def _transform_func_arg( None, ) promoted = _cache.get(id(data_child)) if _cache else None - ctx.create_variable( - flat_name, promoted if promoted is not None else data_child - ) + ctx.create_variable(flat_name, promoted if promoted is not None else data_child) elif dataclasses.is_dataclass(data_child): FunctionDefTransformer._transform_func_arg( ctx, @@ -379,9 +350,7 @@ def _transform_func_arg( any_array.AnyArray, ), ): - raise QuadrantsSyntaxError( - f"Argument {argument_name} of type {argument_type} is not recognized." - ) + raise QuadrantsSyntaxError(f"Argument {argument_name} of type {argument_type} is not recognized.") argument_type.check_matched(data.get_type(), argument_name) ctx.create_variable(argument_name, data) return None @@ -391,9 +360,7 @@ def _transform_func_arg( # not here — data.arr is an Expr node during func compilation, not a real Ndarray. if isinstance(argument_type, buffer_view_type.BufferViewType): if not isinstance(data, BufferView): - raise QuadrantsSyntaxError( - f"Argument {argument_name} expects a BufferView, got {type(data).__name__}" - ) + raise QuadrantsSyntaxError(f"Argument {argument_name} expects a BufferView, got {type(data).__name__}") ctx.create_variable(argument_name, data) return None @@ -432,9 +399,7 @@ def _transform_func_arg( return None if id(argument_type) in primitive_types.type_ids: - ctx.create_variable( - argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type)) - ) + ctx.create_variable(argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type))) return None # Create a copy for non-template arguments, # so that they are passed by value. 
@@ -443,9 +408,7 @@ def _transform_func_arg( return None @staticmethod - def _transform_as_func( - ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments - ) -> None: + def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: # pylint: disable=import-outside-toplevel from quadrants.lang.kernel_impl import Func @@ -453,9 +416,7 @@ def _transform_as_func( assert ctx.py_args is not None for py_arg_i, py_arg in enumerate(ctx.py_args): argument = ctx.func.arg_metas_expanded[py_arg_i] - FunctionDefTransformer._transform_func_arg( - ctx, argument.name, argument.annotation, py_arg - ) + FunctionDefTransformer._transform_func_arg(ctx, argument.name, argument.annotation, py_arg) # deal with dataclasses for v in ctx.func.orig_arguments: From 360adc8fad4e9709f51016fd131686f41679c64e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:28:58 -0700 Subject: [PATCH 076/109] Reject qd_stream on autodiff kernels Streams are not compatible with reverse-mode or forward-mode differentiation. The adstack sizer and Tape replay paths assume the default stream; rather than fixing every race, block the combination at the Python entry point with a clear error message. Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 2 +- python/quadrants/lang/kernel.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0f9dbf7496..b4b70b774b 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,6 +133,6 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. 
+- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. - **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index eecf92631a..0b45a5816b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -664,6 +664,11 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.autodiff_mode != _NONE: + raise RuntimeError( + "qd_stream is not compatible with autodiff kernels. Streams cannot be used with " + "reverse-mode or forward-mode differentiation." + ) if qd_stream is not None and self.runtime.target_tape: raise RuntimeError( "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " From e20fe99687dd0f2cfb78a7895414bd481d6f7fa6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:29:55 -0700 Subject: [PATCH 077/109] Revert adstack sizer stream_synchronize Autodiff+streams is now blocked at the Python level, so the adstack code path never runs on a non-default stream. Remove the unnecessary stream_synchronize we added in publish_adstack_metadata. 
Co-authored-by: Cursor --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 214c12de11..8326335dfb 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -941,12 +941,6 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // The sizer kernel runs on active_stream; drain it before reading the stride on the host. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); - } -#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From e3c5f6f59461392be9b16ea76550b278649a8899 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:40:06 -0700 Subject: [PATCH 078/109] Reset llvm_runtime_executor.cpp to upstream Our branch had a stale copy of publish_adstack_metadata and ensure_adstack_heap that conflicted with upstream's refactor into ensure_adstack_heap_float / ensure_adstack_heap_int. Since autodiff is now blocked with streams at the Python level, we have no changes to make in this file. 
Co-authored-by: Cursor --- .../runtime/llvm/llvm_runtime_executor.cpp | 425 ------------------ 1 file changed, 425 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 8326335dfb..658c139c0f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -606,431 +606,6 @@ void *LlvmRuntimeExecutor::get_runtime_temporaries_device_ptr() { return runtime_temporaries_cache_; } -// Publish the per-task adstack metadata into the LLVMRuntime struct and size the heap. The codegen path loads -// stride / offset / max_size from these fields at every `AdStack*` site (see `ensure_ad_stack_metadata_llvm` in -// codegen_llvm.cpp), so we must write them before every launch even for tasks where the compile-time and -// launch-time bounds agree. `evaluate_adstack_size_expr` is called only when the symbolic tree is available; the -// offline cache does not currently serialize `SizeExpr`, so cache hits fall back to `max_size_compile_time`. -std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInfo &ad_stack, - std::size_t num_threads, - LaunchContextBuilder *ctx, - void *device_runtime_context_ptr) { - const auto n_stacks = ad_stack.allocas.size(); - if (n_stacks == 0 || num_threads == 0) { - return 0; - } - auto align_up_8 = [](std::size_t n) -> std::size_t { return (n + 7u) & ~std::size_t{7u}; }; - // Allocate / grow the two device-side metadata arrays. Capacity is in u64 entries, kept at or above n_stacks. - // On GPU these buffers are written exclusively by the device-side sizer kernel (`runtime_eval_adstack_size_expr`); - // on CPU the host evaluator writes them directly via `std::memcpy`. Either way the pointers published into - // `runtime->adstack_offsets` / `adstack_max_sizes` stay stable across launches unless we grow here. 
- auto grow_to = [&](DeviceAllocationUnique &alloc, std::size_t capacity_u64) { - Device::AllocParams params{}; - params.size = capacity_u64 * sizeof(uint64_t); - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, "Failed to allocate {} bytes for adstack metadata array (err: {})", - params.size, int(res)); - alloc = std::make_unique(std::move(new_alloc)); - }; - if (n_stacks > adstack_metadata_capacity_) { - std::size_t new_cap = std::max(n_stacks, 2 * adstack_metadata_capacity_); - grow_to(adstack_offsets_alloc_, new_cap); - grow_to(adstack_max_sizes_alloc_, new_cap); - adstack_metadata_capacity_ = new_cap; - } - void *offsets_dev_ptr = get_device_alloc_info_ptr(*adstack_offsets_alloc_); - void *max_sizes_dev_ptr = get_device_alloc_info_ptr(*adstack_max_sizes_alloc_); - - auto copy_h2d = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - auto copy_d2h = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - - 
// Cache the runtime-field addresses on the first call; then publish the metadata-array pointers into the - // runtime struct. The stride field is written by the sizer on GPU and by this function on CPU, so we cache the - // address either way. - if (runtime_adstack_stride_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_metadata_field_ptrs", llvm_runtime_); - runtime_adstack_stride_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_offsets_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - runtime_adstack_max_sizes_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 2, result_buffer_cache_)); - } - copy_h2d(runtime_adstack_offsets_field_ptr_, &offsets_dev_ptr, sizeof(void *)); - copy_h2d(runtime_adstack_max_sizes_field_ptr_, &max_sizes_dev_ptr, sizeof(void *)); - - std::size_t stride = 0; - const bool is_gpu_llvm = (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu); - - // Host-eval fast path. The on-device sizer kernel exists to handle one specific leaf, `ExternalTensorRead`, - // whose ndarray data lives in GPU-private memory (`cudaMalloc` / `hipMalloc`, no UVA fallback) and thus - // cannot be touched from the host. Every other SizeExpr leaf - `Const`, `BoundVariable`, - // `ExternalTensorShape`, `FieldLoad` - is host-resolvable through the existing `evaluate_adstack_size_expr` - // path, so when the kernel's SizeExprs are all `ExternalTensorRead`-free we can skip the encode + bytecode - // h2d + sizer-kernel launch + d2h-stride pipeline entirely and write the metadata directly via `copy_h2d`. 
- // On CUDA the saved `cuMemcpyDtoH` for the per-launch stride readback is the dominant cost: every reverse- - // mode kernel launch in a 100-substep test paid one such synchronous DtoH each, and that compound stall - // accounted for the bulk of the GPU launch overhead under adstack mode. The condition is computed once per - // launch by scanning each stack's `nodes` vector for an `ExternalTensorRead` leaf; the scan is O(total - // SizeExpr nodes), well below the cost of the cheapest h2d / d2h on any LLVM GPU backend. - bool all_size_exprs_host_resolvable = true; - for (std::size_t i = 0; i < n_stacks && all_size_exprs_host_resolvable; ++i) { - if (i >= ad_stack.size_exprs.size()) { - continue; - } - for (const auto &node : ad_stack.size_exprs[i].nodes) { - if (static_cast(node.kind) == SizeExpr::Kind::ExternalTensorRead) { - all_size_exprs_host_resolvable = false; - break; - } - } - } - const bool use_host_eval = !is_gpu_llvm || all_size_exprs_host_resolvable; - if (use_host_eval) { - // CPU + GPU-without-ExternalTensorRead path: run the host evaluator directly. On CPU we use synchronous - // `copy_h2d` (just `std::memcpy` for that arch), but on CUDA / AMDGPU we ship the same payload through - // pinned-host memory via async `cuMemcpyHtoDAsync` / `hipMemcpyHtoDAsync` so the host returns immediately - // after queueing the copies on the default stream and the subsequent main-kernel launch (also on the - // default stream) stream-orders after the copies. The synchronous `cuMemcpyHtoD_v2` path used to block - // the host on every one of the three writes we issue per launch; with thousands of reverse-mode launches - // per `test_differentiable_rigid` run, those serial host stalls were a measurable fraction of wallclock. - // `FieldLoad` is serviced by `SNodeRwAccessorsBank` regardless of arch. 
- // Guard `program_impl_->program` lookups against the C++-only-tests setup where `program_impl_` itself is null; - // the on-device branch below already does this and falls back to `max_size_compile_time`. - Program *prog = (program_impl_ != nullptr) ? program_impl_->program : nullptr; - std::vector host_max_sizes(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - const SerializedSizeExpr *expr = (i < ad_stack.size_exprs.size()) ? &ad_stack.size_exprs[i] : nullptr; - int64_t v = -1; - if (expr != nullptr && !expr->nodes.empty() && prog != nullptr) { - v = evaluate_adstack_size_expr(*expr, prog, ctx); - } - if (v < 0) { - v = static_cast(ad_stack.allocas[i].max_size_compile_time); - } - host_max_sizes[i] = static_cast(std::max(v, 1)); - } - std::vector host_offsets(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - host_offsets[i] = stride; - stride += align_up_8(sizeof(int64_t) + ad_stack.allocas[i].entry_size_bytes * host_max_sizes[i]); - } - uint64_t stride_u64 = static_cast(stride); - if (!is_gpu_llvm) { - copy_h2d(offsets_dev_ptr, host_offsets.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(max_sizes_dev_ptr, host_max_sizes.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(runtime_adstack_stride_field_ptr_, &stride_u64, sizeof(uint64_t)); - } else { - // Three-block payload packed into the pinned-host scratch as `[stride_u64, offsets[n_stacks], - // max_sizes[n_stacks]]`. Three async DMAs land on the three target device addresses (the runtime - // struct's stride field, the offsets storage buffer, the max_sizes storage buffer) sourced from - // the corresponding offsets within the pinned scratch. The driver's H2D DMA engine reads from the - // pinned bytes at execution time, so we must not overwrite the scratch before all three copies - // have completed - hence the per-launch `event_record` after the last copy and the - // `event_synchronize` at the top of the next launch. 
The wait is typically a no-op because a few - // microseconds of small copies finish well before the host returns, dispatches the main kernel, - // and re-enters this function on the next launch. - const std::size_t header_bytes = sizeof(uint64_t); - const std::size_t array_bytes = n_stacks * sizeof(uint64_t); - const std::size_t total_bytes = header_bytes + 2 * array_bytes; - - auto wait_pending = [this]() { - if (!pinned_metadata_event_pending_) { - return; - } -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif - pinned_metadata_event_pending_ = false; - }; - - // Grow / first-allocate the pinned host scratch and the per-launch completion event. Doubling growth - // means the pinned alloc / free traffic is amortised to O(log peak_total_bytes) across a run. - if (total_bytes > pinned_metadata_scratch_capacity_) { - wait_pending(); - if (pinned_metadata_scratch_ != nullptr) { -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif - pinned_metadata_scratch_ = nullptr; - } - std::size_t new_capacity = std::max(total_bytes, 2 * pinned_metadata_scratch_capacity_); -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - // `hipHostMallocDefault == 0`. Coherent / portable / write-combined flags are intentionally not set; - // the workload is small payloads written linearly by the host and DMA-read by the GPU once. 
- AMDGPUDriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity, 0u); - } -#endif - pinned_metadata_scratch_capacity_ = new_capacity; - } - if (pinned_metadata_event_ == nullptr) { - // `cuEventCreate` flag `0` (CU_EVENT_DEFAULT) means timing-enabled, which the driver costs us nothing - // to set up here and lets future profilers attach without re-creating the event. `hipEventCreateWithFlags` - // takes the same encoding. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif - } - // Block until any in-flight copies from the previous launch have finished pulling from the pinned scratch - // before we overwrite it. In steady state this is a no-op because the small DMAs finish well before the - // host loops back here; the wait exists only to defend against an unusual interleaving where the GPU - // queue is backlogged and the next launch enters this function before the previous launch's last copy - // has been consumed. - wait_pending(); - - auto *pinned = static_cast(pinned_metadata_scratch_); - pinned[0] = stride_u64; - std::memcpy(pinned + 1, host_offsets.data(), array_bytes); - std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); - - // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. 
the default stream), so the copies match that. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - void *active_stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, - active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - CUDADriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. - AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, - header_bytes, active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, - active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - AMDGPUDriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif - pinned_metadata_event_pending_ = true; - } - } else { - // GPU (CUDA / AMDGPU): encode the SizeExpr trees into device bytecode, upload, launch the sizer runtime - // function, read back just the computed stride. The sizer kernel writes `adstack_max_sizes[]`, - // `adstack_offsets[]`, and `adstack_per_thread_stride` directly into the runtime struct and the metadata - // arrays above - no further host-writes to those fields are needed this launch. - // - // Why this architecture rather than host-eval: on CUDA / AMDGPU the ndarray data lives in GPU-private memory - // (plain `cudaMalloc` / `hipMalloc`, not managed / unified), so the host evaluator's `ExternalTensorRead` - // deref reads garbage. 
Moving the interpreter on-device keeps the pointer semantics intact - it reads the - // data pointer out of `ctx->arg_buffer` (which the kernel will read too) and dereferences it where the - // memory lives, with no migration / readback of the ndarray payload itself. - std::vector bytecode; - if (program_impl_ != nullptr && program_impl_->program != nullptr) { - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, program_impl_->program, ctx); - } else { - // No program attached (rare: C++-only tests that construct Program without a full runtime). Fall through - // to compile-time bounds by emitting an empty-tree bytecode - the device interpreter sees - // `root_node_idx == -1` for every stack and routes to `max_size_compile_time`. - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, nullptr, ctx); - } - // Grow the scratch buffer if the bytecode outgrew the cached capacity. Amortised doubling keeps the - // allocation traffic O(log max_bytecode_bytes) across a run. - const std::size_t bytecode_bytes = bytecode.size(); - if (bytecode_bytes > adstack_sizer_bytecode_capacity_) { - std::size_t new_cap = std::max(bytecode_bytes, 2 * adstack_sizer_bytecode_capacity_); - Device::AllocParams params{}; - params.size = new_cap; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack sizer bytecode scratch buffer (err: {})", params.size, - int(res)); - adstack_sizer_bytecode_alloc_ = std::make_unique(std::move(new_alloc)); - adstack_sizer_bytecode_capacity_ = new_cap; - } - void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); - copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - - // Invoke the device interpreter. 
On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. - // - // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves - // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of - // `RuntimeContext` because HIP has no UVA fallback and the host pointer faults with `hipErrorIllegalAddress`. CUDA - // stages the device copy only when the driver + kernel do not expose HMM / system-allocated memory (queried via - // `CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`): CUDA UVA covers pinned / CUDA-managed memory only, not the plain - // `std::make_unique()` backing, so a host pointer works on HMM-capable setups but faults otherwise - // (Turing without HMM, Windows, pre-535 Linux drivers) as `CUDA_ERROR_ILLEGAL_ADDRESS` at the next DtoH sync - // `illegal memory access ... while calling memcpy_device_to_host`. When the caller passes `nullptr` (HMM-capable - // CUDA) we fall back to the host pointer; the launcher gates the allocation so HMM-equipped setups pay no staging - // cost. - auto *const runtime_jit = get_runtime_jit_module(); - void *runtime_context_ptr_for_sizer = - device_runtime_context_ptr != nullptr ? 
device_runtime_context_ptr : static_cast(&ctx->get_context()); - runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, - runtime_context_ptr_for_sizer, bytecode_dev_ptr); - - uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); - stride = static_cast(stride_u64); - } - - std::size_t needed_bytes = stride * num_threads; - ensure_adstack_heap(needed_bytes); - return needed_bytes; -} - -void LlvmRuntimeExecutor::ensure_adstack_heap(std::size_t needed_bytes) { - if (needed_bytes == 0 || needed_bytes <= adstack_heap_size_) { - return; - } - // Amortized doubling keeps the number of re-allocations across a run bounded by log(peak_size). - std::size_t new_size = std::max(needed_bytes, std::size_t(2) * adstack_heap_size_); - - Device::AllocParams params{}; - params.size = new_size; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack heap (err: {}). Consider lowering `ad_stack_size` or the " - "per-kernel reverse-mode adstack count.", - new_size, int(res)); - // `get_device_alloc_info_ptr` is the RHI-agnostic accessor that returns the raw host-visible - // pointer on CPU and the device-visible pointer on CUDA / AMDGPU (`get_memory_addr` is only - // implemented on the GPU devices, so we route through this helper instead). - void *new_ptr = get_device_alloc_info_ptr(new_alloc); - - auto new_guard = std::make_unique(std::move(new_alloc)); - - // Publish the new buffer pointer and size into the runtime struct. On CPU the runtime lives in host memory, - // so plain stores through the cached field pointers are correct. On CUDA / AMDGPU the runtime lives in device - // memory, so the host writes via the driver's host->device memcpy. 
The field-address query runs exactly once, - // on the first grow, and caches the two device pointers; every subsequent grow is just two 8-byte memcpys. - if (runtime_adstack_heap_buffer_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_heap_field_ptrs", llvm_runtime_); - runtime_adstack_heap_buffer_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_heap_size_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - } - uint64 size_u64 = static_cast(new_size); - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, sizeof(void *)); - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, - sizeof(void *)); - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - *reinterpret_cast(runtime_adstack_heap_buffer_field_ptr_) = new_ptr; - *reinterpret_cast(runtime_adstack_heap_size_field_ptr_) = size_u64; - } - - // Replace and release the old allocation. `DeviceAllocationGuard`'s destructor calls - // `llvm_device()->dealloc_memory`. The new slab has already been handed to `new_guard` above, so the move-assignment - // here is what destroys the *previous* guard - the new allocation is not the one being freed. Safety of the release - // depends on the backend: - // - CPU: host `std::free`. No GPU involved, always safe. 
- // - CUDA: `CudaDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=true)` -> - // `cuMemFree_v2`, which synchronizes with pending device work before returning. - // - AMDGPU: `AmdgpuDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=false)` -> - // `CachingAllocator::release`, which pools the allocation *without* calling `hipFree` and *without* - // synchronizing. The physical memory stays mapped, so an in-flight kernel still holding the old base pointer - // keeps reading/writing valid storage. The cross-launch safety invariant for AMDGPU comes from - // `amdgpu::KernelLauncher::launch_llvm_kernel` ending with `hipFree(context_pointer)`, which synchronizes - // with all in-flight kernels launched during that call. By the time the *next* `launch_llvm_kernel` reaches - // `ensure_adstack_heap` and can destroy the previous guard, no GPU kernel from the prior call is still - // referencing the old slab. CUDA does not need this extra hop -- the `cuMemFree_v2` in the bullet above - // already syncs -- and the CUDA launcher correspondingly does not allocate a device-side `context_pointer` - // (it passes the `RuntimeContext` by host reference). 
- adstack_heap_alloc_ = std::move(new_guard); - adstack_heap_size_ = new_size; -} - void LlvmRuntimeExecutor::preallocate_runtime_memory() { if (preallocated_runtime_memory_allocs_ != nullptr) return; From f6fee4fbd2bcf3040b9edae6970294ed9daca671 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 02:21:09 -0700 Subject: [PATCH 079/109] Add test for qd_stream + autodiff kernel error guard Co-authored-by: Cursor --- tests/python/test_streams.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 7f03703dac..db7588aaf7 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -258,6 +258,21 @@ def compute(): s.destroy() +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_autodiff_kernel_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autodiff"): + compute.grad(qd_stream=s) + s.destroy() + + @test_utils.test(arch=[qd.cuda]) def test_stream_with_graph_raises(): N = 64 From 6e49c52d13f426dcac3c14b5b839059db2cb5839 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 03:12:15 -0700 Subject: [PATCH 080/109] Restore context_pointer free comment in AMDGPU kernel launcher The comment explains a non-obvious race condition: context_pointer must be freed directly (now via mem_free_async on active_stream) rather than through AMDGPUContext's deferred free list, because that list is drained by LlvmRuntimeExecutor::synchronize which can be called mid-launch. 
Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index ab34003cbd..42db3934dd 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -288,6 +288,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } else if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); } + // Free the per-launch `RuntimeContext` on the active stream rather than through `AMDGPUContext`'s deferred free + // list. The deferred list is drained by `LlvmRuntimeExecutor::synchronize`, which is also called from + // `fetch_result_uint64` during `ensure_adstack_heap`'s field-pointer query -- that path would free + // `context_pointer` mid-launch, and HIP could recycle the address for the adstack heap allocated right after, + // clobbering the `RuntimeContext` the next task still reads from. AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); } From 9fd8b7b9d718948f09f1c4335bd7127946f20d16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 04:55:19 -0700 Subject: [PATCH 081/109] Extract stream/event methods from program.cpp into program_stream.cpp Move the 9 CUDA-only stream/event Program methods into a dedicated translation unit. The CMake glob on quadrants/program/* picks up the new file automatically. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 94 ------------------------ quadrants/program/program_stream.cpp | 103 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 94 deletions(-) create mode 100644 quadrants/program/program_stream.cpp diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 5abcd255b3..8f6fdb2186 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,11 +20,6 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif -#ifdef QD_WITH_CUDA -#include "quadrants/rhi/cuda/cuda_driver.h" -#include "quadrants/rhi/cuda/cuda_context.h" -#endif - #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -494,93 +489,4 @@ void Program::enqueue_compute_op_lambda(std::functionenqueue_compute_op_lambda(op, image_refs); } -uint64 Program::stream_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); - return reinterpret_cast(stream); - } -#endif - return 0; -} - -void Program::stream_destroy(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::stream_synchronize(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::set_current_cuda_stream(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - 
CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); - } -#endif -} - -uint64 Program::event_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *event = nullptr; - CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); - return reinterpret_cast(event); - } -#endif - return 0; -} - -void Program::event_destroy(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::event_record(uint64 event_handle, uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::event_synchronize(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); - } -#endif -} - } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp new file mode 100644 index 0000000000..b20252ddbc --- /dev/null +++ b/quadrants/program/program_stream.cpp @@ -0,0 +1,103 @@ +// Stream and event operations for the Program class. 
+// Extracted from program.cpp to keep backend-specific GPU stream/event +// lifecycle code separate from the core Program logic. + +#include "program.h" + +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + +namespace quadrants::lang { + +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + 
CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + +} // namespace quadrants::lang From 9e6f865cfb29b78e5c99705b84e3a6a1bc80bc86 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:26:10 -0700 Subject: [PATCH 082/109] Introduce StreamManager delegate class for stream/event ops Move the CUDA stream/event logic into a StreamManager class (program_stream.h/.cpp). Program keeps its public API unchanged and delegates to stream_manager_ internally, so the pybind layer and Python code need no changes. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 1 + quadrants/program/program.h | 2 + quadrants/program/program_stream.cpp | 77 ++++++++++++++++++++-------- quadrants/program/program_stream.h | 31 +++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) create mode 100644 quadrants/program/program_stream.h diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 8f6fdb2186..ff9901add5 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -60,6 +60,7 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { config = default_compile_config; config.arch = desired_arch; config.fit(); + stream_manager_ = StreamManager(config.arch); profiler = make_profiler(config.arch, config.kernel_profiler); if (arch_uses_llvm(config.arch)) { diff --git a/quadrants/program/program.h b/quadrants/program/program.h index fe2f30ca74..7fb6019026 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -21,6 +21,7 @@ #include "quadrants/program/kernel_profiler.h" #include "quadrants/program/snode_expr_utils.h" #include "quadrants/program/snode_rw_accessors_bank.h" +#include "quadrants/program/program_stream.h" #include "quadrants/program/context.h" #include "quadrants/struct/snode_tree.h" #include "quadrants/system/threading.h" @@ -338,6 +339,7 @@ class QD_DLL_EXPORT Program { private: CompileConfig compile_config_; + StreamManager stream_manager_{Arch::x64}; // re-initialized in constructor after arch is known uint64 ndarray_writer_counter_{0}; uint64 ndarray_reader_counter_{0}; diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b20252ddbc..442e0cfa8d 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ -// Stream and event operations for the Program class. -// Extracted from program.cpp to keep backend-specific GPU stream/event -// lifecycle code separate from the core Program logic. 
+// StreamManager implementation and Program delegation. +#include "program_stream.h" #include "program.h" #ifdef QD_WITH_CUDA @@ -11,9 +10,13 @@ namespace quadrants::lang { -uint64 Program::stream_create() { +// --------------------------------------------------------------------------- +// StreamManager +// --------------------------------------------------------------------------- + +uint64 StreamManager::create_stream() { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *stream = nullptr; CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); @@ -23,36 +26,36 @@ uint64 Program::stream_create() { return 0; } -void Program::stream_destroy(uint64 stream_handle) { +void StreamManager::destroy_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } -void Program::stream_synchronize(uint64 stream_handle) { +void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } -void Program::set_current_cuda_stream(uint64 stream_handle) { +void StreamManager::set_current_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } -uint64 Program::event_create() { +uint64 StreamManager::create_event() { #ifdef QD_WITH_CUDA - if (compile_config().arch == 
Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); @@ -62,18 +65,18 @@ uint64 Program::event_create() { return 0; } -void Program::event_destroy(uint64 event_handle) { +void StreamManager::destroy_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } -void Program::event_record(uint64 event_handle, uint64 stream_handle) { +void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -81,18 +84,18 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif } -void Program::event_synchronize(uint64 event_handle) { +void StreamManager::synchronize_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), 
reinterpret_cast(event_handle), 0 /*flags*/); @@ -100,4 +103,36 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif } +// --------------------------------------------------------------------------- +// Program delegation — keeps the pybind / Python API unchanged. +// --------------------------------------------------------------------------- + +uint64 Program::stream_create() { + return stream_manager_.create_stream(); +} +void Program::stream_destroy(uint64 h) { + stream_manager_.destroy_stream(h); +} +void Program::stream_synchronize(uint64 h) { + stream_manager_.synchronize_stream(h); +} +void Program::set_current_cuda_stream(uint64 h) { + stream_manager_.set_current_stream(h); +} +uint64 Program::event_create() { + return stream_manager_.create_event(); +} +void Program::event_destroy(uint64 h) { + stream_manager_.destroy_event(h); +} +void Program::event_record(uint64 eh, uint64 sh) { + stream_manager_.record_event(eh, sh); +} +void Program::event_synchronize(uint64 h) { + stream_manager_.synchronize_event(h); +} +void Program::stream_wait_event(uint64 sh, uint64 eh) { + stream_manager_.stream_wait_event(sh, eh); +} + } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h new file mode 100644 index 0000000000..ae6b7221d5 --- /dev/null +++ b/quadrants/program/program_stream.h @@ -0,0 +1,31 @@ +// StreamManager — manages CUDA stream and event lifecycle. +// Isolated from Program so that backend-specific GPU plumbing +// does not pollute the core Program interface. 
+ +#pragma once + +#include "quadrants/common/core.h" +#include "quadrants/util/lang_util.h" + +namespace quadrants::lang { + +class StreamManager { + public: + explicit StreamManager(Arch arch) : arch_(arch) {} + + uint64 create_stream(); + void destroy_stream(uint64 stream_handle); + void synchronize_stream(uint64 stream_handle); + void set_current_stream(uint64 stream_handle); + + uint64 create_event(); + void destroy_event(uint64 event_handle); + void record_event(uint64 event_handle, uint64 stream_handle); + void synchronize_event(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + + private: + Arch arch_; +}; + +} // namespace quadrants::lang From 1c81322cbe0e418a6deaa765d877a505d29ced16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:29:18 -0700 Subject: [PATCH 083/109] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From 84ba5b05b7d9d502eccace8f52e88ea9df0ccbc6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:34:12 -0700 Subject: [PATCH 084/109] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace 
quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From b1b4ee60b298aa3e7ea93903c3895dd5a59cf155 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:36:58 -0700 Subject: [PATCH 085/109] Remove Program wrapper methods, bind StreamManager directly via pybind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add stream_manager() accessor to Program. Update export_lang.cpp to call StreamManager methods through lambdas. Delete the 9 one-line delegation methods from Program — the declarations in program.h and definitions in program_stream.cpp are both gone. Co-authored-by: Cursor --- quadrants/program/program.h | 12 +++------- quadrants/program/program_stream.cpp | 33 ---------------------------- quadrants/python/export_lang.cpp | 18 +++++++-------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 7fb6019026..600533f1cf 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -320,15 +320,9 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } - uint64 stream_create(); - void stream_destroy(uint64 stream_handle); - void stream_synchronize(uint64 stream_handle); - void set_current_cuda_stream(uint64 stream_handle); - uint64 event_create(); - void event_destroy(uint64 event_handle); - void event_record(uint64 event_handle, uint64 stream_handle); - void event_synchronize(uint64 event_handle); - void stream_wait_event(uint64 stream_handle, uint64 event_handle); + StreamManager &stream_manager() { + return stream_manager_; + } // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all diff --git a/quadrants/program/program_stream.cpp 
b/quadrants/program/program_stream.cpp index 442e0cfa8d..b1c2429dd6 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ // StreamManager implementation and Program delegation. #include "program_stream.h" -#include "program.h" #ifdef QD_WITH_CUDA #include "quadrants/rhi/cuda/cuda_driver.h" @@ -103,36 +102,4 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) #endif } -// --------------------------------------------------------------------------- -// Program delegation — keeps the pybind / Python API unchanged. -// --------------------------------------------------------------------------- - -uint64 Program::stream_create() { - return stream_manager_.create_stream(); -} -void Program::stream_destroy(uint64 h) { - stream_manager_.destroy_stream(h); -} -void Program::stream_synchronize(uint64 h) { - stream_manager_.synchronize_stream(h); -} -void Program::set_current_cuda_stream(uint64 h) { - stream_manager_.set_current_stream(h); -} -uint64 Program::event_create() { - return stream_manager_.create_event(); -} -void Program::event_destroy(uint64 h) { - stream_manager_.destroy_event(h); -} -void Program::event_record(uint64 eh, uint64 sh) { - stream_manager_.record_event(eh, sh); -} -void Program::event_synchronize(uint64 h) { - stream_manager_.synchronize_event(h); -} -void Program::stream_wait_event(uint64 sh, uint64 eh) { - stream_manager_.stream_wait_event(sh, eh); -} - } // namespace quadrants::lang diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 8cfdd78b5a..c46d40ac10 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -406,15 +406,15 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", &Program::stream_create) - 
.def("stream_destroy", &Program::stream_destroy) - .def("stream_synchronize", &Program::stream_synchronize) - .def("set_current_cuda_stream", &Program::set_current_cuda_stream) - .def("event_create", &Program::event_create) - .def("event_destroy", &Program::event_destroy) - .def("event_record", &Program::event_record) - .def("event_synchronize", &Program::event_synchronize) - .def("stream_wait_event", &Program::stream_wait_event) + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) From d3317f5cf00e4955095edefdeab68227426243c5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 06:01:11 -0700 Subject: [PATCH 086/109] Fix AMDGPU branches in StreamManager: use arch_ member instead of compile_config() The base branch refactored stream/event methods from Program:: to StreamManager::, which stores the arch in arch_. 
Our AMDGPU branches still referenced compile_config().arch which is a Program method. Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 88288cc313..b4adc0226a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -28,7 +28,7 @@ uint64 StreamManager::create_stream() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); @@ -46,7 +46,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } @@ -76,7 +76,7 @@ void StreamManager::set_current_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } @@ -93,7 +93,7 @@ uint64 StreamManager::create_event() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { 
AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); @@ -111,7 +111,7 @@ void StreamManager::destroy_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } @@ -127,7 +127,7 @@ void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -143,7 +143,7 @@ void StreamManager::synchronize_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } @@ -159,7 +159,7 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); From 7e102672eab2ff2713c26cd90445566b81d57a53 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:18:07 -0700 Subject: [PATCH 087/109] Reflow comment in program_stream.h to 120-char width Co-authored-by: Cursor --- quadrants/program/program_stream.h | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index 54a8e88d0b..69265c26b3 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -1,6 +1,5 @@ -// StreamManager — manages CUDA stream and event lifecycle. -// Isolated from Program so that backend-specific GPU plumbing -// does not pollute the core Program interface. +// StreamManager — manages CUDA stream and event lifecycle, isolated from Program so that backend-specific GPU +// plumbing does not pollute the core Program interface. #pragma once From 614c742cd9cfb0195ae32dedee09d4d7fd374949 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:19:08 -0700 Subject: [PATCH 088/109] Use captured prog_ref for all Stream/Event operations All methods on Stream and Event now resolve the Program through the captured weakref first, falling back to the current runtime only for externally-wrapped handles. Fixes a bug where destroy/synchronize/ record/wait would call into the wrong Program after qd.reset(). Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 063a2aeafc..85e7c1e86b 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -22,10 +22,16 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def synchronize(self): """Block until all operations on this stream complete.""" - prog = impl.get_runtime().prog - prog.stream_synchronize(self._handle) + self._prog().stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. 
@@ -33,8 +39,7 @@ def destroy(self): No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.stream_destroy(self._handle) + self._prog().stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -69,22 +74,26 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.event_record(self._handle, stream_handle) + self._prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.stream_wait_event(stream_handle, self._handle) + self._prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - prog = impl.get_runtime().prog - prog.event_synchronize(self._handle) + self._prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. @@ -92,8 +101,7 @@ def destroy(self): No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.event_destroy(self._handle) + self._prog().event_destroy(self._handle) self._handle = 0 def __del__(self): From 3dad35ad4ad58bd92d034e0eb01a65a92705c897 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:31:35 -0700 Subject: [PATCH 089/109] Fix stale handle safety in Stream/Event after qd.reset() When _prog_ref is set but the weakref has expired (Program destroyed), _prog() now returns None instead of falling back to the current runtime. Active operations (synchronize, record, wait) raise RuntimeError; destroy silently no-ops and zeroes the handle. Also allow synchronize_stream(0) to sync the default stream in CUDA, matching cuStreamSynchronize(nullptr) semantics. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 41 ++++++++++++++++++---------- quadrants/program/program_stream.cpp | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 85e7c1e86b..5898cb434e 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -23,23 +23,28 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog def synchronize(self): """Block until all operations on this stream complete.""" - self._prog().stream_synchronize(self._handle) + prog = self._prog() + if prog is None: + raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") + prog.stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. - No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). 
+ No-op if the owning Program has already been collected, or for streams wrapping external handles + (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - self._prog().stream_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -75,33 +80,41 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog + def _require_prog(self): + prog = self._prog() + if prog is None: + raise RuntimeError("Event's owning Program has been destroyed (e.g. after qd.reset())") + return prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().event_record(self._handle, stream_handle) + self._require_prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().stream_wait_event(stream_handle, self._handle) + self._require_prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - self._prog().event_synchronize(self._handle) + self._require_prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). + No-op if the owning Program has already been collected, or for events wrapping external handles + (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - self._prog().event_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.event_destroy(self._handle) self._handle = 0 def __del__(self): diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b1c2429dd6..8a7431532a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -36,7 +36,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (arch_ == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } From bebc9040869cdbcdcf8094b80fb4c849f28f16ce Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:14:23 -0700 Subject: [PATCH 090/109] Extract stream/event pybind bindings into export_stream.cpp Move the 9 stream/event .def() bindings from export_lang.cpp into a new export_stream.cpp, following the existing export_math/export_misc pattern. Satisfies the feature-factorization check for the 1225-line export_lang.cpp. 
Co-authored-by: Cursor --- quadrants/python/export.h | 6 ++++++ quadrants/python/export_lang.cpp | 14 +++----------- quadrants/python/export_stream.cpp | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 quadrants/python/export_stream.cpp diff --git a/quadrants/python/export.h b/quadrants/python/export.h index 331c35b4b6..92736daedf 100644 --- a/quadrants/python/export.h +++ b/quadrants/python/export.h @@ -21,6 +21,10 @@ #include "quadrants/common/core.h" +namespace quadrants::lang { +class Program; +} // namespace quadrants::lang + namespace quadrants { namespace py = pybind11; @@ -33,4 +37,6 @@ void export_math(py::module &m); void export_misc(py::module &m); +void export_stream(py::module &m, py::class_ &program_class); + } // namespace quadrants diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index c46d40ac10..b3dc79bef5 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -314,8 +314,8 @@ void export_lang(py::module &m) { auto compiled_kernel_data = py::class_(m, "CompiledKernelData") .def("_debug_dump_to_string", &CompiledKernelData::debug_dump_to_string); - py::class_(m, "Program") - .def(py::init<>()) + auto program_class = py::class_(m, "Program"); + program_class.def(py::init<>()) .def( "ndarray_to_dlpack", [](Program *program, pybind11::object owner, Ndarray *ndarray, const std::vector &layout, @@ -406,20 +406,12 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) - .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) - .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) - 
.def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) - .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) - .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) - .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) - .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) - .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) .def("get_graph_num_nodes_on_last_call", &Program::get_graph_num_nodes_on_last_call) .def("get_graph_total_builds", &Program::get_graph_total_builds); + export_stream(m, program_class); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp new file mode 100644 index 0000000000..f3f2fad525 --- /dev/null +++ b/quadrants/python/export_stream.cpp @@ -0,0 +1,26 @@ +/******************************************************************************* + Copyright (c) The Quadrants Authors (2016- ). All Rights Reserved. + The use of this software is governed by the LICENSE file. 
+*******************************************************************************/ + +#include "quadrants/python/export.h" +#include "quadrants/program/program.h" + +namespace quadrants { + +void export_stream(py::module &m, py::class_ &program_class) { + using lang::Program; + program_class + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", + [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }); +} + +} // namespace quadrants From b4450f7c1837e3fb603ddf267fb0a01a8f781154 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:19:23 -0700 Subject: [PATCH 091/109] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- a/quadrants/python/export_stream.cpp +++ b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", 
[](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From b6cd986a9f319a9d5d9c9c1dd5bce239feb1af97 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:21:49 -0700 Subject: [PATCH 092/109] Fix clang-format line break in CUDA kernel launcher Co-authored-by: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a97ba400d5..9558c57d66 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -141,8 +141,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = prepare_task(j, t); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {&ctx.get_context()}, {}); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); } for (auto &[sid, s] : stream_by_id) { From 3b09331daf736eb85d220bdd6760dd5e5e553bd2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:24:59 -0700 Subject: [PATCH 093/109] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- a/quadrants/python/export_stream.cpp +++ 
b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From af4a30615f6e625fd47ef4cb30cb2259993e5df4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:29:18 -0700 Subject: [PATCH 094/109] Skip coverage probes in stream_parallel exclusivity check; restore deleted comments The Linux build CI runs with QD_KERNEL_COVERAGE=1, which injects _qd_cov[probe_id] = 1 Assign nodes before each statement in the kernel body. _validate_stream_parallel_exclusivity was rejecting these probes as non-stream_parallel statements. Add _is_coverage_probe() to skip them. Also restores the 4 safety comments in CUDA kernel_launcher.cpp's prepare_task lambda that were flagged by the deleted-comments check, fixes clang-format line break, and reflows the symbol_resolver.py docstring to 120 characters. 
Co-authored-by: Cursor --- .../function_def_transformer.py | 13 ++++++++++ python/quadrants/lang/ast/symbol_resolver.py | 4 +-- quadrants/runtime/cuda/kernel_launcher.cpp | 25 +++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 123767be55..142694091f 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -535,6 +535,17 @@ def _is_docstring(stmt: ast.stmt, index: int) -> bool: and isinstance(stmt.value, (ast.Constant, ast.Str)) ) + @staticmethod + def _is_coverage_probe(stmt: ast.stmt) -> bool: + if not isinstance(stmt, ast.Assign) or len(stmt.targets) != 1: + return False + target = stmt.targets[0] + return ( + isinstance(target, ast.Subscript) + and isinstance(target.value, ast.Name) + and target.value.id.startswith("_qd_cov") + ) + @staticmethod def _validate_stream_parallel_exclusivity( body: list[ast.stmt], global_vars: dict[str, Any] @@ -547,6 +558,8 @@ def _validate_stream_parallel_exclusivity( for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): continue + if FunctionDefTransformer._is_coverage_probe(stmt): + continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: diff --git a/python/quadrants/lang/ast/symbol_resolver.py b/python/quadrants/lang/ast/symbol_resolver.py index f95373a463..c2b4fcaffe 100644 --- a/python/quadrants/lang/ast/symbol_resolver.py +++ b/python/quadrants/lang/ast/symbol_resolver.py @@ -60,8 +60,8 @@ def resolve_to(node, wanted, scope): def resolve_value(node, scope): """Resolve an AST Name/Attribute node to a Python object. 
- Same traversal as resolve_to but returns the resolved object (or None) - instead of comparing against a wanted value. + Same traversal as resolve_to but returns the resolved object (or None) instead of comparing against a wanted + value. """ if isinstance(node, ast.Name): return scope.get(node.id) if isinstance(scope, dict) else None diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 658b2089d9..b08af6733e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -81,8 +81,19 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = task.grid_dim; if (!task.ad_stack.allocas.empty()) { std::size_t n = resolve_num_threads(task.ad_stack, executor); + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without it the sizer + // launches with a host pointer and the next DtoH sync trips `CUDA_ERROR_ILLEGAL_ADDRESS ... + // memcpy_device_to_host` on GPUs whose driver + kernel cannot coherently access pageable host memory (the HMM + // capability gated below in `launch_llvm_kernel`). `nullptr` on HMM-capable setups keeps + // `publish_adstack_metadata`'s host-pointer fast path. executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.ad_stack.bound_expr.has_value()) { + // Reducer length is the gating ndarray's full flat element count, not `n`: the lazy row-claim atomic-rmw + // fires once per LCA execution, and `gpu_parallel_struct_for` / `gpu_parallel_range_for` grid-stride (`i += + // grid_dim()`) so a single dispatched thread can hit the LCA many times across one launch when the logical + // loop span exceeds the (capped) concurrent thread count. Walking the reducer over the full ndarray length + // keeps `bound_row_capacities[task_index]` consistent with the total claim count, which the codegen-emitted + // bounds clamp reads. 
Mirrors the CPU launcher's `bound_count_length` derivation. std::size_t bound_count_length = n; if (task.ad_stack.bound_expr->field_source_kind == StaticAdStackBoundExpr::FieldSourceKind::NdArray && !task.ad_stack.bound_expr->ndarray_arg_id.empty() && task.ad_stack.bound_expr->ndarray_ndim > 0 && @@ -92,6 +103,11 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, std::vector indices = task.ad_stack.bound_expr->ndarray_arg_id; indices.push_back(TypeFactory::SHAPE_POS_IN_NDARRAY); indices.push_back(axis); + // get_struct_arg_host (NOT get_struct_arg): `launch_llvm_kernel` above has already swapped + // `ctx_->arg_buffer` to a device pointer, so a plain `get_struct_arg` here would dereference device + // memory from the host - SIGSEGV / CUDA_ERROR_ILLEGAL_ADDRESS on drivers without HMM, garbage + // `flat_len` on HMM-capable setups. The host backing buffer (`arg_buffer_`) stays host-resident across + // the swap and holds the same shape entries, so the host-safe variant is byte-equivalent here. flat_len *= int64_t(ctx.get_struct_arg_host(indices)); } bound_count_length = static_cast(std::max(0, flat_len)); @@ -100,6 +116,11 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, device_context_ptr); executor->ensure_per_task_float_heap_post_reducer(task_index, task.ad_stack, n); } + // Floor division (not ceiling): the heap-row count `n` resolved by `resolve_num_threads` floors at + // `kAdStackMaxConcurrentThreads`, so dispatching `cap_blocks * block_dim` threads must not exceed that count. + // Ceiling division would over-dispatch by `block_dim - 1` threads when `block_dim` does not divide + // `kAdStackMaxConcurrentThreads` evenly (e.g. `block_dim=192`: `ceil(65536/192)*192 = 65664`), and threads + // with `linear_thread_idx >= 65536` would index past the heap end. 
if (task.block_dim > 0) { const std::size_t cap_blocks = std::max(1u, kAdStackMaxConcurrentThreads / static_cast(task.block_dim)); @@ -143,8 +164,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = prepare_task(j, t); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {&ctx.get_context()}, {}); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); } for (auto &[sid, s] : stream_by_id) { From e8d9cf0413588ddfd1c51967407d53d8c657136e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:18:08 -0700 Subject: [PATCH 095/109] Allow synchronizing the default AMDGPU stream (handle 0) The stream_handle != 0 guard made synchronize_stream a no-op for the default stream on AMDGPU, unlike the CUDA path. HIP supports hipStreamSynchronize(nullptr), so remove the guard to match CUDA semantics. 
Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 31fb12e76d..9686a86332 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (arch_ == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } From 48c3922acac9a7959cda3fbec90aaa4cdbabbb1a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:20:34 -0700 Subject: [PATCH 096/109] Fall back to current runtime for Stream/Event destroy after reset When the owning Program has been collected (e.g. after qd.reset()), destroy() and __del__ now fall back to the current runtime's Program to free the underlying CUDA resource. This is safe because CUDAContext is a singleton, so stream/event handles remain valid across Programs. Prevents resource leaks in create/reset cycles. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 42 ++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5898cb434e..6187b6f9c4 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -35,21 +35,34 @@ def synchronize(self): raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") prog.stream_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA stream handle remains valid. 
+ """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. - No-op if the owning Program has already been collected, or for streams wrapping external handles - (created via Stream(ptr) without a prog_ref). + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.stream_destroy(self._handle) @@ -105,21 +118,34 @@ def synchronize(self): """Block the host until this event has been reached.""" self._require_prog().event_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA event handle remains valid. + """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op if the owning Program has already been collected, or for events wrapping external handles - (created via Event(ptr) without a prog_ref). + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.event_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.event_destroy(self._handle) From 44ee707afa655e728ebf452d0e4102de3e75da7f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 15:04:03 -0700 Subject: [PATCH 097/109] Reflow _destroy_prog docstrings to 120-char width Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 6187b6f9c4..e87816568c 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -38,8 +38,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA stream handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA stream handle remains valid. """ prog = self._prog() if prog is None: @@ -121,8 +121,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA event handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA event handle remains valid. 
""" prog = self._prog() if prog is None: From ac4b825074b40d65ee8dc367510d7d3d6557c59b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 3 May 2026 02:53:16 -0700 Subject: [PATCH 098/109] Guard stream-parallel cleanup with exception safety Wrap the launch+synchronize section in try/catch so that acquired streams are returned to the pool and active_stream is restored even when a launch or stream_synchronize throws. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 28 +++++++++++++------- quadrants/runtime/cuda/kernel_launcher.cpp | 28 +++++++++++++------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 96ac406902..8cdd9fa3c1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -141,17 +141,25 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, } } - for (size_t j = group_start; j < i; j++) { - const auto &t = offloaded_tasks[j]; - int effective_grid_dim = prepare_task(j, t); - AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - amdgpu_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); - } + try { + for (size_t j = group_start; j < i; j++) { + const auto &t = offloaded_tasks[j]; + int effective_grid_dim = prepare_task(j, t); + AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); + QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); + amdgpu_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } - for (auto &[sid, s] : stream_by_id) { - AMDGPUDriver::get_instance().stream_synchronize(s); + for (auto &[sid, s] : 
stream_by_id) { + AMDGPUDriver::get_instance().stream_synchronize(s); + } + } catch (...) { + for (auto &[sid, s] : stream_by_id) { + AMDGPUContext::get_instance().release_stream(s); + } + AMDGPUContext::get_instance().set_stream(active_stream); + throw; } for (auto &[sid, s] : stream_by_id) { AMDGPUContext::get_instance().release_stream(s); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 0a0d6faafc..17a04067a4 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -160,17 +160,25 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, } } - for (size_t j = group_start; j < i; j++) { - const auto &t = offloaded_tasks[j]; - int effective_grid_dim = prepare_task(j, t); - CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); - } + try { + for (size_t j = group_start; j < i; j++) { + const auto &t = offloaded_tasks[j]; + int effective_grid_dim = prepare_task(j, t); + CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); + QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {&ctx.get_context()}, {}); + } - for (auto &[sid, s] : stream_by_id) { - CUDADriver::get_instance().stream_synchronize(s); + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_synchronize(s); + } + } catch (...) 
{ + for (auto &[sid, s] : stream_by_id) { + CUDAContext::get_instance().release_stream(s); + } + CUDAContext::get_instance().set_stream(active_stream); + throw; } for (auto &[sid, s] : stream_by_id) { CUDAContext::get_instance().release_stream(s); From 65d5cb92d0d8fa8cd82bf7b26eb7744e76d40c98 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 3 May 2026 03:37:40 -0700 Subject: [PATCH 099/109] Restore explanatory comments removed during stream-parallel refactor The prepare_task lambda extraction dropped several non-obvious comments explaining adstack gate roles, lazy-claim buffer rationale, device-side reducer mechanics, shape-entry unit-stability, and grid-dim capping rationale. Restore them. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 +++ quadrants/runtime/cuda/kernel_launcher.cpp | 24 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 8cdd9fa3c1..e74152927b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -61,6 +61,9 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const bool any_lazy_task = std::any_of(offloaded_tasks.begin(), offloaded_tasks.end(), [](const OffloadedTask &t) { return t.ad_stack.bound_expr.has_value(); }); if (any_lazy_task) { + // Allocate / reset the per-kernel lazy-claim arrays once before the first task. See the matching CPU launcher + // block for rationale; on AMDGPU the same memcpy_host_to_device path through the cached field pointers publishes + // the cleared counter and UINT32_MAX-defaulted capacity arrays. 
executor->publish_adstack_lazy_claim_buffers(offloaded_tasks.size()); } diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 17a04067a4..4653ddb55a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -70,9 +70,18 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // Two gates govern the per-launch adstack publish work, both opt-in by the kernel's IR shape. Forward-only kernels // skip both gates and pay zero adstack overhead; reverse-mode kernels without a captured `bound_expr` skip the // lazy-claim block, paying the per-task `publish_adstack_metadata` only. + // - `any_adstack`: at least one task has an `AdStackAllocaStmt`. Gates the per-task `publish_adstack_metadata` + // call (sets per-thread stride for the codegen heap-base addressing). + // - `any_lazy_task`: at least one task has a captured `bound_expr` (the codegen routes such tasks through the + // lazy LCA-block atomic-rmw row claim, which reads `runtime->adstack_row_counters[task_id]` and + // `runtime->adstack_bound_row_capacities[task_id]`). Gates `publish_adstack_lazy_claim_buffers` and the + // per-task reducer dispatch + DtoH heap sizing. const bool any_lazy_task = std::any_of(offloaded_tasks.begin(), offloaded_tasks.end(), [](const OffloadedTask &t) { return t.ad_stack.bound_expr.has_value(); }); if (any_lazy_task) { + // Allocate / reset the per-kernel lazy-claim arrays once before the first task. See the matching CPU launcher + // block for rationale; on CUDA the same memcpy_host_to_device path through the cached field pointers publishes + // the cleared counter and UINT32_MAX-defaulted capacity arrays. executor->publish_adstack_lazy_claim_buffers(offloaded_tasks.size()); } @@ -88,6 +97,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // `publish_adstack_metadata`'s host-pointer fast path. 
executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.ad_stack.bound_expr.has_value()) { + // Device-side reducer for tasks with a captured ndarray-backed `bound_expr`: a single-thread CUDA kernel + // walks the gating ndarray, counts gate-passing threads, writes the count into + // `runtime->adstack_bound_row_capacities[task_index]`. The codegen-emitted clamp at the float LCA-block + // claim site reads it back. Tasks without a captured gate keep the UINT32_MAX default and the clamp stays + // inert. + // // Reducer length is the gating ndarray's full flat element count, not `n`: the lazy row-claim atomic-rmw // fires once per LCA execution, and `gpu_parallel_struct_for` / `gpu_parallel_range_for` grid-stride (`i += // grid_dim()`) so a single dispatched thread can hit the LCA many times across one launch when the logical @@ -98,6 +113,9 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, if (task.ad_stack.bound_expr->field_source_kind == StaticAdStackBoundExpr::FieldSourceKind::NdArray && !task.ad_stack.bound_expr->ndarray_arg_id.empty() && task.ad_stack.bound_expr->ndarray_ndim > 0 && ctx.args_type != nullptr) { + // Length = product of shape entries via `args_type`. See `runtime/cpu/kernel_launcher.cpp` for the + // unit-stability rationale; `array_runtime_sizes` carries different units depending on the dispatch entry + // point and would undercount by `sizeof(elem)`x for `qd.ndarray` arguments. int64_t flat_len = 1; for (int axis = 0; axis < task.ad_stack.bound_expr->ndarray_ndim; ++axis) { std::vector indices = task.ad_stack.bound_expr->ndarray_arg_id; @@ -119,6 +137,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // dispatched-threads worst case on sparse-grid workloads. 
executor->ensure_per_task_float_heap_post_reducer(task_index, task.ad_stack, n, &ctx); } + // For adstack-bearing tasks, dispatch at most `kAdStackMaxConcurrentThreads` (matching the heap row count + // resolved above). The runtime's grid-strided loop (`gpu_parallel_struct_for` / `gpu_parallel_range_for`, + // `quadrants/runtime/llvm/runtime_module/runtime.cpp`) walks the full element list / range with + // `i += grid_dim()`, so a smaller grid completes the same workload sequentially per slot. Tasks without an + // adstack keep the codegen-emitted `task.grid_dim` (saturating_grid_dim) for max throughput. + // // Floor division (not ceiling): the heap-row count `n` resolved by `resolve_num_threads` floors at // `kAdStackMaxConcurrentThreads`, so dispatching `cap_blocks * block_dim` threads must not exceed that count. // Ceiling division would over-dispatch by `block_dim - 1` threads when `block_dim` does not divide From b5554ca6267ce618b70490c4b77436a906a7314b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 5 May 2026 01:13:15 -0700 Subject: [PATCH 100/109] Fix clang-format line length in kernel launchers --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- quadrants/runtime/cuda/kernel_launcher.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 79ef2ef740..aede51b290 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -324,7 +324,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } int arg_size = sizeof(RuntimeContext *); if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), + active_stream); } void *context_pointer 
= launcher_ctx.runtime_context_dev_ptr; AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index fb68dfcbb4..0f5aa38fba 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -401,7 +401,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), + active_stream); } device_context_ptr = launcher_ctx.runtime_context_dev_ptr; CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), From 8a7cdd795633e825e52521e2966f1873a2ccd7c7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 5 May 2026 02:26:16 -0700 Subject: [PATCH 101/109] Use default stream for persistent buffer alloc/free Persistent scratch buffers (result_buffer, arg_buffer, runtime_context) must use nullptr (default stream) for malloc_async/mem_free_async so the operations serialize with all non-blocking streams. Using active_stream caused use-after-free when the active stream changed between launches. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 11 +++++------ quadrants/runtime/cuda/kernel_launcher.cpp | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index aede51b290..88e64ce7ac 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -230,10 +230,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx const std::size_t needed_result = std::max(ctx.result_buffer_size, sizeof(uint64)); if (needed_result > persistent_result_buffer_capacity_) { if (persistent_result_buffer_dev_ptr_ != nullptr) { - AMDGPUDriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, active_stream); + AMDGPUDriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, nullptr); } const std::size_t new_cap = std::max(needed_result, 2 * persistent_result_buffer_capacity_); - AMDGPUDriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, active_stream); + AMDGPUDriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, nullptr); persistent_result_buffer_capacity_ = new_cap; } device_result_buffer = static_cast(persistent_result_buffer_dev_ptr_); @@ -311,10 +311,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx if (ctx.arg_buffer_size > 0) { if (ctx.arg_buffer_size > launcher_ctx.arg_buffer_capacity) { if (launcher_ctx.arg_buffer_dev_ptr != nullptr) { - AMDGPUDriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, active_stream); + AMDGPUDriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, nullptr); } const std::size_t new_cap = std::max(ctx.arg_buffer_size, 2 * launcher_ctx.arg_buffer_capacity); - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, active_stream); + 
AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, nullptr); launcher_ctx.arg_buffer_capacity = new_cap; } device_arg_buffer = static_cast(launcher_ctx.arg_buffer_dev_ptr); @@ -324,8 +324,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } int arg_size = sizeof(RuntimeContext *); if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), - active_stream); + AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), nullptr); } void *context_pointer = launcher_ctx.runtime_context_dev_ptr; AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 0f5aa38fba..1e44305024 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -278,10 +278,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx const std::size_t needed_result = std::max(ctx.result_buffer_size, sizeof(uint64)); if (needed_result > persistent_result_buffer_capacity_) { if (persistent_result_buffer_dev_ptr_ != nullptr) { - CUDADriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, active_stream); + CUDADriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, nullptr); } const std::size_t new_cap = std::max(needed_result, 2 * persistent_result_buffer_capacity_); - CUDADriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, active_stream); + CUDADriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, nullptr); persistent_result_buffer_capacity_ = new_cap; } device_result_buffer = static_cast(persistent_result_buffer_dev_ptr_); @@ -367,10 +367,10 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx if (ctx.arg_buffer_size > 0) { if (ctx.arg_buffer_size > launcher_ctx.arg_buffer_capacity) { if (launcher_ctx.arg_buffer_dev_ptr != nullptr) { - CUDADriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, active_stream); + CUDADriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, nullptr); } const std::size_t new_cap = std::max(ctx.arg_buffer_size, 2 * launcher_ctx.arg_buffer_capacity); - CUDADriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, nullptr); launcher_ctx.arg_buffer_capacity = new_cap; } device_arg_buffer = static_cast(launcher_ctx.arg_buffer_dev_ptr); @@ -401,8 +401,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), - active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), nullptr); } device_context_ptr = launcher_ctx.runtime_context_dev_ptr; CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), From 594bb8a782f924ee5e5ed937a07ec6dd1fb75ff1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:28:54 -0700 Subject: [PATCH 102/109] Update streams doc: rename fill_a/fill_b, remove redundant synchronize Rename fill_a/fill_b to some_func1/some_func2 in explicit stream examples. Remove redundant synchronize() from context manager example since destroy() already waits for in-flight work. 
--- docs/source/user_guide/streams.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 26ea154321..7158647a18 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -63,8 +63,8 @@ For cases that require manual control — such as launching separate kernels on s1 = qd.create_stream() s2 = qd.create_stream() -fill_a(qd_stream=s1) -fill_b(qd_stream=s2) +some_func1(qd_stream=s1) +some_func2(qd_stream=s2) s1.synchronize() s2.synchronize() @@ -115,9 +115,8 @@ Streams and events support `with` blocks for automatic cleanup: ```python with qd.create_stream() as s: - fill_a(qd_stream=s) - s.synchronize() -# s.destroy() called automatically + some_func1(qd_stream=s) +# s.destroy() called automatically — waits for in-flight work ``` ### PyTorch interop (CUDA) From bfa9ff977c62caff137f207808edfeb0a8c25895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:29:44 -0700 Subject: [PATCH 103/109] Remove incorrect claim about data corruption without stream management --- docs/source/user_guide/streams.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 7158647a18..6ff231f7f1 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -121,7 +121,7 @@ with qd.create_stream() as s: ### PyTorch interop (CUDA) -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. 
#### Running Quadrants kernels on PyTorch's stream From c38f53e9b4dd7a8d14c06c713f165c91c07763ff Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:30:25 -0700 Subject: [PATCH 104/109] Remove PyTorch interop section from streams doc --- docs/source/user_guide/streams.md | 36 ------------------------------- 1 file changed, 36 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 6ff231f7f1..ab0625235d 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -119,42 +119,6 @@ with qd.create_stream() as s: # s.destroy() called automatically — waits for in-flight work ``` -### PyTorch interop (CUDA) - -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. - -#### Running Quadrants kernels on PyTorch's stream - -```python -import torch -from quadrants.lang.stream import Stream - -torch_stream_ptr = torch.cuda.current_stream().cuda_stream -stream = Stream(torch_stream_ptr) - -physics_kernel(qd_stream=stream) -observations = compute_obs_tensor() # PyTorch op on the same stream -apply_actions_kernel(qd_stream=stream) -``` - -Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. - -#### Running PyTorch operations on a Quadrants stream - -```python -qd_stream = qd.create_stream() -torch_stream = torch.cuda.ExternalStream(qd_stream.handle) - -with torch.cuda.stream(torch_stream): - physics_kernel(qd_stream=qd_stream) - observations = compute_obs_tensor() - apply_actions_kernel(qd_stream=qd_stream) - -qd_stream.destroy() -``` - -`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly. - ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. 
From c8d779261c0561f5a74b55a52846de3262138320 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:32:21 -0700 Subject: [PATCH 105/109] Move sync behavior notes out of Limitations into own section --- docs/source/user_guide/streams.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index ab0625235d..6ea695e476 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -119,9 +119,12 @@ with qd.create_stream() as s: # s.destroy() called automatically — waits for in-flight work ``` +## Synchronization notes + +- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. +- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. `stream_parallel` handles this automatically. + ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. - **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. -- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. -- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. `stream_parallel` handles this automatically. 
From 8ef3a0bd0578e4d2c36283f908129f7b8a843536 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:34:45 -0700 Subject: [PATCH 106/109] Revert qd.sync() to default-stream-only synchronization context_synchronize/device_synchronize waits on all streams, which contradicts the documented behavior that qd.sync() only waits on the default stream. stream_parallel already synchronizes its pooled streams before returning, so a global barrier is unnecessary. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 02b2b3d8a8..663e8ffbb0 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -201,15 +201,13 @@ Program *LlvmRuntimeExecutor::get_program() const { void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().context_synchronize(); + CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - AMDGPUContext::get_instance().make_current(); - AMDGPUDriver::get_instance().device_synchronize(); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); #endif From cf09b26df9ef729e6d6cfa85f886836b1e025090 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:50:23 -0700 Subject: [PATCH 107/109] Clarify that qd_stream is implicit in any @qd.kernel call Address review comment: make explicit that qd_stream is a special keyword argument handled by the @qd.kernel decorator, not something the user declares in the kernel signature. 
--- docs/source/user_guide/streams.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 6ea695e476..37d8967eeb 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -59,12 +59,19 @@ For cases that require manual control — such as launching separate kernels on ### Creating and using streams +Any `@qd.kernel` function accepts a special `qd_stream` keyword argument — you do not need to declare it in the kernel signature. The `@qd.kernel` decorator handles it automatically. + ```python +@qd.kernel +def my_kernel(): + for i in range(N): + a[i] = i + s1 = qd.create_stream() s2 = qd.create_stream() -some_func1(qd_stream=s1) -some_func2(qd_stream=s2) +my_kernel(qd_stream=s1) +my_kernel(qd_stream=s2) s1.synchronize() s2.synchronize() @@ -73,7 +80,7 @@ s1.destroy() s2.destroy() ``` -Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. +Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. ### Events From 5f36533bc5fead1ca433675f013bd54898a5e563 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 08:05:16 -0700 Subject: [PATCH 108/109] Note that graph/autodiff + qd_stream raises RuntimeError --- docs/source/user_guide/streams.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 37d8967eeb..a8db331bcc 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,5 +133,5 @@ with qd.create_stream() as s: ## Limitations -- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. 
-- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. +- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True` (if you do, a `RuntimeError` will be raised). +- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context (if you do, a `RuntimeError` will be raised). From b298d92eb5d1c4b6777c2d10d555288b665eb2a3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 09:14:20 -0700 Subject: [PATCH 109/109] Add tests for build_With error branches Cover all 5 error paths in ASTTransformer.build_With: multiple context managers, with-as syntax, non-call expression, non-stream_parallel context manager, and stream_parallel inside @qd.func. --- tests/python/test_streams.py | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fbf7abb155..b89a3b4a42 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -538,3 +538,111 @@ def parallel_fill( qd.sync() assert np.allclose(a.to_numpy(), v), f"iteration {iteration}" assert np.allclose(b.to_numpy(), v * 2.0), f"iteration {iteration}" + + +@test_utils.test() +def test_with_multiple_context_managers_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="single context manager"): + + @qd.kernel + def bad(): + with qd.stream_parallel(), qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_as_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, 
match="with .* as"): + + @qd.kernel + def bad(): + with qd.stream_parallel() as s: + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_non_call_expression_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + dummy = qd.stream_parallel + + with pytest.raises(QuadrantsSyntaxError, match="requires a call expression"): + + @qd.kernel + def bad(): + with dummy: + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_non_stream_parallel_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + def other_ctx(): + pass + + with pytest.raises(QuadrantsSyntaxError, match="only supports qd.stream_parallel"): + + @qd.kernel + def bad(): + with other_ctx(): + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_stream_parallel_in_func_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="only be used inside @qd.kernel"): + + @qd.func + def helper(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + + @qd.kernel + def bad(): + helper() + + bad()