From ab15b1b82c1cc2ef2d0029db9faf913ce4ef2145 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:40:10 -0700 Subject: [PATCH 001/109] Add CUDA stream and event API for concurrent kernel execution Introduces qd.create_stream() and qd.create_event() for launching kernels on separate CUDA streams with event-based synchronization. The qd_stream kwarg on kernel calls routes the launch to a specific stream. Non-CUDA backends return no-op handles (0). Routes kernel launcher memory ops through the active stream. --- python/quadrants/lang/__init__.py | 2 + python/quadrants/lang/kernel.py | 16 +- python/quadrants/lang/stream.py | 96 +++++++++ quadrants/program/program.cpp | 93 +++++++++ quadrants/program/program.h | 10 + quadrants/python/export_lang.cpp | 11 +- .../rhi/cuda/cuda_driver_functions.inc.h | 2 + quadrants/runtime/cuda/kernel_launcher.cpp | 20 +- tests/python/test_api.py | 4 + tests/python/test_cache.py | 8 +- tests/python/test_streams.py | 197 ++++++++++++++++++ 11 files changed, 443 insertions(+), 16 deletions(-) create mode 100644 python/quadrants/lang/stream.py create mode 100644 tests/python/test_streams.py diff --git a/python/quadrants/lang/__init__.py b/python/quadrants/lang/__init__.py index dc4fb2cf19..43a4b44b89 100644 --- a/python/quadrants/lang/__init__.py +++ b/python/quadrants/lang/__init__.py @@ -15,6 +15,7 @@ from quadrants.lang.runtime_ops import * from quadrants.lang.snode import * from quadrants.lang.source_builder import * +from quadrants.lang.stream import * from quadrants.lang.struct import * from quadrants.types.enums import DeviceCapability, Format, Layout # noqa: F401 @@ -45,6 +46,7 @@ "shell", "snode", "source_builder", + "stream", "struct", "util", ] diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index af6dbdacb5..4b1578ac4b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -424,7 +424,9 @@ def materialize(self, key: "CompiledKernelKeyType | None", 
py_args: tuple[Any, . ] runtime._current_global_context = None - def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args) -> Any: + def launch_kernel( + self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args, qd_stream=None + ) -> Any: assert len(args) == len(self.arg_metas), f"{len(self.arg_metas)} arguments needed but {len(args)} provided" callbacks: list[Callable[[], None]] = [] @@ -503,7 +505,14 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled ) self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data - prog.launch_kernel(compiled_kernel_data, launch_ctx) + stream_handle = qd_stream.handle if qd_stream is not None else 0 + if stream_handle: + prog.set_current_cuda_stream(stream_handle) + try: + prog.launch_kernel(compiled_kernel_data, launch_ctx) + finally: + if stream_handle: + prog.set_current_cuda_stream(0) except Exception as e: e = handle_exception_from_cpp(e) if impl.get_runtime().print_full_traceback: @@ -547,6 +556,7 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut # Thus this part needs to be fast. (i.e. 
< 3us on a 4 GHz x64 CPU) @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: + qd_stream = kwargs.pop("qd_stream", None) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() @@ -578,7 +588,7 @@ def __call__(self, *py_args, **kwargs) -> Any: kernel_cpp = self.materialized_kernels[key] compiled_kernel_data = self.compiled_kernel_data_by_key.get(key, None) self.launch_observations.found_kernel_in_materialize_cache = compiled_kernel_data is not None - ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args) + ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args, qd_stream=qd_stream) if compiled_kernel_data is None: assert self._last_compiled_kernel_data is not None self.compiled_kernel_data_by_key[key] = self._last_compiled_kernel_data diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py new file mode 100644 index 0000000000..8530982455 --- /dev/null +++ b/python/quadrants/lang/stream.py @@ -0,0 +1,96 @@ +from quadrants.lang import impl + + +class Stream: + """Wraps a backend-specific GPU stream for concurrent kernel execution. + + On backends without native streams (e.g. CPU), this is a no-op object. + """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def synchronize(self): + """Block until all operations on this stream complete.""" + prog = impl.get_runtime().prog + prog.stream_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the stream. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.stream_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +class Event: + """Wraps a backend-specific GPU event for stream synchronization. + + On backends without native events (e.g. CPU), this is a no-op object. 
+ """ + + def __init__(self, handle: int): + self._handle = handle + + @property + def handle(self) -> int: + return self._handle + + def record(self, stream: Stream | None = None): + """Record this event on a stream. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = stream.handle if stream is not None else 0 + prog.event_record(self._handle, stream_handle) + + def wait(self, qd_stream: Stream | None = None): + """Make a stream wait for this event. None means the default stream.""" + prog = impl.get_runtime().prog + stream_handle = qd_stream.handle if qd_stream is not None else 0 + prog.stream_wait_event(stream_handle, self._handle) + + def synchronize(self): + """Block the host until this event has been reached.""" + prog = impl.get_runtime().prog + prog.event_synchronize(self._handle) + + def destroy(self): + """Explicitly destroy the event. Safe to call multiple times.""" + if self._handle != 0: + prog = impl.get_runtime().prog + prog.event_destroy(self._handle) + self._handle = 0 + + def __del__(self): + if self._handle != 0: + try: + self.destroy() + except Exception: + pass + + +def create_stream() -> Stream: + """Create a new GPU stream for concurrent kernel execution.""" + prog = impl.get_runtime().prog + handle = prog.stream_create() + return Stream(handle) + + +def create_event() -> Event: + """Create a new GPU event for stream synchronization.""" + prog = impl.get_runtime().prog + handle = prog.event_create() + return Event(handle) + + +__all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 7f5dfef2d8..9b2ff0886b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,6 +20,11 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + #ifdef QD_WITH_VULKAN #include 
"quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -481,4 +486,92 @@ void Program::enqueue_compute_op_lambda( program_impl_->enqueue_compute_op_lambda(op, image_refs); } +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDADriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, + 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_record( + reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void 
Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDADriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + } // namespace quadrants::lang diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 1fa2c2ac57..9568c371c8 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -300,6 +300,16 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } + uint64 stream_create(); + void stream_destroy(uint64 stream_handle); + void stream_synchronize(uint64 stream_handle); + void set_current_cuda_stream(uint64 stream_handle); + uint64 event_create(); + void event_destroy(uint64 event_handle); + void event_record(uint64 event_handle, uint64 stream_handle); + void event_synchronize(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all // the implementations should fall inside ProgramImpl diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index b3d23c0037..2f5da8b1b4 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -495,7 +495,16 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) - .def("get_device_caps", &Program::get_device_caps); + .def("get_device_caps", &Program::get_device_caps) + .def("stream_create", &Program::stream_create) + 
.def("stream_destroy", &Program::stream_destroy) + .def("stream_synchronize", &Program::stream_synchronize) + .def("set_current_cuda_stream", &Program::set_current_cuda_stream) + .def("event_create", &Program::event_create) + .def("event_destroy", &Program::event_destroy) + .def("event_record", &Program::event_record) + .def("event_synchronize", &Program::event_synchronize) + .def("stream_wait_event", &Program::stream_wait_event); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 25b3c7958e..a9690ca10b 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -20,6 +20,7 @@ PER_CUDA_FUNCTION(context_set_limit, cuCtxSetLimit, int, std::size_t); // Stream management PER_CUDA_FUNCTION(stream_create, cuStreamCreate, void **, uint32); +PER_CUDA_FUNCTION(stream_destroy, cuStreamDestroy_v2, void *); // Memory management PER_CUDA_FUNCTION(memcpy_host_to_device, cuMemcpyHtoD_v2, void *, void *, std::size_t); @@ -52,6 +53,7 @@ PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_a // Stream management PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); +PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); // Event management PER_CUDA_FUNCTION(event_create, cuEventCreate, void **, uint32) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 5eae5e747d..13845d5a9b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" +#include "quadrants/rhi/cuda/cuda_driver.h" namespace quadrants::lang { namespace cuda { @@ -43,10 +44,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, // kernels. 
std::unordered_map device_ptrs; + auto *active_stream = CUDAContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; CUDADriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64)), nullptr); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); ctx.get_context().runtime = executor->get_llvm_runtime(); for (int i = 0; i < (int)parameters.size(); i++) { @@ -120,7 +123,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { @@ -129,10 +132,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().malloc_async((void **)&device_arg_buffer, - ctx.arg_buffer_size, nullptr); + ctx.arg_buffer_size, active_stream); CUDADriver::get_instance().memcpy_host_to_device_async( device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, - nullptr); + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } @@ -144,17 +147,18 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, {}); } if (ctx.arg_buffer_size > 0) { - CUDADriver::get_instance().mem_free_async(device_arg_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { CUDADriver::get_instance().memcpy_device_to_host_async( host_result_buffer, device_result_buffer, ctx.result_buffer_size, - nullptr); + active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, nullptr); + CUDADriver::get_instance().mem_free_async(device_result_buffer, + active_stream); // copy data back to host if (transfers.size() > 0) { - 
CUDADriver::get_instance().stream_synchronize(nullptr); + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; CUDADriver::get_instance().memcpy_device_to_host( diff --git a/tests/python/test_api.py b/tests/python/test_api.py index cf12abc393..002014c960 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -59,6 +59,7 @@ def _get_expected_matrix_apis(): "DEBUG", "DeviceCapability", "ERROR", + "Event", "Field", "FieldsBuilder", "Format", @@ -73,6 +74,7 @@ def _get_expected_matrix_apis(): "SNode", "ScalarField", "ScalarNdarray", + "Stream", "Struct", "StructField", "TRACE", @@ -117,6 +119,8 @@ def _get_expected_matrix_apis(): "clock_freq_hz", "cos", "cpu", + "create_event", + "create_stream", "cuda", "data_oriented", "dataclass", diff --git a/tests/python/test_cache.py b/tests/python/test_cache.py index c3821e44c5..e31daf61e7 100644 --- a/tests/python/test_cache.py +++ b/tests/python/test_cache.py @@ -216,11 +216,11 @@ def test_fastcache(tmp_path: pathlib.Path, monkeypatch): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) @@ -242,11 +242,11 @@ def fun(value: qd.types.ndarray(), offset: qd.template()): qd_init_same_arch(offline_cache_file_path=str(tmp_path), offline_cache=True) is_valid = False - def launch_kernel(self, key, t_kernel, compiled_kernel_data, *args): + def launch_kernel(self, key, t_kernel, 
compiled_kernel_data, *args, qd_stream=None): nonlocal is_valid is_valid = True assert compiled_kernel_data is not None - return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args) + return launch_kernel_orig(self, key, t_kernel, compiled_kernel_data, *args, qd_stream=qd_stream) monkeypatch.setattr("quadrants.lang.kernel_impl.Kernel.launch_kernel", launch_kernel) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py new file mode 100644 index 0000000000..fabc217e96 --- /dev/null +++ b/tests/python/test_streams.py @@ -0,0 +1,197 @@ +"""Tests for GPU stream and event support.""" + +import numpy as np + +import quadrants as qd +from quadrants.lang.stream import Event, Stream + +from tests import test_utils + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_stream(): + s = qd.create_stream() + assert isinstance(s, Stream) + assert s.handle != 0 + s.destroy() + assert s.handle == 0 + + +@test_utils.test(arch=[qd.cuda]) +def test_create_and_destroy_event(): + e = qd.create_event() + assert isinstance(e, Event) + assert e.handle != 0 + e.destroy() + assert e.handle == 0 + + +@test_utils.test() +def test_kernel_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 42.0 + + s = qd.create_stream() + fill(qd_stream=s) + s.synchronize() + assert np.allclose(x.to_numpy(), 42.0) + s.destroy() + + +@test_utils.test() +def test_two_streams(): + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_a(): + for i in range(N): + a[i] = 1.0 + + @qd.kernel + def fill_b(): + for i in range(N): + b[i] = 2.0 + + s1 = qd.create_stream() + s2 = qd.create_stream() + fill_a(qd_stream=s1) + fill_b(qd_stream=s2) + s1.synchronize() + s2.synchronize() + assert np.allclose(a.to_numpy(), 1.0) + assert np.allclose(b.to_numpy(), 2.0) + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_event_synchronization(): + N = 1024 + x = 
qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 10.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # Default stream waits for s1 to finish fill_x + e.wait() + copy_x_to_y() + qd.sync() + + assert np.allclose(y.to_numpy(), 10.0) + + e.destroy() + s1.destroy() + + +@test_utils.test() +def test_event_wait_on_stream(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + y = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_x(): + for i in range(N): + x[i] = 5.0 + + @qd.kernel + def copy_x_to_y(): + for i in range(N): + y[i] = x[i] + + s1 = qd.create_stream() + s2 = qd.create_stream() + + fill_x(qd_stream=s1) + + e = qd.create_event() + e.record(s1) + + # s2 waits for s1's event before running + e.wait(qd_stream=s2) + copy_x_to_y(qd_stream=s2) + s2.synchronize() + + assert np.allclose(y.to_numpy(), 5.0) + + e.destroy() + s1.destroy() + s2.destroy() + + +@test_utils.test() +def test_default_stream_kernel(): + N = 1024 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 7.0 + + fill() + qd.sync() + assert np.allclose(x.to_numpy(), 7.0) + + +@test_utils.test(arch=[qd.cpu]) +def test_stream_noop_on_cpu(): + """Streams should be no-ops on CPU without errors.""" + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 3.0 + + s = qd.create_stream() + assert s.handle == 0 + fill(qd_stream=s) + qd.sync() + assert np.allclose(x.to_numpy(), 3.0) + + e = qd.create_event() + assert e.handle == 0 + e.record(s) + e.wait() + s.destroy() + e.destroy() + + +@test_utils.test() +def test_stream_with_ndarray(): + N = 1024 + + @qd.kernel + def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(N): + arr[i] = 99.0 + + arr = qd.ndarray(qd.f32, shape=(N,)) + s = qd.create_stream() + fill(arr, 
qd_stream=s) + s.synchronize() + assert np.allclose(arr.to_numpy(), 99.0) + s.destroy() From 7bd18ca4e1a9b6e99632af2c7c62076b4195ae3d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:42:22 -0700 Subject: [PATCH 002/109] Add AMDGPU/HIP stream support and async memory operations Mirrors the CUDA stream implementation for HIP: adds stream_ member to AMDGPUContext, stream_destroy/stream_wait_event/malloc_async/ mem_free_async to HIP driver functions, and AMDGPU branches in all Program stream/event methods. Converts AMDGPU kernel launcher to use async memory operations through the active stream. CPU backend returns 0 handles (no-op). --- quadrants/program/program.cpp | 64 ++++++++++++++ quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 +- quadrants/rhi/amdgpu/amdgpu_context.h | 9 ++ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 8 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 51 +++++------ tests/python/test_streams.py | 84 ++++++++++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..f4bb8da35b 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -25,6 +25,11 @@ #include "quadrants/rhi/cuda/cuda_context.h" #endif +#ifdef QD_WITH_AMDGPU +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" +#endif + #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -493,6 +498,13 @@ uint64 Program::stream_create() { CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); return reinterpret_cast(stream); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *stream = nullptr; + AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + return reinterpret_cast(stream); + } #endif return 0; } @@ -504,6 +516,12 @@ void Program::stream_destroy(uint64 
stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUDriver::get_instance().stream_destroy( + reinterpret_cast(stream_handle)); + } +#endif } void Program::stream_synchronize(uint64 stream_handle) { @@ -513,6 +531,12 @@ void Program::stream_synchronize(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUDriver::get_instance().stream_synchronize( + reinterpret_cast(stream_handle)); + } +#endif } void Program::set_current_cuda_stream(uint64 stream_handle) { @@ -522,6 +546,12 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().set_stream( + reinterpret_cast(stream_handle)); + } +#endif } uint64 Program::event_create() { @@ -532,6 +562,14 @@ uint64 Program::event_create() { 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } +#endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu) { + void *event = nullptr; + AMDGPUDriver::get_instance().event_create(&event, + 0x02 /*hipEventDisableTiming*/); + return reinterpret_cast(event); + } #endif return 0; } @@ -543,6 +581,12 @@ void Program::event_destroy(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_destroy( + reinterpret_cast(event_handle)); + } +#endif } void Program::event_record(uint64 event_handle, uint64 stream_handle) { @@ -553,6 +597,13 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { reinterpret_cast(stream_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_record( + 
reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif } void Program::event_synchronize(uint64 event_handle) { @@ -562,6 +613,12 @@ void Program::event_synchronize(uint64 event_handle) { reinterpret_cast(event_handle)); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().event_synchronize( + reinterpret_cast(event_handle)); + } +#endif } void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { @@ -572,6 +629,13 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { reinterpret_cast(event_handle), 0 /*flags*/); } #endif +#ifdef QD_WITH_AMDGPU + if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUDriver::get_instance().stream_wait_event( + reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif } } // namespace quadrants::lang diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 22f55339ee..f940ed9a7c 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -188,7 +188,7 @@ void AMDGPUContext::launch(void *func, void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02, (void *)&pack_size, (void *)0x03}; driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, - dynamic_shared_mem_bytes, nullptr, nullptr, + dynamic_shared_mem_bytes, stream_, nullptr, reinterpret_cast(&config)); } std::free(packed_arg); @@ -197,7 +197,7 @@ void AMDGPUContext::launch(void *func, profiler_->stop(task_handle); if (debug_) { - driver_.stream_synchronize(nullptr); + driver_.stream_synchronize(stream_); } } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 9529953bf1..68e7cd7314 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,6 +23,7 @@ class AMDGPUContext { KernelProfilerBase 
*profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; + void *stream_{nullptr}; std::vector kernel_arg_pointer_; public: @@ -116,6 +117,14 @@ class AMDGPUContext { return std::unique_lock(lock_); } + void set_stream(void *stream) { + stream_ = stream; + } + + void *get_stream() const { + return stream_; + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index dbb3612c87..6063d268a9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -26,6 +26,7 @@ PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management PER_AMDGPU_FUNCTION(memcpy_host_to_device, @@ -69,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, @@ -76,6 +78,7 @@ PER_AMDGPU_FUNCTION(malloc_managed, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_free_async, hipFreeAsync, void *, void *); PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); PER_AMDGPU_FUNCTION(mem_get_attribute, hipPointerGetAttribute, @@ -121,6 +124,11 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); +PER_AMDGPU_FUNCTION(stream_wait_event, + hipStreamWaitEvent, + void *, + void *, + uint32); // Event management PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git 
a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 6ef0b0e0e5..1d8430d35e 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,5 +1,6 @@ #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" +#include "quadrants/rhi/amdgpu/amdgpu_driver.h" #include "quadrants/program/launch_context_builder.h" namespace quadrants::lang { @@ -32,18 +33,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, transfers; std::unordered_map device_ptrs; + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + char *device_result_buffer{nullptr}; - // Here we have to guarantee the result_result_buffer isn't nullptr - // It is interesting - The code following - // L60: DeviceAllocation devalloc = - // executor->allocate_memory_on_device( call another kernel and it will result - // in - // Memory access fault by GPU node-1 (Agent handle: 0xeda5ca0) on address - // (nil). Reason: Page not present or supervisor privilege. - // if you don't allocate it. - AMDGPUDriver::get_instance().malloc( + // Must always allocate device_result_buffer (even when result_buffer_size + // is 0) to avoid memory access faults from allocate_memory_on_device below. 
+ AMDGPUDriver::get_instance().malloc_async( (void **)&device_result_buffer, - std::max(ctx.result_buffer_size, sizeof(uint64))); + std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); for (int i = 0; i < (int)parameters.size(); i++) { const auto &kv = parameters[i]; @@ -86,27 +83,28 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } if (transfers.size() > 0) { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } char *host_result_buffer = (char *)ctx.get_context().result_buffer; if (ctx.result_buffer_size > 0) { - // Malloc_Async and Free_Async are available after ROCm 5.4 ctx.get_context().result_buffer = (uint64 *)device_result_buffer; } char *device_arg_buffer = nullptr; if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().malloc((void **)&device_arg_buffer, - ctx.arg_buffer_size); - AMDGPUDriver::get_instance().memcpy_host_to_device( - device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size); + AMDGPUDriver::get_instance().malloc_async( + (void **)&device_arg_buffer, ctx.arg_buffer_size, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + device_arg_buffer, ctx.get_context().arg_buffer, ctx.arg_buffer_size, + active_stream); ctx.get_context().arg_buffer = device_arg_buffer; } void *context_pointer; int arg_size = sizeof(RuntimeContext *); - AMDGPUDriver::get_instance().malloc((void **)&context_pointer, - sizeof(RuntimeContext)); - AMDGPUDriver::get_instance().memcpy_host_to_device( - context_pointer, &ctx.get_context(), sizeof(RuntimeContext)); + AMDGPUDriver::get_instance().malloc_async( + (void **)&context_pointer, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -119,13 +117,16 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free(device_arg_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, + active_stream); } if (ctx.result_buffer_size > 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host( - host_result_buffer, device_result_buffer, ctx.result_buffer_size); + AMDGPUDriver::get_instance().memcpy_device_to_host_async( + host_result_buffer, device_result_buffer, ctx.result_buffer_size, + active_stream); } if (transfers.size()) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; auto arg_id = idx.arg_id; @@ -135,8 +136,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - // Since we always allocating above then we should always free - AMDGPUDriver::get_instance().mem_free(device_result_buffer); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -8,7 +8,7 @@ from tests import test_utils -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_stream(): s = qd.create_stream() assert isinstance(s, Stream) @@ -17,7 +17,7 @@ def test_create_and_destroy_stream(): assert s.handle == 0 -@test_utils.test(arch=[qd.cuda]) +@test_utils.test(arch=[qd.cuda, qd.amdgpu]) def test_create_and_destroy_event(): e = qd.create_event() assert isinstance(e, Event) @@ -195,3 +195,83 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def 
test_concurrent_streams_with_events(): + """Two slow kernels on separate streams run concurrently (~1s on GPU), + serial fallback on CPU/Metal.""" + SPIN_ITERS = 5_000_000 + + @qd.kernel + def slow_fill( + a: qd.types.ndarray(dtype=qd.f32, ndim=1), + lcg_state: qd.types.ndarray(dtype=qd.i32, ndim=1), + index: qd.i32, + value: qd.f32, + ): + qd.loop_config(block_dim=1) + for _ in range(1): + x = lcg_state[index] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + lcg_state[index] = x + a[index] = value + + @qd.kernel + def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): + qd.loop_config(block_dim=1) + for _ in range(1): + a[2] = a[0] + a[1] + + import time + + # Warm up JIT + a_warmup = qd.ndarray(qd.f32, shape=(3,)) + lcg_warmup = qd.ndarray(qd.i32, shape=(3,)) + slow_fill(a_warmup, lcg_warmup, 0, 0.0) + add_first_two(a_warmup) + qd.sync() + + # Serial baseline + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0) + slow_fill(a, lcg, 1, 7.0) + add_first_two(a) + qd.sync() + serial_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + # Streams + a = qd.ndarray(qd.f32, shape=(3,)) + lcg = qd.ndarray(qd.i32, shape=(3,)) + s1 = qd.create_stream() + s2 = qd.create_stream() + e1 = qd.create_event() + e2 = qd.create_event() + qd.sync() + t0 = time.perf_counter() + slow_fill(a, lcg, 0, 5.0, qd_stream=s1) + slow_fill(a, lcg, 1, 7.0, qd_stream=s2) + e1.record(s1) + e2.record(s2) + e1.wait() + e2.wait() + add_first_two(a) + qd.sync() + stream_time = time.perf_counter() - t0 + assert np.isclose(a.to_numpy()[2], 12.0) + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + else: + assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" + + s1.destroy() + s2.destroy() + e1.destroy() + 
e2.destroy() From a40ed4ccd03a1162cf40a5f4fa35ee6ee7979abc Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 16:47:44 -0700 Subject: [PATCH 003/109] Add qd.stream_parallel() context manager for implicit stream parallelism Introduces stream_parallel() for running top-level for-loop blocks on separate GPU streams. The AST transformer maps 'with qd.stream_parallel()' blocks to stream-parallel group IDs, which propagate through IR lowering and offloading to the CUDA/AMDGPU kernel launchers. Each unique group ID gets its own stream at launch time. Includes validation that all top-level kernel statements must be stream_parallel blocks (no mixing), and offline cache key support. --- python/quadrants/lang/ast/ast_transformer.py | 32 +++- .../function_def_transformer.py | 29 +++ python/quadrants/lang/stream.py | 15 +- quadrants/analysis/gen_offline_cache_key.cpp | 1 + quadrants/codegen/amdgpu/codegen_amdgpu.cpp | 1 + quadrants/codegen/cuda/codegen_cuda.cpp | 1 + quadrants/codegen/llvm/llvm_compiled_data.h | 13 +- quadrants/ir/frontend_ir.cpp | 12 +- quadrants/ir/frontend_ir.h | 12 ++ quadrants/ir/statements.cpp | 3 + quadrants/ir/statements.h | 3 + quadrants/python/export_lang.cpp | 4 +- quadrants/runtime/amdgpu/kernel_launcher.cpp | 52 ++++- quadrants/runtime/cuda/kernel_launcher.cpp | 52 ++++- quadrants/transforms/lower_ast.cpp | 3 + quadrants/transforms/offload.cpp | 3 + tests/python/test_api.py | 1 + tests/python/test_streams.py | 178 ++++++++++++++++-- 18 files changed, 377 insertions(+), 38 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 1b13ead0f9..f5cfbeef1a 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -28,6 +28,7 @@ from quadrants.lang.ast.ast_transformers.function_def_transformer import ( FunctionDefTransformer, ) +from quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception 
import ( QuadrantsIndexError, QuadrantsRuntimeTypeError, @@ -39,6 +40,7 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length +from quadrants.lang.stream import stream_parallel from quadrants.lang.struct import Struct, StructType from quadrants.types import primitive_types from quadrants.types.utils import is_integral @@ -108,7 +110,11 @@ def build_AnnAssign(ctx: ASTTransformerFuncContext, node: ast.AnnAssign): @staticmethod def build_assign_annotated( - ctx: ASTTransformerFuncContext, target: ast.Name, value, is_static_assign: bool, annotation: Type + ctx: ASTTransformerFuncContext, + target: ast.Name, + value, + is_static_assign: bool, + annotation: Type, ): """Build an annotated assignment like this: target: annotation = value. @@ -156,7 +162,10 @@ def build_Assign(ctx: ASTTransformerFuncContext, node: ast.Assign) -> None: @staticmethod def build_assign_unpack( - ctx: ASTTransformerFuncContext, node_target: list | ast.Tuple, values, is_static_assign: bool + ctx: ASTTransformerFuncContext, + node_target: list | ast.Tuple, + values, + is_static_assign: bool, ): """Build the unpack assignments like this: (target1, target2) = (value1, value2). The function should be called only if the node target is a tuple. 
@@ -538,7 +547,8 @@ def build_Return(ctx: ASTTransformerFuncContext, node: ast.Return) -> None: else: raise QuadrantsSyntaxError("The return type is not supported now!") ctx.ast_builder.create_kernel_exprgroup_return( - expr.make_expr_group(return_exprs), _qd_core.DebugInfo(ctx.get_pos_info(node)) + expr.make_expr_group(return_exprs), + _qd_core.DebugInfo(ctx.get_pos_info(node)), ) else: ctx.return_data = node.value.ptr @@ -1381,6 +1391,22 @@ def build_Continue(ctx: ASTTransformerFuncContext, node: ast.Continue) -> None: ctx.ast_builder.insert_continue_stmt(_qd_core.DebugInfo(ctx.get_pos_info(node))) return None + @staticmethod + def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: + if len(node.items) != 1: + raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports a single context manager") + item = node.items[0] + if item.optional_vars is not None: + raise QuadrantsSyntaxError("'with ... as ...' is not supported in Quadrants kernels") + if not isinstance(item.context_expr, ast.Call): + raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") + if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): + raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") + ctx.ast_builder.begin_stream_parallel() + build_stmts(ctx, node.body) + ctx.ast_builder.end_stream_parallel() + return None + @staticmethod def build_Pass(ctx: ASTTransformerFuncContext, node: ast.Pass) -> None: return None diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 6d000b69f5..dacbac4c96 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -21,10 +21,12 @@ from quadrants.lang.ast.ast_transformer_utils import ( ASTTransformerFuncContext, ) +from 
quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception import ( QuadrantsSyntaxError, ) from quadrants.lang.matrix import MatrixType +from quadrants.lang.stream import stream_parallel from quadrants.lang.struct import StructType from quadrants.lang.util import to_quadrants_type from quadrants.types import annotations, ndarray_type, primitive_types @@ -295,7 +297,34 @@ def build_FunctionDef( else: FunctionDefTransformer._transform_as_func(ctx, node, args) + if ctx.is_kernel: + FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) + with ctx.variable_scope_guard(): build_stmts(ctx, node.body) return None + + @staticmethod + def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> bool: + if not isinstance(stmt, ast.With): + return False + if len(stmt.items) != 1: + return False + item = stmt.items[0] + if not isinstance(item.context_expr, ast.Call): + return False + return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + + @staticmethod + def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: + has_sp = any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body) + if not has_sp: + return + for stmt in body: + if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): + raise QuadrantsSyntaxError( + "When using qd.stream_parallel(), all top-level statements " + "in the kernel must be 'with qd.stream_parallel():' blocks. " + "Move non-parallel code to a separate kernel." 
+ ) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8530982455..77979184d4 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager + from quadrants.lang import impl @@ -93,4 +95,15 @@ def create_event() -> Event: return Event(handle) -__all__ = ["Stream", "Event", "create_stream", "create_event"] +@contextmanager +def stream_parallel(): + """Run top-level for loops in this block on separate GPU streams. + + Used inside @qd.kernel. At Python runtime (outside kernels), this is a + no-op. During kernel compilation, the AST transformer calls into the C++ + ASTBuilder to tag loops with a stream-parallel group ID. + """ + yield + + +__all__ = ["Stream", "Event", "create_stream", "create_event", "stream_parallel"] diff --git a/quadrants/analysis/gen_offline_cache_key.cpp b/quadrants/analysis/gen_offline_cache_key.cpp index f9eb5dc324..9a38eb9ac2 100644 --- a/quadrants/analysis/gen_offline_cache_key.cpp +++ b/quadrants/analysis/gen_offline_cache_key.cpp @@ -382,6 +382,7 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(stmt->strictly_serialized); emit(stmt->mem_access_opt); emit(stmt->block_dim); + emit(stmt->stream_parallel_group_id); emit(stmt->body.get()); } diff --git a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp index bba1c87f20..e0fcca575e 100644 --- a/quadrants/codegen/amdgpu/codegen_amdgpu.cpp +++ b/quadrants/codegen/amdgpu/codegen_amdgpu.cpp @@ -396,6 +396,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM { current_task->grid_dim = num_SMs * query_max_block_per_sm; } current_task->block_dim = stmt->block_dim; + current_task->stream_parallel_group_id = stmt->stream_parallel_group_id; QD_ASSERT(current_task->grid_dim != 0); QD_ASSERT(current_task->block_dim != 0); offloaded_tasks.push_back(*current_task); diff --git a/quadrants/codegen/cuda/codegen_cuda.cpp 
b/quadrants/codegen/cuda/codegen_cuda.cpp index 8395f7adca..4795db23d2 100644 --- a/quadrants/codegen/cuda/codegen_cuda.cpp +++ b/quadrants/codegen/cuda/codegen_cuda.cpp @@ -692,6 +692,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM { } current_task->block_dim = stmt->block_dim; current_task->dynamic_shared_array_bytes = dynamic_shared_array_bytes; + current_task->stream_parallel_group_id = stmt->stream_parallel_group_id; QD_ASSERT(current_task->grid_dim != 0); QD_ASSERT(current_task->block_dim != 0); offloaded_tasks.push_back(*current_task); diff --git a/quadrants/codegen/llvm/llvm_compiled_data.h b/quadrants/codegen/llvm/llvm_compiled_data.h index 16d4978bd4..f496e6fa3c 100644 --- a/quadrants/codegen/llvm/llvm_compiled_data.h +++ b/quadrants/codegen/llvm/llvm_compiled_data.h @@ -14,16 +14,23 @@ class OffloadedTask { int block_dim{0}; int grid_dim{0}; int dynamic_shared_array_bytes{0}; + int stream_parallel_group_id{0}; explicit OffloadedTask(const std::string &name = "", int block_dim = 0, int grid_dim = 0, - int dynamic_shared_array_bytes = 0) + int dynamic_shared_array_bytes = 0, + int stream_parallel_group_id = 0) : name(name), block_dim(block_dim), grid_dim(grid_dim), - dynamic_shared_array_bytes(dynamic_shared_array_bytes) {}; - QD_IO_DEF(name, block_dim, grid_dim, dynamic_shared_array_bytes); + dynamic_shared_array_bytes(dynamic_shared_array_bytes), + stream_parallel_group_id(stream_parallel_group_id) {}; + QD_IO_DEF(name, + block_dim, + grid_dim, + dynamic_shared_array_bytes, + stream_parallel_group_id); }; struct LLVMCompiledTask { diff --git a/quadrants/ir/frontend_ir.cpp b/quadrants/ir/frontend_ir.cpp index ae2e3ebe7c..6cf3087643 100644 --- a/quadrants/ir/frontend_ir.cpp +++ b/quadrants/ir/frontend_ir.cpp @@ -119,7 +119,8 @@ FrontendForStmt::FrontendForStmt(const FrontendForStmt &o) num_cpu_threads(o.num_cpu_threads), strictly_serialized(o.strictly_serialized), mem_access_opt(o.mem_access_opt), - block_dim(o.block_dim) { + block_dim(o.block_dim), + 
stream_parallel_group_id(o.stream_parallel_group_id) { } void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) { @@ -127,6 +128,7 @@ void FrontendForStmt::init_config(Arch arch, const ForLoopConfig &config) { strictly_serialized = config.strictly_serialized; mem_access_opt = config.mem_access_opt; block_dim = config.block_dim; + stream_parallel_group_id = config.stream_parallel_group_id; if (arch == Arch::cuda || arch == Arch::amdgpu) { num_cpu_threads = 1; QD_ASSERT(block_dim <= quadrants_max_gpu_block_dim); @@ -1542,6 +1544,8 @@ void ASTBuilder::begin_frontend_range_for(const Expr &i, const Expr &s, const Expr &e, const DebugInfo &dbg_info) { + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( i, s, e, arch_, for_loop_dec_.config, dbg_info); auto stmt = stmt_unique.get(); @@ -1558,6 +1562,8 @@ void ASTBuilder::begin_frontend_struct_for_on_snode(const ExprGroup &loop_vars, for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the struct for. " "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( loop_vars, snode, arch_, for_loop_dec_.config, dbg_info); for_loop_dec_.reset(); @@ -1574,6 +1580,8 @@ void ASTBuilder::begin_frontend_struct_for_on_external_tensor( for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the struct for. " "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique( loop_vars, external_tensor, arch_, for_loop_dec_.config, dbg_info); for_loop_dec_.reset(); @@ -1591,6 +1599,8 @@ void ASTBuilder::begin_frontend_mesh_for( for_loop_dec_.config.strictly_serialized, "ti.loop_config(serialize=True) does not have effect on the mesh for. 
" "The execution order is not guaranteed."); + for_loop_dec_.config.stream_parallel_group_id = + current_stream_parallel_group_id_; auto stmt_unique = std::make_unique(ExprGroup(i), mesh_ptr, element_type, arch_, for_loop_dec_.config, dbg_info); diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index bce009f9e7..693a7f461f 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -23,6 +23,7 @@ struct ForLoopConfig { MemoryAccessOptions mem_access_opt; int block_dim{0}; bool uniform{false}; + int stream_parallel_group_id{0}; }; #define QD_DEFINE_CLONE_FOR_FRONTEND_IR \ @@ -207,6 +208,7 @@ class FrontendForStmt : public Stmt { bool strictly_serialized; MemoryAccessOptions mem_access_opt; int block_dim; + int stream_parallel_group_id{0}; FrontendForStmt(const ExprGroup &loop_vars, SNode *snode, @@ -961,6 +963,8 @@ class ASTBuilder { Arch arch_; ForLoopDecoratorRecorder for_loop_dec_; int id_counter_{0}; + int stream_parallel_group_counter_{0}; + int current_stream_parallel_group_id_{0}; public: ASTBuilder(Block *initial, Arch arch, bool is_kernel) @@ -1107,6 +1111,14 @@ class ASTBuilder { for_loop_dec_.reset(); } + void begin_stream_parallel() { + current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; + } + + void end_stream_parallel() { + current_stream_parallel_group_id_ = 0; + } + Identifier get_next_id(const std::string &name = "") { return Identifier(id_counter_++, name); } diff --git a/quadrants/ir/statements.cpp b/quadrants/ir/statements.cpp index 14c55be85e..79b469a22a 100644 --- a/quadrants/ir/statements.cpp +++ b/quadrants/ir/statements.cpp @@ -244,6 +244,7 @@ std::unique_ptr RangeForStmt::clone() const { begin, end, body->clone(), is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized); new_stmt->reversed = reversed; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } @@ -265,6 +266,7 @@ std::unique_ptr StructForStmt::clone() const { auto new_stmt = 
std::make_unique( snode, body->clone(), is_bit_vectorized, num_cpu_threads, block_dim); new_stmt->mem_access_opt = mem_access_opt; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } @@ -439,6 +441,7 @@ std::unique_ptr OffloadedStmt::clone() const { new_stmt->tls_size = tls_size; new_stmt->bls_size = bls_size; new_stmt->mem_access_opt = mem_access_opt; + new_stmt->stream_parallel_group_id = stream_parallel_group_id; return new_stmt; } diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index e06bb6d4df..3f440fe4e2 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -1016,6 +1016,7 @@ class RangeForStmt : public Stmt { int block_dim; bool strictly_serialized; std::string range_hint; + int stream_parallel_group_id{0}; RangeForStmt(Stmt *begin, Stmt *end, @@ -1061,6 +1062,7 @@ class StructForStmt : public Stmt { int num_cpu_threads; int block_dim; MemoryAccessOptions mem_access_opt; + int stream_parallel_group_id{0}; StructForStmt(SNode *snode, std::unique_ptr &&body, @@ -1443,6 +1445,7 @@ class OffloadedStmt : public Stmt { std::size_t tls_size{1}; // avoid allocating dynamic memory with 0 byte std::size_t bls_size{0}; MemoryAccessOptions mem_access_opt; + int stream_parallel_group_id{0}; OffloadedStmt(TaskType task_type, Arch arch, Kernel *kernel); diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 2f5da8b1b4..d134464d49 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -357,7 +357,9 @@ void export_lang(py::module &m) { .def("strictly_serialize", &ASTBuilder::strictly_serialize) .def("block_dim", &ASTBuilder::block_dim) .def("insert_snode_access_flag", &ASTBuilder::insert_snode_access_flag) - .def("reset_snode_access_flag", &ASTBuilder::reset_snode_access_flag); + .def("reset_snode_access_flag", &ASTBuilder::reset_snode_access_flag) + .def("begin_stream_parallel", &ASTBuilder::begin_stream_parallel) + 
.def("end_stream_parallel", &ASTBuilder::end_stream_parallel); auto device_capability_config = py::class_(m, "DeviceCapabilityConfig") diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..1b82b33459 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -1,3 +1,5 @@ +#include + #include "quadrants/runtime/amdgpu/kernel_launcher.h" #include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/amdgpu/amdgpu_driver.h" @@ -108,12 +110,50 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); - for (auto &task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + for (size_t i = 0; i < offloaded_tasks.size();) { + auto &task = offloaded_tasks[i]; + if (task.stream_parallel_group_id == 0) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + i++; + } else { + size_t group_start = i; + while (i < offloaded_tasks.size() && + offloaded_tasks[i].stream_parallel_group_id != 0) { + i++; + } + + std::map stream_by_id; + for (size_t j = group_start; j < i; j++) { + int sid = offloaded_tasks[j].stream_parallel_group_id; + if (stream_by_id.find(sid) == stream_by_id.end()) { + void *s = nullptr; + AMDGPUDriver::get_instance().stream_create(&s, 0); + stream_by_id[sid] = s; + } + } + + for (size_t j = group_start; j < i; j++) { + auto &t = offloaded_tasks[j]; + AMDGPUContext::get_instance().set_stream( + stream_by_id[t.stream_parallel_group_id]); + amdgpu_module->launch(t.name, 
t.grid_dim, t.block_dim, + t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } + + for (auto &[sid, s] : stream_by_id) { + AMDGPUDriver::get_instance().stream_synchronize(s); + } + for (auto &[sid, s] : stream_by_id) { + AMDGPUDriver::get_instance().stream_destroy(s); + } + + AMDGPUContext::get_instance().set_stream(active_stream); + } } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 13845d5a9b..94aa786b56 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -1,3 +1,5 @@ +#include + #include "quadrants/runtime/cuda/kernel_launcher.h" #include "quadrants/rhi/cuda/cuda_context.h" #include "quadrants/rhi/cuda/cuda_driver.h" @@ -139,12 +141,50 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, ctx.get_context().arg_buffer = device_arg_buffer; } - for (auto task : offloaded_tasks) { - QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - cuda_module->launch(task.name, task.grid_dim, task.block_dim, - task.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); + for (size_t i = 0; i < offloaded_tasks.size();) { + auto &task = offloaded_tasks[i]; + if (task.stream_parallel_group_id == 0) { + QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + cuda_module->launch(task.name, task.grid_dim, task.block_dim, + task.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + i++; + } else { + size_t group_start = i; + while (i < offloaded_tasks.size() && + offloaded_tasks[i].stream_parallel_group_id != 0) { + i++; + } + + std::map stream_by_id; + for (size_t j = group_start; j < i; j++) { + int sid = offloaded_tasks[j].stream_parallel_group_id; + if (stream_by_id.find(sid) == stream_by_id.end()) { + void *s = nullptr; + CUDADriver::get_instance().stream_create(&s, 0); + stream_by_id[sid] = s; + } + 
} + + for (size_t j = group_start; j < i; j++) { + auto &t = offloaded_tasks[j]; + CUDAContext::get_instance().set_stream( + stream_by_id[t.stream_parallel_group_id]); + cuda_module->launch(t.name, t.grid_dim, t.block_dim, + t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); + } + + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_synchronize(s); + } + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_destroy(s); + } + + CUDAContext::get_instance().set_stream(active_stream); + } } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); diff --git a/quadrants/transforms/lower_ast.cpp b/quadrants/transforms/lower_ast.cpp index 74b698a9e6..ef1bb6f06a 100644 --- a/quadrants/transforms/lower_ast.cpp +++ b/quadrants/transforms/lower_ast.cpp @@ -232,6 +232,7 @@ class LowerAST : public IRVisitor { snode, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim); new_for->index_offsets = offsets; + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; VecStatement new_statements; for (int i = 0; i < (int)stmt->loop_var_ids.size(); i++) { Stmt *loop_index = new_statements.push_back( @@ -270,6 +271,7 @@ class LowerAST : public IRVisitor { begin, end, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim, stmt->strictly_serialized, /*range_hint=*/fmt::format("arg ({})", fmt::join(arg_id, ", "))); + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; VecStatement new_statements; Stmt *loop_index = new_statements.push_back(new_for.get(), 0); @@ -311,6 +313,7 @@ class LowerAST : public IRVisitor { begin_stmt, end_stmt, std::move(stmt->body), stmt->is_bit_vectorized, stmt->num_cpu_threads, stmt->block_dim, stmt->strictly_serialized); + new_for->stream_parallel_group_id = stmt->stream_parallel_group_id; new_for->body->insert(std::make_unique(new_for.get(), 0), 0); 
new_for->body->local_var_to_stmt[stmt->loop_var_ids[0]] = diff --git a/quadrants/transforms/offload.cpp b/quadrants/transforms/offload.cpp index 2f20247364..f3e254a889 100644 --- a/quadrants/transforms/offload.cpp +++ b/quadrants/transforms/offload.cpp @@ -134,6 +134,7 @@ class Offloader { offloaded->body->insert(std::move(s->body->statements[j])); } offloaded->range_hint = s->range_hint; + offloaded->stream_parallel_group_id = s->stream_parallel_group_id; root_block->insert(std::move(offloaded)); } else if (auto st = stmt->cast()) { assemble_serial_statements(); @@ -257,6 +258,8 @@ class Offloader { offloaded_struct_for->num_cpu_threads = std::min(for_stmt->num_cpu_threads, config.cpu_max_num_threads); offloaded_struct_for->mem_access_opt = mem_access_opt; + offloaded_struct_for->stream_parallel_group_id = + for_stmt->stream_parallel_group_id; root_block->insert(std::move(offloaded_struct_for)); } diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 002014c960..241f3143de 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -218,6 +218,7 @@ def _get_expected_matrix_apis(): "static_assert", "static_print", "stop_grad", + "stream_parallel", "svd", "sym_eig", "sync", diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..4c28b6f581 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -180,23 +180,6 @@ def fill(): e.destroy() -@test_utils.test() -def test_stream_with_ndarray(): - N = 1024 - - @qd.kernel - def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): - for i in range(N): - arr[i] = 99.0 - - arr = qd.ndarray(qd.f32, shape=(N,)) - s = qd.create_stream() - fill(arr, qd_stream=s) - s.synchronize() - assert np.allclose(arr.to_numpy(), 99.0) - s.destroy() - - @test_utils.test() def test_concurrent_streams_with_events(): """Two slow kernels on separate streams run concurrently (~1s on GPU), @@ -275,3 +258,164 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, 
ndim=1)): s2.destroy() e1.destroy() e2.destroy() + + +@test_utils.test() +def test_stream_parallel_basic(): + """Each with qd.stream_parallel() block runs on its own stream (serial fallback on CPU/Metal).""" + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill_parallel(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + with qd.stream_parallel(): + for j in range(N): + b[j] = 2.0 + + fill_parallel() + qd.sync() + assert np.allclose(a.to_numpy(), 1.0) + assert np.allclose(b.to_numpy(), 2.0) + + +@test_utils.test() +def test_stream_parallel_multiple_loops_per_stream(): + """Multiple for loops inside one stream_parallel block share a stream (serial fallback on CPU/Metal).""" + N = 1024 + a = qd.field(qd.f32, shape=(N,)) + b = qd.field(qd.f32, shape=(N,)) + c = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def parallel_phase(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + for i in range(N): + a[i] = a[i] + 1.0 + with qd.stream_parallel(): + for j in range(N): + b[j] = 10.0 + + @qd.kernel + def combine(): + for i in range(N): + c[i] = a[i] + b[i] + + parallel_phase() + combine() + qd.sync() + assert np.allclose(a.to_numpy(), 2.0) + assert np.allclose(b.to_numpy(), 10.0) + assert np.allclose(c.to_numpy(), 12.0) + + +@test_utils.test() +def test_stream_parallel_timing(): + """stream_parallel achieves speedup on GPU, serial fallback elsewhere.""" + SPIN_ITERS = 5_000_000 + + a = qd.field(qd.i32, shape=(2,)) + b = qd.field(qd.i32, shape=(2,)) + + @qd.kernel + def serial_spin(): + for _ in range(1): + x = a[0] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + a[0] = x + for _ in range(1): + x = a[1] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + a[1] = x + + @qd.kernel + def parallel_spin(): + with qd.stream_parallel(): + for _ in range(1): + x = b[0] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 
+ b[0] = x + with qd.stream_parallel(): + for _ in range(1): + x = b[1] + for _j in range(SPIN_ITERS): + x = (1664525 * x + 1013904223) % 2147483647 + b[1] = x + + import time + + # Warm up + serial_spin() + parallel_spin() + qd.sync() + + qd.sync() + t0 = time.perf_counter() + serial_spin() + qd.sync() + serial_time = time.perf_counter() - t0 + + qd.sync() + t0 = time.perf_counter() + parallel_spin() + qd.sync() + stream_time = time.perf_counter() - t0 + + speedup = serial_time / stream_time + if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): + assert speedup > 1.5, ( + f"Expected >1.5x speedup, got {speedup:.2f}x " f"(serial={serial_time:.3f}s, stream={stream_time:.3f}s)" + ) + else: + assert speedup > 0.75, ( + f"Expected >=0.75x (serial fallback), got {speedup:.2f}x " + f"(serial={serial_time:.3f}s, stream={stream_time:.3f}s)" + ) + + +@test_utils.test() +def test_stream_parallel_rejects_mixed_top_level(): + """Mixing stream_parallel and non-stream_parallel at top level is an error.""" + import pytest # noqa: I001 + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="all top-level statements"): + + @qd.kernel + def bad_kernel(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + for i in range(N): + a[i] = 2.0 + + bad_kernel() + + +@test_utils.test() +def test_stream_with_ndarray(): + N = 1024 + + @qd.kernel + def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): + for i in range(N): + arr[i] = 99.0 + + arr = qd.ndarray(qd.f32, shape=(N,)) + s = qd.create_stream() + fill(arr, qd_stream=s) + s.synchronize() + assert np.allclose(arr.to_numpy(), 99.0) + s.destroy() From b856b33247dfbb55ca5f781e788fc50d5e32c9e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:25:18 -0700 Subject: [PATCH 004/109] Address review feedback for CUDA streams PR - Make CUDAContext::stream_ thread_local for thread-safety - Convert sync 
memcpy_host_to_device to async on active_stream - Use weakref in Stream/Event __del__ to safely handle interpreter shutdown - Add __enter__/__exit__ context manager support for Stream and Event - Use consistent qd_stream parameter naming in Event.record and Event.wait - Add handle==0 guard to stream_synchronize --- python/quadrants/lang/stream.py | 60 ++++++++++++++++------ quadrants/program/program.cpp | 2 +- quadrants/rhi/cuda/cuda_context.cpp | 6 +-- quadrants/rhi/cuda/cuda_context.h | 2 +- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++-- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8530982455..8f6cfab3d6 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -1,14 +1,22 @@ +import weakref + from quadrants.lang import impl +def _get_prog_weakref(): + return weakref.ref(impl.get_runtime().prog) + + class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. On backends without native streams (e.g. CPU), this is a no-op object. + Call destroy() explicitly or use as a context manager to ensure cleanup. """ - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: @@ -27,30 +35,41 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.stream_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() class Event: """Wraps a backend-specific GPU event for stream synchronization. On backends without native events (e.g. CPU), this is a no-op object. 
+ Call destroy() explicitly or use as a context manager to ensure cleanup. """ - def __init__(self, handle: int): + def __init__(self, handle: int, prog_ref: weakref.ref | None = None): self._handle = handle + self._prog_ref = prog_ref @property def handle(self) -> int: return self._handle - def record(self, stream: Stream | None = None): + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" prog = impl.get_runtime().prog - stream_handle = stream.handle if stream is not None else 0 + stream_handle = qd_stream.handle if qd_stream is not None else 0 prog.event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): @@ -72,25 +91,34 @@ def destroy(self): self._handle = 0 def __del__(self): - if self._handle != 0: - try: - self.destroy() - except Exception: - pass + if self._handle != 0 and self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + try: + prog.event_destroy(self._handle) + self._handle = 0 + except Exception: + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.destroy() def create_stream() -> Stream: """Create a new GPU stream for concurrent kernel execution.""" prog = impl.get_runtime().prog handle = prog.stream_create() - return Stream(handle) + return Stream(handle, _get_prog_weakref()) def create_event() -> Event: """Create a new GPU event for stream synchronization.""" prog = impl.get_runtime().prog handle = prog.event_create() - return Event(handle) + return Event(handle, _get_prog_weakref()) __all__ = ["Stream", "Event", "create_stream", "create_event"] diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 9b2ff0886b..be152d02da 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -508,7 +508,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if 
(compile_config().arch == Arch::cuda) { + if (compile_config().arch == Arch::cuda && stream_handle != 0) { CUDADriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 89c16135a2..23399649a9 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -11,10 +11,10 @@ namespace quadrants::lang { +thread_local void *CUDAContext::stream_ = nullptr; + CUDAContext::CUDAContext() - : profiler_(nullptr), - driver_(CUDADriver::get_instance_without_context()), - stream_(nullptr) { + : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index c57baa3d92..ba891644a7 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -30,7 +30,7 @@ class CUDAContext { int max_shared_memory_bytes_; bool debug_; bool supports_mem_pool_; - void *stream_; + static thread_local void *stream_; public: CUDAContext(); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 13845d5a9b..9bbf75044e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -85,8 +85,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device( @@ -95,8 +96,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, 
executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz); + CUDADriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, + active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } From 7555ec5edf0581290df8b902b5a31e6162521fe3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:03 -0700 Subject: [PATCH 005/109] Move AMDGPU mem_free_async before transfers sync to match CUDA ordering Batch the device_result_buffer free into the stream pipeline before the sync barrier, matching the CUDA kernel launcher's ordering for consistency and marginal performance improvement. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1d8430d35e..cff0f2b4a1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -125,6 +125,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, + active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { @@ -136,8 +138,6 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->deallocate_memory_on_device(itr->second.second); } } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel( From c12d23e1e1426a0b538382cb5dcab489e4c09b2e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:18 -0700 Subject: [PATCH 006/109] Convert 
AMDGPU sync memcpy_host_to_device to async on active_stream Use memcpy_host_to_device_async for external array transfers so they are properly ordered on the active stream, matching the CUDA launcher. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index cff0f2b4a1..f772fc7b5b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -66,8 +66,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz); + AMDGPUDriver::get_instance().memcpy_host_to_device_async( + (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); } ctx.set_ndarray_ptrs(arg_id, (uint64)device_ptrs[data_ptr_idx], (uint64)ctx.array_ptrs[grad_ptr_idx]); From 1673a38761b50fb6af4767e569fbf88751bb4788 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:25 -0700 Subject: [PATCH 007/109] Document ROCm >= 5.4 requirement for hipMallocAsync/hipFreeAsync --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6063d268a9..25e33774e7 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -70,6 +70,7 @@ PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4 PER_AMDGPU_FUNCTION(malloc_async, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, From 
60d015bfddac7068d1d1067d8f059e9c3236447e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:35 -0700 Subject: [PATCH 008/109] Relax concurrency test threshold and log timings Lower GPU speedup threshold from 1.5x to 1.3x to reduce flakiness in CI under contention, and print actual timings for diagnostics. --- tests/python/test_streams.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..236578974d 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,8 +266,9 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time + print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" + assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From c4be4ffd7c77a68ed6176ce30900d1a2260dec5b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:27:55 -0700 Subject: [PATCH 009/109] Add handle==0 guard to AMDGPU stream_synchronize and make stream_ thread_local Mirror the CUDA fixes: guard stream_synchronize against handle==0 to avoid unintentional default stream sync, and make AMDGPUContext::stream_ thread_local for thread-safety. 
--- quadrants/program/program.cpp | 2 +- quadrants/rhi/amdgpu/amdgpu_context.cpp | 2 ++ quadrants/rhi/amdgpu/amdgpu_context.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index faac67970c..8bab1d30f7 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -532,7 +532,7 @@ void Program::stream_synchronize(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { AMDGPUDriver::get_instance().stream_synchronize( reinterpret_cast(stream_handle)); } diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index f940ed9a7c..24d924ed0d 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -13,6 +13,8 @@ namespace quadrants { namespace lang { +thread_local void *AMDGPUContext::stream_ = nullptr; + AMDGPUContext::AMDGPUContext() : driver_(AMDGPUDriver::get_instance_without_context()) { dev_count_ = 0; diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 68e7cd7314..4fc7c8328b 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -23,7 +23,7 @@ class AMDGPUContext { KernelProfilerBase *profiler_{nullptr}; AMDGPUDriver &driver_; bool debug_{false}; - void *stream_{nullptr}; + static thread_local void *stream_; std::vector kernel_arg_pointer_; public: From be7ad924c333a589f13bbbe34f2d9583649007f5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:29:24 -0700 Subject: [PATCH 010/109] Clear stream_parallel_group_id in ForLoopDecoratorRecorder::reset() Prevents stale group IDs from leaking if insert_for is called after a path that set a non-zero stream_parallel_group_id, matching the reset pattern of all other ForLoopConfig fields. 
--- quadrants/ir/frontend_ir.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index 693a7f461f..38226ca1b3 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -954,6 +954,7 @@ class ASTBuilder { config.mem_access_opt.clear(); config.block_dim = 0; config.strictly_serialized = false; + config.stream_parallel_group_id = 0; } }; From ce8328102ae0b18f0b29d661b4dc4026edf3c4a8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:29:36 -0700 Subject: [PATCH 011/109] Reject nested stream_parallel blocks Add an error check in begin_stream_parallel() to prevent nesting, which would produce undefined group ID semantics. --- quadrants/ir/frontend_ir.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quadrants/ir/frontend_ir.h b/quadrants/ir/frontend_ir.h index 38226ca1b3..46d7a3ec7a 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -1113,6 +1113,8 @@ class ASTBuilder { } void begin_stream_parallel() { + QD_ERROR_IF(current_stream_parallel_group_id_ != 0, + "Nested stream_parallel blocks are not supported"); current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; } From 880abc7e74cc8be0979d54747ff753929f00221d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 17:30:08 -0700 Subject: [PATCH 012/109] Document stream_parallel launcher design: per-launch streams, shared context safety Add comments explaining that streams are created/destroyed per launch (stream pooling as future optimization), and that RuntimeContext sharing across concurrent streams is safe because kernels only read from it. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ quadrants/runtime/cuda/kernel_launcher.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index f859bb116c..6abd0778ed 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -127,6 +127,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } + // Create one stream per unique group ID. Streams are created/destroyed + // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; @@ -137,6 +139,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } + // Launch tasks concurrently on their respective streams. The shared + // RuntimeContext is safe here: kernels only read from it (args/runtime + // pointers); result_buffer writes are to disjoint offsets per task. for (size_t j = group_start; j < i; j++) { auto &t = offloaded_tasks[j]; AMDGPUContext::get_instance().set_stream( diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 2e10226a13..9cf24915ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -159,6 +159,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } + // Create one stream per unique group ID. Streams are created/destroyed + // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; @@ -169,6 +171,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, } } + // Launch tasks concurrently on their respective streams. 
The shared + // RuntimeContext is safe here: kernels only read from it (args/runtime + // pointers); result_buffer writes are to disjoint offsets per task. for (size_t j = group_start; j < i; j++) { auto &t = offloaded_tasks[j]; CUDAContext::get_instance().set_stream( From b28e7c60901fdde76ff2b9ea153534f15a0050ac Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:23:15 -0700 Subject: [PATCH 013/109] Revert "Relax concurrency test threshold and log timings" This reverts commit 60d015bfddac7068d1d1067d8f059e9c3236447e. --- tests/python/test_streams.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 236578974d..073d383c2e 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -266,9 +266,8 @@ def add_first_two(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): assert np.isclose(a.to_numpy()[2], 12.0) speedup = serial_time / stream_time - print(f"serial={serial_time:.4f}s stream={stream_time:.4f}s speedup={speedup:.2f}x") if qd.lang.impl.current_cfg().arch in (qd.cuda, qd.amdgpu): - assert speedup > 1.3, f"Expected >1.3x speedup, got {speedup:.2f}x" + assert speedup > 1.5, f"Expected >1.5x speedup, got {speedup:.2f}x" else: assert speedup > 0.75, f"Expected >=0.75x (serial fallback), got {speedup:.2f}x" From e9f98c645671a2a9b5ee3cae915c31a852053cf4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:33:27 -0700 Subject: [PATCH 014/109] Add stream pool to reuse GPU streams across kernel launches Replace per-launch stream_create/stream_destroy with acquire_stream/ release_stream on CUDAContext and AMDGPUContext. Streams are cached in a pool and reused across invocations, avoiding the driver-level overhead of stream creation (~5-50us) on every kernel launch in hot loops. 
--- quadrants/rhi/amdgpu/amdgpu_context.h | 18 ++++++++++++++++++ quadrants/rhi/cuda/cuda_context.h | 19 +++++++++++++++++++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 ++------ quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++------ 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index 4fc7c8328b..dd99e4fd37 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -24,6 +24,7 @@ class AMDGPUContext { AMDGPUDriver &driver_; bool debug_{false}; static thread_local void *stream_; + std::vector stream_pool_; std::vector kernel_arg_pointer_; public: @@ -125,6 +126,23 @@ class AMDGPUContext { return stream_; } + void *acquire_stream() { + std::lock_guard _(lock_); + if (!stream_pool_.empty()) { + auto s = stream_pool_.back(); + stream_pool_.pop_back(); + return s; + } + void *s = nullptr; + AMDGPUDriver::get_instance().stream_create(&s, 0); + return s; + } + + void release_stream(void *s) { + std::lock_guard _(lock_); + stream_pool_.push_back(s); + } + static AMDGPUContext &get_instance(); }; diff --git a/quadrants/rhi/cuda/cuda_context.h b/quadrants/rhi/cuda/cuda_context.h index ba891644a7..b4a4809615 100644 --- a/quadrants/rhi/cuda/cuda_context.h +++ b/quadrants/rhi/cuda/cuda_context.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "quadrants/program/kernel_profiler.h" #include "quadrants/rhi/cuda/cuda_driver.h" @@ -31,6 +32,7 @@ class CUDAContext { bool debug_; bool supports_mem_pool_; static thread_local void *stream_; + std::vector stream_pool_; public: CUDAContext(); @@ -120,6 +122,23 @@ class CUDAContext { void *get_stream() const { return stream_; } + + void *acquire_stream() { + std::lock_guard _(lock_); + if (!stream_pool_.empty()) { + auto s = stream_pool_.back(); + stream_pool_.pop_back(); + return s; + } + void *s = nullptr; + CUDADriver::get_instance().stream_create(&s, 0); + return s; + } + + void 
release_stream(void *s) { + std::lock_guard _(lock_); + stream_pool_.push_back(s); + } }; } // namespace quadrants::lang diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 6abd0778ed..88a3570924 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -127,15 +127,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } - // Create one stream per unique group ID. Streams are created/destroyed - // per launch; a stream pool could reduce overhead for hot loops. std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { - void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0); - stream_by_id[sid] = s; + stream_by_id[sid] = AMDGPUContext::get_instance().acquire_stream(); } } @@ -155,7 +151,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, AMDGPUDriver::get_instance().stream_synchronize(s); } for (auto &[sid, s] : stream_by_id) { - AMDGPUDriver::get_instance().stream_destroy(s); + AMDGPUContext::get_instance().release_stream(s); } AMDGPUContext::get_instance().set_stream(active_stream); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 9cf24915ab..6743d7c291 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -159,15 +159,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, i++; } - // Create one stream per unique group ID. Streams are created/destroyed - // per launch; a stream pool could reduce overhead for hot loops. 
std::map stream_by_id; for (size_t j = group_start; j < i; j++) { int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { - void *s = nullptr; - CUDADriver::get_instance().stream_create(&s, 0); - stream_by_id[sid] = s; + stream_by_id[sid] = CUDAContext::get_instance().acquire_stream(); } } @@ -187,7 +183,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, CUDADriver::get_instance().stream_synchronize(s); } for (auto &[sid, s] : stream_by_id) { - CUDADriver::get_instance().stream_destroy(s); + CUDAContext::get_instance().release_stream(s); } CUDAContext::get_instance().set_stream(active_stream); From 65a7967ca88aa33f62b3de4411cd3f51f870ed5f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:37:11 -0700 Subject: [PATCH 015/109] Add test for stream pool reuse across repeated kernel launches Calls a stream_parallel kernel 5 times in a loop to verify that pooled streams are correctly reused with correct results each iteration. 
--- tests/python/test_streams.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 4c28b6f581..86568c4e17 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -419,3 +419,31 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def test_stream_pool_reuse(): + """Repeated stream_parallel invocations reuse pooled streams correctly.""" + N = 128 + a = qd.ndarray(qd.f32, shape=(N,)) + b = qd.ndarray(qd.f32, shape=(N,)) + + @qd.kernel + def parallel_fill( + x: qd.types.ndarray(dtype=qd.f32, ndim=1), + y: qd.types.ndarray(dtype=qd.f32, ndim=1), + val: qd.f32, + ): + with qd.stream_parallel(): + for i in range(N): + x[i] = val + with qd.stream_parallel(): + for i in range(N): + y[i] = val * 2.0 + + for iteration in range(5): + v = float(iteration + 1) + parallel_fill(a, b, v) + qd.sync() + assert np.allclose(a.to_numpy(), v), f"iteration {iteration}" + assert np.allclose(b.to_numpy(), v * 2.0), f"iteration {iteration}" From 5393d04c8d210edf8fe7d0301ae6f68e22e56b8f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 11 Mar 2026 18:51:33 -0700 Subject: [PATCH 016/109] Destroy pooled streams in CUDAContext and AMDGPUContext destructors --- quadrants/rhi/amdgpu/amdgpu_context.cpp | 4 ++++ quadrants/rhi/cuda/cuda_context.cpp | 11 ++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.cpp b/quadrants/rhi/amdgpu/amdgpu_context.cpp index 24d924ed0d..7163431e32 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_context.cpp @@ -204,6 +204,10 @@ void AMDGPUContext::launch(void *func, } AMDGPUContext::~AMDGPUContext() { + for (auto *s : stream_pool_) { + driver_.stream_destroy(s); + } + stream_pool_.clear(); if (context_) { driver_.device_primary_ctx_release(device_); } diff 
--git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index 23399649a9..286c4eb3ba 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -180,13 +180,10 @@ void CUDAContext::launch(void *func, } CUDAContext::~CUDAContext() { - // TODO: restore these? - /* - CUDADriver::get_instance().cuMemFree(context_buffer); - for (auto cudaModule: cudaModules) - CUDADriver::get_instance().cuModuleUnload(cudaModule); - CUDADriver::get_instance().cuCtxDestroy(context); - */ + for (auto *s : stream_pool_) { + driver_.stream_destroy(s); + } + stream_pool_.clear(); } CUDAContext &CUDAContext::get_instance() { From 9be110daf54838a2da4a430e254e25afdfb198e9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 17:24:53 -0700 Subject: [PATCH 017/109] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 28 ++++++++-------------- quadrants/rhi/cuda/cuda_context.cpp | 3 +-- quadrants/runtime/cuda/kernel_launcher.cpp | 3 +-- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a591fb8dba..ec5a9fa57d 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -474,8 +474,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -483,8 +482,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDADriver::get_instance().stream_synchronize( - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif 
} @@ -492,8 +490,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -502,8 +499,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *event = nullptr; - CUDADriver::get_instance().event_create(&event, - 0x02 /*CU_EVENT_DISABLE_TIMING*/); + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); } #endif @@ -513,8 +509,7 @@ uint64 Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -522,9 +517,8 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -532,8 +526,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -541,9 +534,8 @@ void Program::event_synchronize(uint64 
event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDADriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/cuda/cuda_context.cpp b/quadrants/rhi/cuda/cuda_context.cpp index a605d06c64..60553da9c7 100644 --- a/quadrants/rhi/cuda/cuda_context.cpp +++ b/quadrants/rhi/cuda/cuda_context.cpp @@ -13,8 +13,7 @@ namespace quadrants::lang { thread_local void *CUDAContext::stream_ = nullptr; -CUDAContext::CUDAContext() - : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { +CUDAContext::CUDAContext() : profiler_(nullptr), driver_(CUDADriver::get_instance_without_context()) { // CUDA initialization dev_count_ = 0; driver_.init(0); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 34905218f9..0c5d7e9458 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -173,8 +173,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - CUDADriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + CUDADriver::get_instance().mem_free_async(device_result_buffer, active_stream); // copy data back to host if (transfers.size() > 0) { CUDADriver::get_instance().stream_synchronize(active_stream); From 31fffbf1730e32c200eed37e8b4a4740ddc28b50 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 19:03:53 -0700 Subject: [PATCH 018/109] Apply clang-format Made-with: Cursor --- quadrants/program/program.cpp | 
28 +++++++------------ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 6 +--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 18 ++++++------ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 43e8df1236..648f3291c3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -491,8 +491,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_destroy( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } @@ -505,8 +504,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { - AMDGPUDriver::get_instance().stream_synchronize( - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } @@ -519,8 +517,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { - AMDGPUContext::get_instance().set_stream( - reinterpret_cast(stream_handle)); + AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } @@ -536,8 +533,7 @@ uint64 Program::event_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *event = nullptr; - AMDGPUDriver::get_instance().event_create(&event, - 0x02 /*hipEventDisableTiming*/); + AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); } #endif @@ -552,8 +548,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_destroy( - reinterpret_cast(event_handle)); 
+ AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } @@ -567,9 +562,8 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_record( - reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); + AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); } #endif } @@ -582,8 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().event_synchronize( - reinterpret_cast(event_handle)); + AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } @@ -597,9 +590,8 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { - AMDGPUDriver::get_instance().stream_wait_event( - reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); + AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); } #endif } diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 6a01c3a87a..6be39db108 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -62,11 +62,7 @@ PER_AMDGPU_FUNCTION(kernel_get_occupancy, hipOccupancyMaxActiveBlocksPerMultipro // Stream management PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); -PER_AMDGPU_FUNCTION(stream_wait_event, - hipStreamWaitEvent, - void *, - void *, - uint32); +PER_AMDGPU_FUNCTION(stream_wait_event, hipStreamWaitEvent, void *, void *, uint32); // Event management 
PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 1c5c573d85..cace0821ce 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -86,16 +86,16 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx device_ptrs[data_ptr_idx] = executor->get_device_alloc_info_ptr(devalloc); transfers[data_ptr_idx] = {data_ptr, devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[data_ptr_idx], data_ptr, arr_sz, + active_stream); if (grad_ptr != nullptr) { DeviceAllocation grad_devalloc = executor->allocate_memory_on_device(arr_sz, (uint64 *)device_result_buffer); device_ptrs[grad_ptr_idx] = executor->get_device_alloc_info_ptr(grad_devalloc); transfers[grad_ptr_idx] = {grad_ptr, grad_devalloc}; - AMDGPUDriver::get_instance().memcpy_host_to_device_async( - (void *)device_ptrs[grad_ptr_idx], grad_ptr, arr_sz, active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async((void *)device_ptrs[grad_ptr_idx], grad_ptr, + arr_sz, active_stream); } else { device_ptrs[grad_ptr_idx] = nullptr; } @@ -141,8 +141,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *context_pointer; int arg_size = sizeof(RuntimeContext *); AMDGPUDriver::get_instance().malloc_async((void **)&context_pointer, sizeof(RuntimeContext), active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), - sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), + active_stream); 
AMDGPUContext::get_instance().push_back_kernel_arg_pointer(context_pointer); @@ -154,15 +154,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } QD_TRACE("Launching kernel"); if (ctx.arg_buffer_size > 0) { - AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_arg_buffer, active_stream); } if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().memcpy_device_to_host_async(host_result_buffer, device_result_buffer, ctx.result_buffer_size, active_stream); } - AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, - active_stream); + AMDGPUDriver::get_instance().mem_free_async(device_result_buffer, active_stream); if (transfers.size()) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From e9ce144a2302c55b097f61148bae2385808e8d5c Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 19 Apr 2026 20:33:32 -0700 Subject: [PATCH 019/109] Apply clang-format Made-with: Cursor --- quadrants/codegen/llvm/llvm_compiled_data.h | 6 +----- quadrants/ir/frontend_ir.h | 3 +-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/quadrants/codegen/llvm/llvm_compiled_data.h b/quadrants/codegen/llvm/llvm_compiled_data.h index 4ed2e69abc..ba7b74e674 100644 --- a/quadrants/codegen/llvm/llvm_compiled_data.h +++ b/quadrants/codegen/llvm/llvm_compiled_data.h @@ -26,11 +26,7 @@ class OffloadedTask { grid_dim(grid_dim), dynamic_shared_array_bytes(dynamic_shared_array_bytes), stream_parallel_group_id(stream_parallel_group_id) {}; - QD_IO_DEF(name, - block_dim, - grid_dim, - dynamic_shared_array_bytes, - stream_parallel_group_id); + QD_IO_DEF(name, block_dim, grid_dim, dynamic_shared_array_bytes, stream_parallel_group_id); }; struct LLVMCompiledTask { diff --git a/quadrants/ir/frontend_ir.h 
b/quadrants/ir/frontend_ir.h index 0ceed57772..b4ad04a9b5 100644 --- a/quadrants/ir/frontend_ir.h +++ b/quadrants/ir/frontend_ir.h @@ -1028,8 +1028,7 @@ class ASTBuilder { } void begin_stream_parallel() { - QD_ERROR_IF(current_stream_parallel_group_id_ != 0, - "Nested stream_parallel blocks are not supported"); + QD_ERROR_IF(current_stream_parallel_group_id_ != 0, "Nested stream_parallel blocks are not supported"); current_stream_parallel_group_id_ = ++stream_parallel_group_counter_; } diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 83df04490f..57659a5cfa 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -44,8 +44,8 @@ void KernelLauncher::launch_offloaded_tasks(JITModule *amdgpu_module, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, + {arg_size}); } for (auto &[sid, s] : stream_by_id) { From d3cae3cbaa1a3ffd832a30320fa59c5af753e595 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 02:18:49 -0700 Subject: [PATCH 020/109] [Test] Exclude flaky test_perf_dispatch_python from Vulkan The pure-Python perf dispatch test is timing-sensitive and unreliable on the Vulkan software renderer in CI. The kernel variant of the same test still covers perf dispatch on Vulkan. 
Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..b533105c42 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From 798f87a18139fb8799d9b1d91135b2f6b8066a8d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 04:55:29 -0700 Subject: [PATCH 021/109] Exclude flaky test_perf_dispatch_python from Metal and Vulkan The pure-Python perf_dispatch timing test is unreliable on Mac Metal and Vulkan (MoltenVK) where timing differences between implementations are too small to consistently pick the fastest one. Made-with: Cursor --- tests/python/test_perf_dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_perf_dispatch.py b/tests/python/test_perf_dispatch.py index eaef03d99f..2de074ed3c 100644 --- a/tests/python/test_perf_dispatch.py +++ b/tests/python/test_perf_dispatch.py @@ -109,7 +109,7 @@ def my_func1_impl_a_shape0_ge_2( assert len(speed_checker._trial_count_by_dispatch_impl_by_geometry_hash[geometry]) == 2 -@test_utils.test() +@test_utils.test(exclude=[qd.metal, qd.vulkan]) def test_perf_dispatch_python() -> None: WARMUP = 3 From cd5b486beab0fc878652d4d2d043f44f1bd58e12 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:35:22 -0700 Subject: [PATCH 022/109] [Doc] Add user guide for streams API --- docs/source/user_guide/index.md | 1 + docs/source/user_guide/streams.md | 145 ++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 docs/source/user_guide/streams.md diff --git a/docs/source/user_guide/index.md 
b/docs/source/user_guide/index.md index 05a5dfc434..7775e56f0e 100644 --- a/docs/source/user_guide/index.md +++ b/docs/source/user_guide/index.md @@ -54,6 +54,7 @@ tile16 :titlesonly: graph +streams perf_dispatch ``` diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md new file mode 100644 index 0000000000..0a610fd217 --- /dev/null +++ b/docs/source/user_guide/streams.md @@ -0,0 +1,145 @@ +# Streams + +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default +stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently +and control synchronization with events. + +## Supported platforms + +| Backend | Streams | Events | Notes | +|---------|---------|--------|-------| +| CUDA | Yes | Yes | Full concurrent execution | +| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | + +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle +`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. + +## Creating and using streams + +```python +import quadrants as qd + +qd.init(arch=qd.cuda) + +N = 1024 +a = qd.field(qd.f32, shape=(N,)) +b = qd.field(qd.f32, shape=(N,)) + +@qd.kernel +def fill_a(): + for i in range(N): + a[i] = 1.0 + +@qd.kernel +def fill_b(): + for i in range(N): + b[i] = 2.0 + +s1 = qd.create_stream() +s2 = qd.create_stream() + +fill_a(qd_stream=s1) +fill_b(qd_stream=s2) + +s1.synchronize() +s2.synchronize() + +s1.destroy() +s2.destroy() +``` + +Pass `qd_stream=` to any kernel call to launch it on that stream. 
Kernels on different streams may execute +concurrently. Call `synchronize()` to block until all work on a stream completes. + +## Events + +Events let you express dependencies between streams without full synchronization. + +```python +s1 = qd.create_stream() +s2 = qd.create_stream() + +@qd.kernel +def produce(): + for i in range(N): + a[i] = 10.0 + +@qd.kernel +def consume(): + for i in range(N): + b[i] = a[i] + +produce(qd_stream=s1) + +e = qd.create_event() +e.record(s1) # record when s1 finishes produce() +e.wait(qd_stream=s2) # s2 waits for that event before proceeding + +consume(qd_stream=s2) # safe to read a[] — produce() is guaranteed complete +s2.synchronize() + +e.destroy() +s1.destroy() +s2.destroy() +``` + +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait +until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. + +## Context managers + +Streams and events support `with` blocks for automatic cleanup: + +```python +with qd.create_stream() as s: + fill_a(qd_stream=s) + s.synchronize() +# s.destroy() called automatically +``` + +## PyTorch interop (CUDA) + +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to +avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different +streams with no ordering guarantees, leading to intermittent data corruption. + +### Running Quadrants kernels on PyTorch's stream + +```python +import torch +from quadrants.lang.stream import Stream + +torch_stream_ptr = torch.cuda.current_stream().cuda_stream +stream = Stream(torch_stream_ptr) + +physics_kernel(qd_stream=stream) +observations = compute_obs_tensor() # PyTorch op on the same stream +apply_actions_kernel(qd_stream=stream) +``` + +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this +wrapper — PyTorch owns the underlying stream. 
+ +### Running PyTorch operations on a Quadrants stream + +```python +qd_stream = qd.create_stream() +torch_stream = torch.cuda.ExternalStream(qd_stream.handle) + +with torch.cuda.stream(torch_stream): + physics_kernel(qd_stream=qd_stream) + observations = compute_obs_tensor() + apply_actions_kernel(qd_stream=qd_stream) + +qd_stream.destroy() +``` + +`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly. + +## Limitations + +- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one + stream's output is another stream's input. From 22389690c487e1bc05da15ed213b7e2f7bb0d7ed Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:43:36 -0700 Subject: [PATCH 023/109] [Doc] Update streams doc with AMDGPU support --- docs/source/user_guide/streams.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..cd26e01d20 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -9,6 +9,7 @@ and control synchronization with events. 
| Backend | Streams | Events | Notes | |---------|---------|--------|-------| | CUDA | Yes | Yes | Full concurrent execution | +| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | | CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | From 8cd793c888fec5815aa5b7d04361aad251da5268 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 08:51:10 -0700 Subject: [PATCH 024/109] [Doc] Add stream_parallel() section to streams user guide --- docs/source/user_guide/streams.md | 78 +++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index cd26e01d20..b9a2f5798e 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -1,23 +1,26 @@ # Streams Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default -stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently -and control synchronization with events. +stream, which serializes everything. With streams, you can run multiple top-level for loops in parallel. 
## Supported platforms -| Backend | Streams | Events | Notes | -|---------|---------|--------|-------| -| CUDA | Yes | Yes | Full concurrent execution | -| AMDGPU | Yes | Yes | Full concurrent execution (requires ROCm >= 5.4) | -| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | +| Backend | Supported | +|---------|-----------| +| CUDA | Yes | +| AMDGPU | Yes | +| CPU | No-op | +| Metal | No-op | +| Vulkan | No-op | -On backends without native stream support, `create_stream()` and `create_event()` return objects with handle -`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. +On backends without native stream support, stream operations are no-ops and for loops run serially. Code using +streams is portable across all backends — it will run without modifications, but serially. -## Creating and using streams +## Stream parallelism + +Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. The runtime +creates temporary streams, launches the for loops, and synchronizes automatically before the next +non-parallel statement. 
```python import quadrants as qd @@ -27,17 +30,43 @@ qd.init(arch=qd.cuda) N = 1024 a = qd.field(qd.f32, shape=(N,)) b = qd.field(qd.f32, shape=(N,)) +c = qd.field(qd.f32, shape=(N,)) @qd.kernel -def fill_a(): - for i in range(N): - a[i] = 1.0 +def compute_ab(): + with qd.stream_parallel(): + for i in range(N): + a[i] = compute_a(i) + with qd.stream_parallel(): + for j in range(N): + b[j] = compute_b(j) @qd.kernel -def fill_b(): +def combine(): for i in range(N): - b[i] = 2.0 + c[i] = a[i] + b[i] + +compute_ab() # the two stream_parallel blocks run concurrently +combine() # runs after compute_ab() returns — a[] and b[] are ready +``` + +Consecutive `with qd.stream_parallel():` blocks run concurrently. Multiple for loops within a single block +share a stream and run serially on it. All streams are synchronized before the kernel returns. + +### Restrictions +- All top-level statements in a kernel must be either all `stream_parallel` blocks or all regular statements. + Mixing the two at the top level is a compile-time error. +- Nesting `stream_parallel` blocks is not supported. + +## Explicit streams + +For cases that require manual control — such as launching separate kernels on different streams or +interoperating with PyTorch — you can create and manage streams directly. + +### Creating and using streams + +```python s1 = qd.create_stream() s2 = qd.create_stream() @@ -54,7 +83,7 @@ s2.destroy() Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. -## Events +### Events Events let you express dependencies between streams without full synchronization. @@ -89,7 +118,7 @@ s2.destroy() `e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. 
-## Context managers +### Context managers Streams and events support `with` blocks for automatic cleanup: @@ -100,13 +129,13 @@ with qd.create_stream() as s: # s.destroy() called automatically ``` -## PyTorch interop (CUDA) +### PyTorch interop (CUDA) When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. -### Running Quadrants kernels on PyTorch's stream +#### Running Quadrants kernels on PyTorch's stream ```python import torch @@ -123,7 +152,7 @@ apply_actions_kernel(qd_stream=stream) Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. -### Running PyTorch operations on a Quadrants stream +#### Running PyTorch operations on a Quadrants stream ```python qd_stream = qd.create_stream() @@ -142,5 +171,6 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one - stream's output is another stream's input. +- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for + inserting events or `synchronize()` calls when one stream's output is another stream's input. + `stream_parallel` handles this automatically. 
From 08b85d5bd8df98d16d337cca55468af82eecb5c4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 09:07:21 -0700 Subject: [PATCH 025/109] [Doc] Note stream pooling in streams user guide --- docs/source/user_guide/streams.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index b9a2f5798e..87d662a045 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -18,9 +18,7 @@ streams is portable across all backends — it will run without modifications, b ## Stream parallelism -Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. The runtime -creates temporary streams, launches the for loops, and synchronizes automatically before the next -non-parallel statement. +Inside a `@qd.kernel`, each `with qd.stream_parallel():` block runs on its own GPU stream. ```python import quadrants as qd From f2a2596c577235d796fa810a969d902e5dfe7016 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:11:04 -0700 Subject: [PATCH 026/109] Reflow stream.py docstrings to 120c line width --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 8f6cfab3d6..5e54b227cd 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -10,8 +10,8 @@ def _get_prog_weakref(): class Stream: """Wraps a backend-specific GPU stream for concurrent kernel execution. - On backends without native streams (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. + On backends without native streams (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. 
""" def __init__(self, handle: int, prog_ref: weakref.ref | None = None): @@ -54,8 +54,8 @@ def __exit__(self, *args): class Event: """Wraps a backend-specific GPU event for stream synchronization. - On backends without native events (e.g. CPU), this is a no-op object. - Call destroy() explicitly or use as a context manager to ensure cleanup. + On backends without native events (e.g. CPU), this is a no-op object. Call destroy() explicitly or use as + a context manager to ensure cleanup. """ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): From de99f3efb295525d5ef1c80b30dc0b0007c97290 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 28 Apr 2026 11:15:31 -0700 Subject: [PATCH 027/109] Unwrap prose lines in streams.md to match repo doc style --- docs/source/user_guide/streams.md | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0a610fd217..0fb2627c0c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -1,8 +1,6 @@ # Streams -Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default -stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently -and control synchronization with events. +Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently and control synchronization with events. ## Supported platforms @@ -13,8 +11,7 @@ and control synchronization with events. 
| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | | Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially | -On backends without native stream support, `create_stream()` and `create_event()` return objects with handle -`0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. +On backends without native stream support, `create_stream()` and `create_event()` return objects with handle `0`. All stream/event operations become no-ops and kernels run serially. Code written with streams is portable across all backends in the sense that it will run without modifications, but serially. ## Creating and using streams @@ -50,8 +47,7 @@ s1.destroy() s2.destroy() ``` -Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute -concurrently. Call `synchronize()` to block until all work on a stream completes. +Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. ## Events @@ -85,8 +81,7 @@ s1.destroy() s2.destroy() ``` -`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait -until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. +`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits. ## Context managers @@ -101,9 +96,7 @@ with qd.create_stream() as s: ## PyTorch interop (CUDA) -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to -avoid race conditions. 
Without explicit stream management, Quadrants and PyTorch may launch work on different -streams with no ordering guarantees, leading to intermittent data corruption. +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. ### Running Quadrants kernels on PyTorch's stream @@ -119,8 +112,7 @@ observations = compute_obs_tensor() # PyTorch op on the same stream apply_actions_kernel(qd_stream=stream) ``` -Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this -wrapper — PyTorch owns the underlying stream. +Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. ### Running PyTorch operations on a Quadrants stream @@ -141,5 +133,4 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one - stream's output is another stream's input. +- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. From 401d6f81f0641c73118e1356feb9b87c3480e4f1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:13 -0700 Subject: [PATCH 028/109] Use CU_STREAM_NON_BLOCKING for user-created streams Streams created with CU_STREAM_DEFAULT (flag 0) implicitly synchronize with the legacy NULL stream, defeating concurrent execution when any code path (including the kernel launcher's sizer-context block) posts work on the NULL stream. 
Switch to CU_STREAM_NON_BLOCKING (0x1) to match PyTorch/JAX/CuPy conventions and deliver the concurrency the stream API promises. --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 83adc99627..a38ddd0dbb 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -498,7 +498,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0 /*flags*/); + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From a3c98f8da17148524f73d6c5faf348337cd7e8a9 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:49:40 -0700 Subject: [PATCH 029/109] Use async DtoH memcpy on active_stream for external array readback The post-kernel readback of host-backed external arrays used synchronous cuMemcpyDtoH which implicitly serializes through the NULL stream, defeating stream isolation. Switch to memcpy_device_to_host_async on active_stream with a scoped stream_synchronize, consistent with the HtoD direction already converted in this branch. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index de6bab83e6..8a33bf0b61 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,8 +253,12 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id]); + CUDADriver::get_instance().memcpy_device_to_host_async( + itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + CUDADriver::get_instance().stream_synchronize(active_stream); + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } From ca14f6753f35b3feedb4fd2f84e3ca0d3475a1e3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:50:01 -0700 Subject: [PATCH 030/109] Guard destroy()/__exit__ against destroying externally-owned handles Stream.__del__ already checks self._prog_ref is not None to avoid destroying handles wrapping external streams (e.g. PyTorch), but destroy() and __exit__ did not. A user doing `with Stream(torch_stream_ptr): ...` would destroy the PyTorch stream on block exit. Add the same ownership guard to destroy() for both Stream and Event. 
--- python/quadrants/lang/stream.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5e54b227cd..063a2aeafc 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -28,8 +28,11 @@ def synchronize(self): prog.stream_synchronize(self._handle) def destroy(self): - """Explicitly destroy the stream. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the stream. Safe to call multiple times. + + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). + """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.stream_destroy(self._handle) self._handle = 0 @@ -84,8 +87,11 @@ def synchronize(self): prog.event_synchronize(self._handle) def destroy(self): - """Explicitly destroy the event. Safe to call multiple times.""" - if self._handle != 0: + """Explicitly destroy the event. Safe to call multiple times. + + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
+ """ + if self._handle != 0 and self._prog_ref is not None: prog = impl.get_runtime().prog prog.event_destroy(self._handle) self._handle = 0 From b46de06b5c0cc9892cadac4b22812f72d80522d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 03:58:15 -0700 Subject: [PATCH 031/109] Fix clang-format indentation for memcpy_device_to_host_async --- quadrants/runtime/cuda/kernel_launcher.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 8a33bf0b61..f3f48ab21e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -253,9 +253,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - CUDADriver::get_instance().memcpy_device_to_host_async( - itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[idx.arg_id], active_stream); + CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); } CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { From 8efd51f116d3825d152ee67bfbb2430a5ee25d6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:04:18 -0700 Subject: [PATCH 032/109] Address review comments: fix AMDGPU stream issues - Fix stream_synchronize(nullptr) in do-while loop to sync active stream, mirroring the CUDA path (claude red) - Remove unused kernel_arg_pointer_ member from AMDGPUContext (claude yellow) - Reword misleading ROCm fallback comment to clarify it's per-device, not per-runtime-version (claude yellow) - Fix stream_create ABI: bind to hipStreamCreateWithFlags instead of hipStreamCreate to match the 
two-arg call signature (codex P2) --- quadrants/rhi/amdgpu/amdgpu_context.h | 1 - quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index b9fd5c403c..083406c3f9 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -25,7 +25,6 @@ class AMDGPUContext { bool debug_{false}; bool supports_mem_pool_{false}; static thread_local void *stream_; - std::vector kernel_arg_pointer_; public: AMDGPUContext(); diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index b6a4d7ba3e..d91afcac00 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -16,7 +16,7 @@ PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); // Stream management -PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); +PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); // Memory management @@ -29,7 +29,7 @@ PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, voi PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); // hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// transparently fall back to the synchronous variants when unsupported. +// fall back to the synchronous variants on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 5bb5e70194..d54331f237 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -71,7 +71,8 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & do { launch_offloaded_tasks(ctx, amdgpu_module, offloaded_tasks, context_pointer, arg_size); counter_val = 0; - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + auto *stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().stream_synchronize(stream); AMDGPUDriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From b9eef6e844a6940848a0f4a52c9f5820ef69e388 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:11 -0700 Subject: [PATCH 033/109] Use async DtoH on active_stream for do-while loop counter readback The do-while loop counter readback in launch_offloaded_tasks_with_do_while used synchronous cuMemcpyDtoH which serializes through the NULL stream, defeating stream isolation on every loop iteration. Switch to async memcpy on the active stream followed by stream_synchronize, matching the pattern used elsewhere in the launcher. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index f3f48ab21e..a1ccc470ab 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -71,8 +71,9 @@ void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder & launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); counter_val = 0; auto *stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&counter_val, ctx.graph_do_while_flag_dev_ptr, + sizeof(int32_t), stream); CUDADriver::get_instance().stream_synchronize(stream); - CUDADriver::get_instance().memcpy_device_to_host(&counter_val, ctx.graph_do_while_flag_dev_ptr, sizeof(int32_t)); } while (counter_val != 0); } From f0dd7d6acb648aef15f8bb726ac86a0d0bca9d05 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:07:30 -0700 Subject: [PATCH 034/109] Use active_stream for sizer device context staging The needs_sizer_device_ctx block (malloc_async, memcpy_host_to_device_async, mem_free_async) was using nullptr (NULL stream) while the consuming sizer kernel runs on active_stream. With non-blocking streams (e.g. wrapped PyTorch streams), there is no implicit ordering between them, creating a race where the sizer kernel could read stale or freed memory. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a1ccc470ab..ca27b78dd1 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -227,9 +227,9 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx needs_sizer_device_ctx = needs_sizer_device_ctx && !CUDAContext::get_instance().supports_pageable_memory_access(); void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { - CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), nullptr); + CUDADriver::get_instance().malloc_async(&device_context_ptr, sizeof(RuntimeContext), active_stream); CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), - sizeof(RuntimeContext), nullptr); + sizeof(RuntimeContext), active_stream); } if (ctx.graph_do_while_arg_id >= 0) { @@ -239,7 +239,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx launch_offloaded_tasks(ctx, cuda_module, offloaded_tasks, device_context_ptr); } if (needs_sizer_device_ctx) { - CUDADriver::get_instance().mem_free_async(device_context_ptr, nullptr); + CUDADriver::get_instance().mem_free_async(device_context_ptr, active_stream); } if (ctx.arg_buffer_size > 0) { CUDADriver::get_instance().mem_free_async(device_arg_buffer, active_stream); From 8b3d4ed5f513603e1c3066090576cc0d90742329 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:08:04 -0700 Subject: [PATCH 035/109] Add make_current() to stream/event Program methods All other CUDA entry points (kernel_launcher, jit_cuda, graph_manager) call CUDAContext::get_instance().make_current() to bind the primary context on the calling thread. 
The new stream/event methods skipped this, which would cause CUDA_ERROR_INVALID_CONTEXT if called from a thread other than the qd.init thread. --- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index a38ddd0dbb..5abcd255b3 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -497,6 +497,7 @@ void Program::enqueue_compute_op_lambda(std::function(stream); @@ -508,6 +509,7 @@ uint64 Program::stream_create() { void Program::stream_destroy(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -516,6 +518,7 @@ void Program::stream_destroy(uint64 stream_handle) { void Program::stream_synchronize(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -524,6 +527,7 @@ void Program::stream_synchronize(uint64 stream_handle) { void Program::set_current_cuda_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -532,6 +536,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { uint64 Program::event_create() { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); return reinterpret_cast(event); @@ -543,6 +548,7 @@ uint64 Program::event_create() { void Program::event_destroy(uint64 event_handle) { #ifdef QD_WITH_CUDA if 
(compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -551,6 +557,7 @@ void Program::event_destroy(uint64 event_handle) { void Program::event_record(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -560,6 +567,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { void Program::event_synchronize(uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -568,6 +576,7 @@ void Program::event_synchronize(uint64 event_handle) { void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From 34e9fa6aa47672ad4a59d2d2d4e952b1aec66698 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:14:01 -0700 Subject: [PATCH 036/109] Use HIP_STREAM_NON_BLOCKING for AMDGPU stream_create to mirror CUDA path --- quadrants/program/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 36c27942d0..f3fdeef548 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,7 +510,7 @@ uint64 Program::stream_create() { #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { void *stream = nullptr; - 
AMDGPUDriver::get_instance().stream_create(&stream, 0 /*flags*/); + AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); } #endif From 3b0ba294ace8518f70f8cb0516787bc9651ed644 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:18:35 -0700 Subject: [PATCH 037/109] Restore deleted comments, fix docstring wrapping, fix per-task adstack publish in stream-parallel loop - Restore the deleted comments explaining why device_context_ptr is passed to publish_adstack_metadata (CUDA_ERROR_ILLEGAL_ADDRESS / hipErrorIllegalAddress on non-HMM GPUs). - Reflow stream.py docstring to 120-char wrap. - Move publish_adstack_metadata into the inner per-task loop for stream-parallel dispatch so each task gets its own adstack metadata published before launch (fixes latent bug for autodiff kernels). --- python/quadrants/lang/stream.py | 5 ++--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 11 ++++++++--- quadrants/runtime/cuda/kernel_launcher.cpp | 11 +++++++++-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 721d989109..395cc9d25c 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -132,9 +132,8 @@ def create_event() -> Event: def stream_parallel(): """Run top-level for loops in this block on separate GPU streams. - Used inside @qd.kernel. At Python runtime (outside kernels), this is a - no-op. During kernel compilation, the AST transformer calls into the C++ - ASTBuilder to tag loops with a stream-parallel group ID. + Used inside @qd.kernel. At Python runtime (outside kernels), this is a no-op. During kernel compilation, the AST + transformer calls into the C++ ASTBuilder to tag loops with a stream-parallel group ID. 
""" yield diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c71f8fa85..fa053b74b5 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -55,8 +55,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, auto *active_stream = AMDGPUContext::get_instance().get_stream(); for (size_t i = 0; i < offloaded_tasks.size();) { const auto &task = offloaded_tasks[i]; - executor->publish_adstack_metadata(task.ad_stack, resolve_num_threads(task, executor), &ctx, context_pointer); if (task.stream_parallel_group_id == 0) { + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without this the + // sizer launches with a host pointer and the next DtoH sync trips + // `hipErrorIllegalAddress ... memcpy_device_to_host` because HIP has no UVA fallback for the host + // `RuntimeContext` struct. + executor->publish_adstack_metadata(task.ad_stack, resolve_num_threads(task, executor), &ctx, context_pointer); QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, task.block_dim); amdgpu_module->launch(task.name, task.grid_dim, task.block_dim, task.dynamic_shared_array_bytes, {(void *)&context_pointer}, {arg_size}); @@ -79,9 +83,10 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; + executor->publish_adstack_metadata(t.ad_stack, resolve_num_threads(t, executor), &ctx, context_pointer); AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, - {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); } for (auto &[sid, s] : stream_by_id) { diff --git 
a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a3e97e3a26..ac0ccc8896 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -54,9 +54,14 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, auto *active_stream = CUDAContext::get_instance().get_stream(); for (size_t i = 0; i < offloaded_tasks.size();) { const auto &task = offloaded_tasks[i]; - std::size_t n = resolve_num_threads(task.ad_stack, executor); - executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.stream_parallel_group_id == 0) { + std::size_t n = resolve_num_threads(task.ad_stack, executor); + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without it the sizer + // launches with a host pointer and the next DtoH sync trips `CUDA_ERROR_ILLEGAL_ADDRESS ... + // memcpy_device_to_host` on GPUs whose driver + kernel cannot coherently access pageable host memory (the HMM + // capability gated below in `launch_llvm_kernel`). `nullptr` on HMM-capable setups keeps + // `publish_adstack_metadata`'s host-pointer fast path. 
+ executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); QD_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, task.block_dim); cuda_module->launch(task.name, task.grid_dim, task.block_dim, task.dynamic_shared_array_bytes, {&ctx.get_context()}, {}); @@ -79,6 +84,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, for (size_t j = group_start; j < i; j++) { const auto &t = offloaded_tasks[j]; + std::size_t n_t = resolve_num_threads(t.ad_stack, executor); + executor->publish_adstack_metadata(t.ad_stack, n_t, &ctx, device_context_ptr); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); cuda_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, {}); } From 1c62eaecb93bca645a2f80cf40a3ee0ff849dead Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:22:05 -0700 Subject: [PATCH 038/109] Fix clang-format line break in AMDGPU kernel launcher --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index fa053b74b5..43664a68da 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -85,8 +85,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const auto &t = offloaded_tasks[j]; executor->publish_adstack_metadata(t.ad_stack, resolve_num_threads(t, executor), &ctx, context_pointer); AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); + amdgpu_module->launch(t.name, t.grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {(void *)&context_pointer}, + {arg_size}); } for (auto &[sid, s] : stream_by_id) { From 
162239e38cbd9ce3fcd1365181c1f3470be194d8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:35:34 -0700 Subject: [PATCH 039/109] Use active stream for AMDGPU adstack metadata copies in publish_adstack_metadata AMDGPUContext::launch now dispatches on the user stream, so the adstack H2D copies must target the same stream to maintain ordering. Mirrors the CUDA branch. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..bc319f9c38 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -851,11 +851,10 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. the default stream), so the copies match that. + // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. Both CUDA and AMDGPU + // fetch the active stream from their respective context singletons (configurable via `set_stream`, + // defaults to the null stream), matching the stream used by `CUDAContext::launch` / + // `AMDGPUContext::launch`. 
#if defined(QD_WITH_CUDA) if (config_.arch == Arch::cuda) { void *active_stream = CUDAContext::get_instance().get_stream(); @@ -869,7 +868,7 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf #endif #if defined(QD_WITH_AMDGPU) if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. + void *active_stream = AMDGPUContext::get_instance().get_stream(); AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, active_stream); AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, From 216f7d53d91af16c033410a73be767332cf5625b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:42:54 -0700 Subject: [PATCH 040/109] Address Claude review: reject stream_parallel in @qd.func, use non-blocking streams - Reject qd.stream_parallel() inside @qd.func with a clear error; it's only valid in @qd.kernel. - Use CU_STREAM_NON_BLOCKING (0x1) for internal stream-parallel streams, matching the convention in Program::stream_create. Blocking streams (flag 0) serialize with the legacy NULL stream, defeating the purpose of parallel dispatch. 
--- python/quadrants/lang/ast/ast_transformer.py | 2 ++ quadrants/runtime/amdgpu/kernel_launcher.cpp | 2 +- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 99e10bc4f9..b5b78455c6 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -1541,6 +1541,8 @@ def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") + if not ctx.is_kernel: + raise QuadrantsSyntaxError("qd.stream_parallel() can only be used inside @qd.kernel, not @qd.func") ctx.ast_builder.begin_stream_parallel() build_stmts(ctx, node.body) ctx.ast_builder.end_stream_parallel() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 43664a68da..1da2ec5b0a 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -76,7 +76,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0); + AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*HIP_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ac0ccc8896..b11d3a334b 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -77,7 +77,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid =
offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - CUDADriver::get_instance().stream_create(&s, 0); + CUDADriver::get_instance().stream_create(&s, 0x1 /*CU_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } From 9334efd4f102def5c5458e7ccd0a99f63e80d63e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:46:42 -0700 Subject: [PATCH 041/109] Add make_current() to all AMDGPU stream/event Program methods Mirrors commit 8b3d4ed from the CUDA path: HIP uses the same primary-context-per-thread model, so calling these methods from a non-init thread requires make_current() to bind the context first. --- quadrants/program/program.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 89972bdf6f..2c9e57e378 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -510,6 +510,7 @@ uint64 Program::stream_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); return reinterpret_cast(stream); @@ -527,6 +528,7 @@ void Program::stream_destroy(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif @@ -541,6 +543,7 @@ void Program::stream_synchronize(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif @@ -555,6 +558,7 @@ void Program::set_current_cuda_stream(uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if 
(compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif @@ -571,6 +575,7 @@ uint64 Program::event_create() { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu) { + AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); return reinterpret_cast(event); @@ -588,6 +593,7 @@ void Program::event_destroy(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif @@ -603,6 +609,7 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); } @@ -618,6 +625,7 @@ void Program::event_synchronize(uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif @@ -633,6 +641,7 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif #ifdef QD_WITH_AMDGPU if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); } From aa4a70f91983d26fed7c73a380d5a13646997ed2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:52:32 -0700 Subject: [PATCH 042/109] Use async DtoH on active_stream for 
resolve_num_threads readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve_num_threads reads dynamic range_for begin/end from device temporaries via synchronous cuMemcpyDtoH (NULL stream). With CU_STREAM_NON_BLOCKING user streams, the prep task's store on active_stream has no ordering with the NULL stream, so the readback can return stale values — leading to wrong adstack sizing and either CUDA_ERROR_ILLEGAL_ADDRESS or silent gradient corruption. Switch to async memcpy on active_stream + stream_synchronize, matching the pattern used at all other DtoH sites in the launcher. --- quadrants/runtime/cuda/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index ca27b78dd1..005ad480e9 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -23,15 +23,17 @@ std::size_t resolve_num_threads(const AdStackSizingInfo &info, LlvmRuntimeExecut std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = CUDAContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - CUDADriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + CUDADriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + 
CUDADriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()` (`block_idx * block_dim + thread_idx`), so only `static_num_threads = grid_dim * From 1fba4f56f6a0a2a276ffb7bd23c2d8a6374fde6b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 04:57:54 -0700 Subject: [PATCH 043/109] Use async DtoH on active_stream for AMDGPU resolve_num_threads readback Mirrors aa4a70f from the CUDA path: with non-blocking user streams, synchronous DtoH on the NULL stream has no ordering with the prep task's store on active_stream, risking stale begin/end values. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index d54331f237..bb19087586 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -25,15 +25,17 @@ std::size_t resolve_num_threads(const OffloadedTask &task, LlvmRuntimeExecutor * std::int32_t begin = info.begin_const_value; std::int32_t end = info.end_const_value; if (info.begin_offset_bytes >= 0 || info.end_offset_bytes >= 0) { + auto *active_stream = AMDGPUContext::get_instance().get_stream(); auto *temp_dev_ptr = reinterpret_cast(executor->get_runtime_temporaries_device_ptr()); if (info.begin_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&begin, temp_dev_ptr + info.begin_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&begin, temp_dev_ptr + info.begin_offset_bytes, + sizeof(std::int32_t), active_stream); } if (info.end_offset_bytes >= 0) { - AMDGPUDriver::get_instance().memcpy_device_to_host(&end, temp_dev_ptr + info.end_offset_bytes, - sizeof(std::int32_t)); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&end, temp_dev_ptr + 
info.end_offset_bytes, + sizeof(std::int32_t), active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } // Clamp the logical iteration count to the launched thread count: adstack slices are indexed by // `linear_thread_idx()`, so only `static_num_threads = grid_dim * block_dim` slices can be touched From 74604f2753aa778a200d65fe1d882f9c32f6f096 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:03:57 -0700 Subject: [PATCH 044/109] Allow docstrings in stream_parallel kernels, merge base branch updates The stream_parallel exclusivity validation now skips docstrings (bare string expressions at body[0]), so kernels with docstrings don't get falsely rejected. Also applied style cleanup from earlier review (use `if not any(...)` pattern). --- .../ast/ast_transformers/function_def_transformer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 135b702d6f..d6b64b5080 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -464,12 +464,17 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return False return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + @staticmethod + def _is_docstring(stmt: ast.stmt, index: int) -> bool: + return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) + @staticmethod def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: - has_sp = any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body) - if not has_sp: + if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): return - for stmt in body: + for i, stmt 
in enumerate(body): + if FunctionDefTransformer._is_docstring(stmt, i): + continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 5901a7fc83e7b17a3f6d580449b14db952ccf5d2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:13:32 -0700 Subject: [PATCH 045/109] Sync active_stream at end of launch_llvm_kernel unconditionally The result-buffer DtoH and mem_free_async are queued on active_stream, but stream_synchronize only ran inside the transfers.size() > 0 branch. For the ndarray/CUDA-tensor path (transfers empty), the launcher returned with the DtoH still in flight on a CU_STREAM_NON_BLOCKING stream. The post-launcher cuStreamSynchronize(NULL) in runtime_ops.sync does not drain non-blocking streams, so fetch_ret_impl could read stale bytes. Move the sync to the end of the function unconditionally. --- quadrants/runtime/cuda/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 005ad480e9..d6931da87a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } + CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From f89bde02c5497856745bc93dd73fd2825ad2d489 Mon Sep 17 00:00:00 2001 From: Hugh Perkins 
Date: Fri, 1 May 2026 05:19:16 -0700 Subject: [PATCH 046/109] Sync active_stream unconditionally at end of AMDGPU launch_llvm_kernel Mirrors 5901a7fc from the CUDA path: when transfers is empty, the result-buffer DtoH and mem_free_async were left in-flight on a non-blocking stream with no sync before return. Also converts transfer DtoH copies to async to match CUDA. --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index bb19087586..0c5b4bad05 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -211,13 +211,15 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { auto &idx = itr->first; - auto arg_id = idx.arg_id; - AMDGPUDriver::get_instance().memcpy_device_to_host(itr->second.first, (void *)device_ptrs[idx], - ctx.array_runtime_sizes[arg_id]); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], + ctx.array_runtime_sizes[idx.arg_id], active_stream); + } + for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ef3b95b18361dce692b02e4beff5a0a496fb5ff3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:30:20 -0700 Subject: [PATCH 047/109] Use async DtoH on active_stream for sizer stride readback The sizer kernel now runs on the user stream via Context::launch, so the synchronous copy_d2h (NULL 
stream) can read stale stride values with non-blocking streams. Use stream-aware async DtoH + sync for both CUDA and AMDGPU, falling back to copy_d2h for other backends. --- .../runtime/llvm/llvm_runtime_executor.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index bc319f9c38..1fff73575b 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -922,9 +922,9 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - // Invoke the device interpreter. On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. + // Invoke the device interpreter. `JITModule::call` launches this as a single-thread kernel on the active + // stream (CUDA/AMDGPU both dispatch through `{CUDA,AMDGPU}Context::launch` which uses `stream_`), so the + // writes are stream-ordered before the subsequent main-kernel dispatch. // // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of @@ -943,8 +943,27 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_context_ptr_for_sizer, bytecode_dev_ptr); // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // Use async DtoH on active_stream + sync so the readback is ordered after the sizer kernel. 
uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); +#if defined(QD_WITH_AMDGPU) + if (config_.arch == Arch::amdgpu) { + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } else +#endif +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + void *active_stream = CUDAContext::get_instance().get_stream(); + CUDADriver::get_instance().memcpy_device_to_host_async(&stride_u64, runtime_adstack_stride_field_ptr_, + sizeof(uint64_t), active_stream); + CUDADriver::get_instance().stream_synchronize(active_stream); + } else +#endif + { + copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); + } stride = static_cast(stride_u64); } From fc5b710bbc7995d8b92f457359b238ab3125e2e3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:39:31 -0700 Subject: [PATCH 048/109] Add missing #include to amdgpu_context.h for IWYU consistency Mirrors the explicit include already added to cuda_context.h in this PR. The file compiled via transitive inclusion through kernel_profiler.h but should not depend on that. 
--- quadrants/rhi/amdgpu/amdgpu_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/quadrants/rhi/amdgpu/amdgpu_context.h b/quadrants/rhi/amdgpu/amdgpu_context.h index a3515c30cb..9283afa078 100644 --- a/quadrants/rhi/amdgpu/amdgpu_context.h +++ b/quadrants/rhi/amdgpu/amdgpu_context.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "quadrants/program/kernel_profiler.h" #include "quadrants/rhi/amdgpu/amdgpu_driver.h" From 8550aa012d4759d7e7de0737aabbce86b4c33bf7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:20 -0700 Subject: [PATCH 049/109] Fix end-of-launcher sync: conditional + dealloc race Two fixes in the post-launch cleanup: 1. The unconditional stream_synchronize(active_stream) blocked the host on every kernel launch, defeating stream concurrency for the common case (no return value, no host-backed arrays). Make it conditional: sync only when result_buffer_size > 0 (the stale-bytes path), or when transfers are present (already had its own sync). 2. The transfers branch queued async DtoH on active_stream then immediately deallocated device memory via mem_free_async(NULL stream). With CU_STREAM_NON_BLOCKING streams, the dealloc could race with the in-flight DtoH. Add stream_synchronize(active_stream) between the DtoH loop and the dealloc loop. 
--- quadrants/runtime/cuda/kernel_launcher.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index d6931da87a..b0d2da095c 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -259,11 +259,13 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx CUDADriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + CUDADriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + CUDADriver::get_instance().stream_synchronize(active_stream); } - CUDADriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From 6374cf3bfe91f700863af6cd510fe7ed00446f34 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:45:59 -0700 Subject: [PATCH 050/109] Reject qd_stream inside autograd Tape context The Tape replay path (Tape.grad) calls func.grad(*args) with no kwargs, so qd_stream is silently dropped and the backward kernel runs on the default stream with no ordering guarantee relative to the forward on the user's stream. Raise RuntimeError when both are used, matching the existing graph=True incompatibility pattern. Document the limitation. 
--- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/kernel.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0fb2627c0c..85d4e8d12c 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,4 +133,5 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. +- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 7c68373f34..8a1004c6a8 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -649,6 +649,9 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.runtime.target_tape: + raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream.") if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From 7f0f29958c234668651c864fa999e696f4d3a895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:51:43 -0700 Subject: [PATCH 051/109] Fix end-of-launcher sync: conditional + dealloc race on AMDGPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors 8550aa0 from the CUDA path: 1. 
Make stream_synchronize conditional — only sync when result_buffer or transfers need it, avoiding host-blocking on every launch. 2. Add sync between async DtoH and device memory deallocation to prevent race with non-blocking streams. Also fixes black formatting from base branch merge. --- python/quadrants/lang/kernel.py | 6 ++++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..766689b02d 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -650,8 +650,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." 
+ ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 0c5b4bad05..b32e0981ea 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -214,12 +214,14 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx AMDGPUDriver::get_instance().memcpy_device_to_host_async(itr->second.first, (void *)device_ptrs[idx], ctx.array_runtime_sizes[idx.arg_id], active_stream); } + AMDGPUDriver::get_instance().stream_synchronize(active_stream); for (auto itr = transfers.begin(); itr != transfers.end(); itr++) { executor->deallocate_memory_on_device(itr->second.second); } + } else if (ctx.result_buffer_size > 0) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); } AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); - AMDGPUDriver::get_instance().stream_synchronize(active_stream); } KernelLauncher::Handle KernelLauncher::register_llvm_kernel(const LLVM::CompiledKernelData &compiled) { From ca8ace3b6be6ed794f0f7619f92d3328a61d1e41 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 05:57:14 -0700 Subject: [PATCH 052/109] Fix linter formatting; guard graph+stream; sync has_print on stream Three changes: 1. Fix black formatting of the Tape+stream RuntimeError. 2. Raise RuntimeError when qd_stream is passed to a graph=True kernel, enforcing the documented limitation in streams.md rather than silently bypassing the end-of-launcher sync. 3. When a kernel has print statements but no return value, and runs on a qd_stream, sync the user stream before runtime_ops.sync(). The NULL-stream sync in runtime_ops does not drain CU_STREAM_NON_BLOCKING user streams, so CUDA printf buffers would otherwise not be flushed. 
--- python/quadrants/lang/kernel.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index 8a1004c6a8..e0cdf945b5 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,6 +561,11 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED + if launch_ctx.use_graph and qd_stream is not None: + raise RuntimeError( + "qd_stream is not compatible with graph=True kernels. " + "See docs/source/user_guide/streams.md for details." + ) if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"): launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id stream_handle = qd_stream.handle if qd_stream is not None else 0 @@ -582,6 +587,8 @@ def launch_kernel( return_type = self.return_type if return_type or self.has_print: + if qd_stream is not None and self.has_print and not return_type: + qd_stream.synchronize() runtime_ops.sync() if not return_type: @@ -650,8 +657,10 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) if qd_stream is not None and self.runtime.target_tape: - raise RuntimeError("qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " - "context, or omit qd_stream.") + raise RuntimeError( + "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " + "context, or omit qd_stream." 
+ ) if impl.get_runtime()._arch == _ARCH_PYTHON: return self.func(*py_args, **kwargs) config = impl.current_cfg() From 1f471b37ab7728777cb0cb339ba16c0b3164301e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:06:11 -0700 Subject: [PATCH 053/109] Fix AMDGPU stream flag comment: HIP_STREAM_NON_BLOCKING not CU_STREAM_NON_BLOCKING --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 50b80294ce..e57c8675d7 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -78,7 +78,7 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int sid = offloaded_tasks[j].stream_parallel_group_id; if (stream_by_id.find(sid) == stream_by_id.end()) { void *s = nullptr; - AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*CU_STREAM_NON_BLOCKING*/); + AMDGPUDriver::get_instance().stream_create(&s, 0x1 /*HIP_STREAM_NON_BLOCKING*/); stream_by_id[sid] = s; } } From 84806cfdfdd3b5aa366745872429892fc37c2157 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:14:18 -0700 Subject: [PATCH 054/109] Fix NULL-stream DtoH races in synchronize() and allocate_llvm_runtime_memory_jit synchronize() now drains the active user stream (if any) before the NULL stream, so fetch_result_uint64 callers (lazy field-pointer caches at three sites) read correct values when the runtime-query kernel ran on a non-blocking user stream. allocate_llvm_runtime_memory_jit: use async H2D on active_stream for the zero-stamp and sync the active stream before the DtoH readback, so the allocator kernel result is visible. 
--- quadrants/rhi/amdgpu/amdgpu_device.cpp | 6 ++++-- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index 68c377a73a..d127ce19a0 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -1,4 +1,5 @@ #include "quadrants/rhi/amdgpu/amdgpu_device.h" +#include "quadrants/rhi/amdgpu/amdgpu_context.h" #include "quadrants/rhi/llvm/device_memory_pool.h" #include "quadrants/jit/jit_module.h" @@ -93,11 +94,12 @@ uint64_t *AmdgpuDevice::allocate_llvm_runtime_memory_jit(const LlvmRuntimeAllocP // the kernel without writing to *result. To detect that here, zero the slot first so a null readback unambiguously // means "allocation failed" and we can surface a helpful host-side message instead of letting the downstream // hipMemset trip on the stale pointer with a cryptic hipErrorInvalidValue. + void *active_stream = AMDGPUContext::get_instance().get_stream(); uint64 zero = 0; - AMDGPUDriver::get_instance().memcpy_host_to_device(params.result_buffer, &zero, sizeof(uint64)); + AMDGPUDriver::get_instance().memcpy_host_to_device_async(params.result_buffer, &zero, sizeof(uint64), active_stream); params.runtime_jit->call("runtime_memory_allocate_aligned", params.runtime, params.size, quadrants_page_size, params.result_buffer); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); uint64 *ret{nullptr}; AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, params.result_buffer, sizeof(uint64)); QD_ERROR_IF(ret == nullptr, diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 1fff73575b..390987768a 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,12 +188,20 @@ void 
LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) + auto *active_stream = CUDAContext::get_instance().get_stream(); + if (active_stream != nullptr) { + CUDADriver::get_instance().stream_synchronize(active_stream); + } CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) + auto *active_stream = AMDGPUContext::get_instance().get_stream(); + if (active_stream != nullptr) { + AMDGPUDriver::get_instance().stream_synchronize(active_stream); + } AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); From b1c6eea4249b29c530debbba122b502da4619592 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:32:06 -0700 Subject: [PATCH 055/109] Sync active_stream before adstack sizer stride readback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit publish_adstack_metadata launches the sizer kernel on active_stream but reads the computed stride via synchronous copy_d2h (NULL stream). With CU_STREAM_NON_BLOCKING user streams, the NULL stream does not wait for the sizer kernel to complete, so the readback can return stale stride values — sizing the adstack heap incorrectly. Add stream_synchronize(active_stream) before the D2H. 
--- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 69be9408b5..9e2d3c9041 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -943,7 +943,12 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // Read back the computed per-thread stride so we can size the heap on host. One 8-byte `DtoH` per launch. + // The sizer kernel runs on active_stream; drain it before reading the stride on the host. +#if defined(QD_WITH_CUDA) + if (config_.arch == Arch::cuda) { + CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); + } +#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From 88f1bf7ef578e1043fc7df0b8fe575df7dde5bc7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:47:47 -0700 Subject: [PATCH 056/109] Add stream_parallel_group_id to QD_STMT_DEF_FIELDS for cache key correctness Without this, the offline cache considers two kernels that differ only in stream_parallel_group_id assignments as identical, potentially serving a cached version with wrong group IDs. 
--- quadrants/ir/statements.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index 503a1ed183..c9b0d79841 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,7 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized); + QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1012,7 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt); + QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1393,7 +1393,8 @@ class OffloadedStmt : public Stmt { reversed, num_cpu_threads, index_offsets, - mem_access_opt); + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From ca560b64d6e1f20ec4bcfc68d8081d87c466de10 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 06:58:28 -0700 Subject: [PATCH 057/109] Fix clang-format: multi-line QD_STMT_DEF_FIELDS for RangeForStmt and StructForStmt --- quadrants/ir/statements.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index c9b0d79841..c29c648995 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,14 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(begin, + end, + reversed, + is_bit_vectorized, + num_cpu_threads, + 
block_dim, + strictly_serialized, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1019,13 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(snode, + index_offsets, + is_bit_vectorized, + num_cpu_threads, + block_dim, + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From 397f29814f3997d284ca026d1c0db2d56fa46406 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:03:15 -0700 Subject: [PATCH 058/109] Fix clang-format: break long QD_STMT_DEF_FIELDS lines in statements.h --- quadrants/ir/statements.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/quadrants/ir/statements.h b/quadrants/ir/statements.h index c9b0d79841..c29c648995 100644 --- a/quadrants/ir/statements.h +++ b/quadrants/ir/statements.h @@ -978,7 +978,14 @@ class RangeForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(begin, end, reversed, is_bit_vectorized, num_cpu_threads, block_dim, strictly_serialized, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(begin, + end, + reversed, + is_bit_vectorized, + num_cpu_threads, + block_dim, + strictly_serialized, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; @@ -1012,7 +1019,13 @@ class StructForStmt : public Stmt { std::unique_ptr clone() const override; - QD_STMT_DEF_FIELDS(snode, index_offsets, is_bit_vectorized, num_cpu_threads, block_dim, mem_access_opt, stream_parallel_group_id); + QD_STMT_DEF_FIELDS(snode, + index_offsets, + is_bit_vectorized, + num_cpu_threads, + block_dim, + mem_access_opt, + stream_parallel_group_id); QD_DEFINE_ACCEPT }; From ae1c932db2df45bdd0069e5c2a3b748a8b3d2128 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:17:21 -0700 Subject: [PATCH 059/109] Reflow comments and docstring to 120-char line width Co-authored-by: 
Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 4 ++-- quadrants/runtime/amdgpu/kernel_launcher.cpp | 4 ++-- tests/python/test_streams.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index d91afcac00..0b789cedf5 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -28,8 +28,8 @@ PER_AMDGPU_FUNCTION(memcpy_async, hipMemcpyAsync, void *, void *, std::size_t, u PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, hipMemcpyHtoDAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, hipMemcpyDtoHAsync, void *, void *, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); -// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers -// fall back to the synchronous variants on devices without memory-pool support. +// hipMallocAsync/hipFreeAsync require ROCm >= 5.4; the AMDGPUDriver wrappers fall back to the synchronous variants +// on devices without memory-pool support. 
PER_AMDGPU_FUNCTION(malloc_async_impl, hipMallocAsync, void **, std::size_t, void *); PER_AMDGPU_FUNCTION(malloc_managed, hipMallocManaged, void **, std::size_t, uint32); PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index b32e0981ea..67befa8b66 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -105,8 +105,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx auto *active_stream = AMDGPUContext::get_instance().get_stream(); char *device_result_buffer{nullptr}; - // Must always allocate device_result_buffer (even when result_buffer_size - // is 0) to avoid memory access faults from allocate_memory_on_device below. + // Must always allocate device_result_buffer (even when result_buffer_size is 0) to avoid memory access faults + // from allocate_memory_on_device below. AMDGPUDriver::get_instance().malloc_async((void **)&device_result_buffer, std::max(ctx.result_buffer_size, sizeof(uint64)), active_stream); diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 073d383c2e..969d18ecf1 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -199,8 +199,7 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): @test_utils.test() def test_concurrent_streams_with_events(): - """Two slow kernels on separate streams run concurrently (~1s on GPU), - serial fallback on CPU/Metal.""" + """Two slow kernels on separate streams run concurrently (~1s on GPU), serial fallback on CPU/Metal.""" SPIN_ITERS = 5_000_000 @qd.kernel From 3ef0340bdbba610abfd400042a9617b7e0542f03 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 07:40:06 -0700 Subject: [PATCH 060/109] Use context/device synchronize in synchronize() to drain all streams stream_synchronize(nullptr) does not drain non-blocking user streams 
(CU_STREAM_NON_BLOCKING / HIP_STREAM_NON_BLOCKING), so qd.sync() failed to honor its "drain everything" contract. Python's finally block resets stream_ to nullptr before qd.sync() runs, making the previous active-stream check dead code for the user-facing path. Replace with cuCtxSynchronize (CUDA) / hipDeviceSynchronize (AMDGPU) which drain all streams on the device, correctly implementing the documented qd.sync() semantics. Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h | 3 +++ quadrants/rhi/cuda/cuda_driver_functions.inc.h | 3 +++ quadrants/runtime/llvm/llvm_runtime_executor.cpp | 14 ++++---------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h index 0b789cedf5..c94a7f14db 100644 --- a/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h +++ b/quadrants/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -15,6 +15,9 @@ PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *); PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); +// Device synchronization +PER_AMDGPU_FUNCTION(device_synchronize, hipDeviceSynchronize); + // Stream management PER_AMDGPU_FUNCTION(stream_create, hipStreamCreateWithFlags, void **, uint32); PER_AMDGPU_FUNCTION(stream_destroy, hipStreamDestroy, void *); diff --git a/quadrants/rhi/cuda/cuda_driver_functions.inc.h b/quadrants/rhi/cuda/cuda_driver_functions.inc.h index 55c5e3e0b8..b4164b7c33 100644 --- a/quadrants/rhi/cuda/cuda_driver_functions.inc.h +++ b/quadrants/rhi/cuda/cuda_driver_functions.inc.h @@ -53,6 +53,9 @@ PER_CUDA_FUNCTION(kernel_get_occupancy, cuOccupancyMaxActiveBlocksPerMultiproces PER_CUDA_FUNCTION(kernel_set_attribute, cuFuncSetAttribute, void *, CUfunction_attribute_enum, int); +// Context management +PER_CUDA_FUNCTION(context_synchronize, cuCtxSynchronize); + // Stream management 
PER_CUDA_FUNCTION(stream_synchronize, cuStreamSynchronize, void *); PER_CUDA_FUNCTION(stream_wait_event, cuStreamWaitEvent, void *, void *, uint32); diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 390987768a..6d631cfc2f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -188,21 +188,15 @@ void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *re void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - auto *active_stream = CUDAContext::get_instance().get_stream(); - if (active_stream != nullptr) { - CUDADriver::get_instance().stream_synchronize(active_stream); - } - CUDADriver::get_instance().stream_synchronize(nullptr); + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().context_synchronize(); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - auto *active_stream = AMDGPUContext::get_instance().get_stream(); - if (active_stream != nullptr) { - AMDGPUDriver::get_instance().stream_synchronize(active_stream); - } - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUContext::get_instance().make_current(); + AMDGPUDriver::get_instance().device_synchronize(); #else QD_ERROR("No AMDGPU support"); #endif From 3a81a46abcd5a53eea40df89e7283b4516479667 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:32:15 -0700 Subject: [PATCH 061/109] Use synchronous mem_free in dealloc_memory pool branch mem_free_async on the NULL stream does not sync with non-blocking user streams, so a Python ndarray dropped while a kernel is still in flight could return its slab to the mempool prematurely. Using synchronous mem_free matches pre-stream-rewire behavior and implicitly waits for all pending work on the device. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_device.cpp b/quadrants/rhi/amdgpu/amdgpu_device.cpp index d127ce19a0..280cd9f7e1 100644 --- a/quadrants/rhi/amdgpu/amdgpu_device.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_device.cpp @@ -125,7 +125,7 @@ void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { } QD_ASSERT(!info.is_imported); if (info.use_memory_pool) { - AMDGPUDriver::get_instance().mem_free_async(info.ptr, nullptr); + AMDGPUDriver::get_instance().mem_free(info.ptr); } else if (info.use_cached) { DeviceMemoryPool::get_instance(Arch::amdgpu, false /*merge_upon_release*/) .release(info.size, (uint64_t *)info.ptr, false); From 3c6b24eb4706574a9bb755c335b0e1cda318b35b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 08:50:48 -0700 Subject: [PATCH 062/109] Add tests for stream/event context managers, event.synchronize, error paths Cover the gaps flagged by the test-coverage CI check: - Stream and Event used as context managers (__enter__/__exit__) - Event.synchronize() method - RuntimeError when qd_stream is combined with autograd Tape - RuntimeError when qd_stream is combined with graph=True Co-authored-by: Cursor --- tests/python/test_streams.py | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fabc217e96..8a00024220 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -1,6 +1,7 @@ """Tests for GPU stream and event support.""" import numpy as np +import pytest import quadrants as qd from quadrants.lang.stream import Event, Stream @@ -195,3 +196,80 @@ def fill(arr: qd.types.ndarray(dtype=qd.f32, ndim=1)): s.synchronize() assert np.allclose(arr.to_numpy(), 99.0) s.destroy() + + +@test_utils.test() +def test_stream_context_manager(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def 
fill(): + for i in range(N): + x[i] = 11.0 + + with qd.create_stream() as s: + fill(qd_stream=s) + s.synchronize() + assert s.handle == 0 + assert np.allclose(x.to_numpy(), 11.0) + + +@test_utils.test() +def test_event_context_manager(): + with qd.create_event() as e: + assert isinstance(e, Event) + assert e.handle == 0 + + +@test_utils.test() +def test_event_synchronize(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 13.0 + + s = qd.create_stream() + fill(qd_stream=s) + e = qd.create_event() + e.record(s) + e.synchronize() + assert np.allclose(x.to_numpy(), 13.0) + e.destroy() + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_tape_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autograd Tape"): + with qd.ad.Tape(loss): + compute(qd_stream=s) + s.destroy() + + +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_graph_raises(): + N = 64 + x = qd.field(qd.f32, shape=(N,)) + + @qd.kernel + def fill(): + for i in range(N): + x[i] = 1.0 + + fill.use_graph = True + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with graph=True"): + fill(qd_stream=s) + s.destroy() From 3499bbcccef6f174cbc15649b0dcbd00eaf5c990 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:04:44 -0700 Subject: [PATCH 063/109] Thread active_stream through AMDGPU profiler event_record and sync Profiler events were hardcoded to the NULL stream while kernels now run on user streams; with HIP_STREAM_NON_BLOCKING both events signal immediately on the empty NULL stream, yielding ~0 ms timings. 
Co-authored-by: Cursor --- quadrants/rhi/amdgpu/amdgpu_profiler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp index 731d536bca..e963f7df20 100644 --- a/quadrants/rhi/amdgpu/amdgpu_profiler.cpp +++ b/quadrants/rhi/amdgpu/amdgpu_profiler.cpp @@ -59,8 +59,9 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - AMDGPUDriver::get_instance().event_record(handle, 0); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record(handle, active_stream); + AMDGPUDriver::get_instance().stream_synchronize(active_stream); // get elapsed time and destroy events auto record = event_toolkit_->get_current_event_record(); @@ -154,7 +155,8 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: AMDGPUDriver::get_instance().event_create(&(record.start_event), HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_create(&(record.stop_event), HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record((record.start_event), 0); + void *active_stream = AMDGPUContext::get_instance().get_stream(); + AMDGPUDriver::get_instance().event_record((record.start_event), active_stream); event_records_.push_back(record); if (!base_event_) { @@ -163,7 +165,7 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(const std:: for (int i = 0; i < n_iters; i++) { void *e; AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); - AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_record(e, active_stream); AMDGPUDriver::get_instance().event_synchronize(e); auto final_t = Time::get_time(); if (i == n_iters - 1) { From c549e072779fc12f4a33c381d1f76ef6167cd0e7 Mon Sep 17 00:00:00 2001 From: 
Hugh Perkins Date: Fri, 1 May 2026 09:53:18 -0700 Subject: [PATCH 064/109] Fix graph+stream error guard and test Check self.use_graph instead of launch_ctx.use_graph so the error fires even when QD_GRAPH env var is off. Use @qd.kernel(graph=True) in the test instead of manually setting .use_graph attribute. Co-authored-by: Cursor --- python/quadrants/lang/kernel.py | 2 +- tests/python/test_streams.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index e0cdf945b5..cb337d1bc1 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -561,7 +561,7 @@ def launch_kernel( self.src_ll_cache_observations.cache_stored = True self._last_compiled_kernel_data = compiled_kernel_data launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED - if launch_ctx.use_graph and qd_stream is not None: + if self.use_graph and qd_stream is not None: raise RuntimeError( "qd_stream is not compatible with graph=True kernels. " "See docs/source/user_guide/streams.md for details." diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 8a00024220..7f03703dac 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -263,12 +263,11 @@ def test_stream_with_graph_raises(): N = 64 x = qd.field(qd.f32, shape=(N,)) - @qd.kernel + @qd.kernel(graph=True) def fill(): for i in range(N): x[i] = 1.0 - fill.use_graph = True s = qd.create_stream() with pytest.raises(RuntimeError, match="not compatible with graph=True"): fill(qd_stream=s) From 5d284acf162364a7a1c271647388fe1111a09029 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 09:57:09 -0700 Subject: [PATCH 065/109] Update qd.sync() docstring and streams doc to reflect default-stream-only semantics qd.sync() synchronizes the default (NULL) stream, not explicit non-blocking streams. Update the docstring and add a note to the streams user guide. 
Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 1 + python/quadrants/lang/runtime_ops.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 85d4e8d12c..0f9dbf7496 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -134,4 +134,5 @@ qd_stream.destroy() - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. - **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. +- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 0ecd122f56..8b07cfb99a 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -4,8 +4,12 @@ def sync(): - """Blocks the calling thread until all the previously - launched Quadrants kernels have completed. + """Synchronizes the default stream. + + Blocks the calling thread until all work on the default GPU stream + has completed. Kernels launched on explicit streams created via + :func:`quadrants.create_stream` are **not** waited on — call + ``stream.synchronize()`` for those. 
""" impl.get_runtime().sync() From df0b03a6d1505a47b583677b7d6af4bdf040388a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:12:05 -0700 Subject: [PATCH 066/109] Fix stream_parallel identity check failing on dual-import-path builds The _is_stream_parallel_with validation uses ASTResolver.resolve_to which compares objects with `is`. On Linux build runners where quadrants is available from both the source tree and installed location, the stream_parallel function object may differ between import paths. Add a fallback that checks __name__ and __module__ when identity fails, and add ASTResolver.resolve_value for general AST-to-object resolution. Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 4 +-- .../function_def_transformer.py | 10 +++++- python/quadrants/lang/ast/symbol_resolver.py | 32 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index b5b78455c6..a0048ccb61 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -40,7 +40,7 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length -from quadrants.lang.stream import stream_parallel + from quadrants.lang.struct import Struct, StructType from quadrants.lang.util import ( is_from_quadrants_module as _is_from_quadrants_module, @@ -1539,7 +1539,7 @@ def build_With(ctx: ASTTransformerFuncContext, node: ast.With) -> None: raise QuadrantsSyntaxError("'with ... as ...' 
is not supported in Quadrants kernels") if not isinstance(item.context_expr, ast.Call): raise QuadrantsSyntaxError("'with' in Quadrants kernels requires a call expression") - if not ASTResolver.resolve_to(item.context_expr.func, stream_parallel, ctx.global_vars): + if not FunctionDefTransformer._is_stream_parallel_with(node, ctx.global_vars): raise QuadrantsSyntaxError("'with' in Quadrants kernels only supports qd.stream_parallel()") if not ctx.is_kernel: raise QuadrantsSyntaxError("qd.stream_parallel() can only be used inside @qd.kernel, not @qd.func") diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index d6b64b5080..12997eba80 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -462,7 +462,15 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo item = stmt.items[0] if not isinstance(item.context_expr, ast.Call): return False - return ASTResolver.resolve_to(item.context_expr.func, stream_parallel, global_vars) + func_node = item.context_expr.func + if ASTResolver.resolve_to(func_node, stream_parallel, global_vars): + return True + resolved = ASTResolver.resolve_value(func_node, global_vars) + return ( + resolved is not None + and getattr(resolved, "__name__", None) == "stream_parallel" + and getattr(resolved, "__module__", None) == "quadrants.lang.stream" + ) @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: diff --git a/python/quadrants/lang/ast/symbol_resolver.py b/python/quadrants/lang/ast/symbol_resolver.py index 81296fcefb..f95373a463 100644 --- a/python/quadrants/lang/ast/symbol_resolver.py +++ b/python/quadrants/lang/ast/symbol_resolver.py @@ -55,3 +55,35 @@ def resolve_to(node, wanted, scope): return False # The name ``scope`` here could be a bit confusing return scope is wanted + + 
@staticmethod + def resolve_value(node, scope): + """Resolve an AST Name/Attribute node to a Python object. + + Same traversal as resolve_to but returns the resolved object (or None) + instead of comparing against a wanted value. + """ + if isinstance(node, ast.Name): + return scope.get(node.id) if isinstance(scope, dict) else None + + if not isinstance(node, ast.Attribute): + return None + + v = node.value + chain = [node.attr] + while isinstance(v, ast.Attribute): + chain.append(v.attr) + v = v.value + if not isinstance(v, ast.Name): + return None + chain.append(v.id) + + for attr in reversed(chain): + try: + if isinstance(scope, dict): + scope = scope[attr] + else: + scope = getattr(scope, attr) + except (KeyError, AttributeError): + return None + return scope From ff8056d34acecbb295d40a3a216e33b0fd0ddab8 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:17:39 -0700 Subject: [PATCH 067/109] Reflow sync() docstring to 120-char line width Co-authored-by: Cursor --- python/quadrants/lang/runtime_ops.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/runtime_ops.py b/python/quadrants/lang/runtime_ops.py index 8b07cfb99a..71919e2379 100644 --- a/python/quadrants/lang/runtime_ops.py +++ b/python/quadrants/lang/runtime_ops.py @@ -6,10 +6,8 @@ def sync(): """Synchronizes the default stream. - Blocks the calling thread until all work on the default GPU stream - has completed. Kernels launched on explicit streams created via - :func:`quadrants.create_stream` are **not** waited on — call - ``stream.synchronize()`` for those. + Blocks the calling thread until all work on the default GPU stream has completed. Kernels launched on explicit + streams created via :func:`quadrants.create_stream` are **not** waited on — call ``stream.synchronize()`` for those. 
""" impl.get_runtime().sync() From acff351a403af45f3cf0b27660ae2033c2544401 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:22:39 -0700 Subject: [PATCH 068/109] Remove unused ASTResolver import from ast_transformer.py Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index a0048ccb61..152a952044 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -28,7 +28,6 @@ from quadrants.lang.ast.ast_transformers.function_def_transformer import ( FunctionDefTransformer, ) -from quadrants.lang.ast.symbol_resolver import ASTResolver from quadrants.lang.exception import ( QuadrantsIndexError, QuadrantsRuntimeTypeError, From 70eb471521763e251588c48e3e607248f8152c64 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 11:33:14 -0700 Subject: [PATCH 069/109] Fix import sorting in ast_transformer.py Co-authored-by: Cursor --- python/quadrants/lang/ast/ast_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformer.py b/python/quadrants/lang/ast/ast_transformer.py index 152a952044..263a4a11a3 100644 --- a/python/quadrants/lang/ast/ast_transformer.py +++ b/python/quadrants/lang/ast/ast_transformer.py @@ -39,7 +39,6 @@ from quadrants.lang.field import Field from quadrants.lang.matrix import Matrix, MatrixType from quadrants.lang.snode import append, deactivate, length - from quadrants.lang.struct import Struct, StructType from quadrants.lang.util import ( is_from_quadrants_module as _is_from_quadrants_module, From ebd5e119cf5019e8539e1de5f3a75d1d8c936e22 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:36:40 -0700 Subject: [PATCH 070/109] Add AST-level fallback for stream_parallel detection When object resolution fails (dual import paths), fall back to checking the AST 
node name directly. Inside @qd.kernel the only valid with-context is qd.stream_parallel(), so checking the attribute name is sufficient. Co-authored-by: Cursor --- .../ast_transformers/function_def_transformer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 12997eba80..7a42dfff87 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -466,11 +466,16 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo if ASTResolver.resolve_to(func_node, stream_parallel, global_vars): return True resolved = ASTResolver.resolve_value(func_node, global_vars) - return ( - resolved is not None - and getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", None) == "quadrants.lang.stream" - ) + if resolved is not None: + return ( + getattr(resolved, "__name__", None) == "stream_parallel" + and getattr(resolved, "__module__", "").startswith("quadrants") + ) + if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": + return True + if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": + return True + return False @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: From a6c385200add940d1f7182041e756c7b6748e744 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:37:08 -0700 Subject: [PATCH 071/109] Add diagnostic info to stream_parallel exclusivity error message Include the failing statement type, index, and body length to help debug the persistent Linux build x64 test failures. 
Co-authored-by: Cursor --- .../lang/ast/ast_transformers/function_def_transformer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 7a42dfff87..4ffee5fc2e 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -489,8 +489,14 @@ def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dic if FunctionDefTransformer._is_docstring(stmt, i): continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): + stmt_desc = f"{type(stmt).__name__}" + if isinstance(stmt, ast.With) and stmt.items: + ctx_expr = stmt.items[0].context_expr + if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): + stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " "in the kernel must be 'with qd.stream_parallel():' blocks. " - "Move non-parallel code to a separate kernel." + f"Move non-parallel code to a separate kernel. 
" + f"[stmt {i}: {stmt_desc}, body_len={len(body)}]" ) From 03d2b293908f17fe3f7a8e7ba78720a45f8d620d Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 12:55:46 -0700 Subject: [PATCH 072/109] Fix black formatting in function_def_transformer.py Co-authored-by: Cursor --- .../lang/ast/ast_transformers/function_def_transformer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 4ffee5fc2e..23a1f9431a 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -467,10 +467,9 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return True resolved = ASTResolver.resolve_value(func_node, global_vars) if resolved is not None: - return ( - getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", "").startswith("quadrants") - ) + return getattr(resolved, "__name__", None) == "stream_parallel" and getattr( + resolved, "__module__", "" + ).startswith("quadrants") if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": return True if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": From 3af5bc8607784720664d4ef18051da1712b60e1f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 13:49:18 -0700 Subject: [PATCH 073/109] Apply black formatting to function_def_transformer.py Co-authored-by: Cursor --- .../function_def_transformer.py | 121 +++++++++++++----- 1 file changed, 91 insertions(+), 30 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 4ffee5fc2e..123767be55 100644 --- 
a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -56,7 +56,9 @@ def _decl_and_create_variable( assert this_arg_features is not None marker = this_arg_features[0] if marker == _TENSOR_T_NDARRAY_MARKER: - raw_element_type, ndim, needs_grad, boundary, layout = this_arg_features[1:] + raw_element_type, ndim, needs_grad, boundary, layout = ( + this_arg_features[1:] + ) return False, ( kernel_arguments.decl_ndarray_arg, ( @@ -75,7 +77,9 @@ def _decl_and_create_variable( assert ctx.global_vars is not None return True, ctx.global_vars.get(name) raise AssertionError(f"unknown qd.Tensor marker: {marker!r}") - if annotation == annotations.template or isinstance(annotation, annotations.template): + if annotation == annotations.template or isinstance( + annotation, annotations.template + ): if name in ctx.template_vars: return True, ctx.template_vars[name] assert ctx.global_vars is not None @@ -98,8 +102,12 @@ def _decl_and_create_variable( needs_grad, BoundaryMode(boundary), ) - offset = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_offset") - size = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_size") + offset = kernel_arguments.decl_scalar_arg( + primitive_types.i32, full_name + "_offset" + ) + size = kernel_arguments.decl_scalar_arg( + primitive_types.i32, full_name + "_size" + ) return True, BufferView(arr, offset, size) if isinstance(annotation, ndarray_type.NdarrayType): assert this_arg_features is not None @@ -139,7 +147,10 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, argument_type) for field_idx, field in enumerate(dataclasses.fields(argument_type)): flat_name = create_flat_name(argument_name, field.name) - if pruning.enforcing and flat_name not in pruning.used_vars_by_func_id[func_id]: + if ( + pruning.enforcing + and flat_name not in pruning.used_vars_by_func_id[func_id] + ): continue # if a field is a 
dataclass, then feed back into process_kernel_arg recursively if dataclasses.is_dataclass(field.type): @@ -177,7 +188,9 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, obj) @staticmethod - def _transform_as_kernel(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: + def _transform_as_kernel( + ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments + ) -> None: assert ctx.func is not None assert ctx.arg_features is not None if node.returns is not None: @@ -226,7 +239,9 @@ def _walk_obj(obj, arg_idx, path): child = child._unwrap() if isinstance(child, _ndarray.Ndarray): _register_ndarray(child, arg_idx, (*path, field.name)) - elif dataclasses.is_dataclass(child) and not isinstance(child, type): + elif dataclasses.is_dataclass(child) and not isinstance( + child, type + ): _walk_obj(child, arg_idx, (*path, field.name)) else: for attr_name, attr_val in vars(obj).items(): @@ -250,7 +265,9 @@ def _register_ndarray(nd, arg_idx, attr_chain): element_type, ndim, name, needs_grad ) arr = any_array.AnyArray( - _qd_core.make_external_tensor_expr(element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE), + _qd_core.make_external_tensor_expr( + element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE + ), _qd_layout=layout, ) cache[key] = arr @@ -259,7 +276,9 @@ def _register_ndarray(nd, arg_idx, attr_chain): assert ctx.py_args is not None for i, arg_meta in enumerate(ctx.func.arg_metas): anno = arg_meta.annotation - is_template = anno is annotations.template or isinstance(anno, annotations.template) + is_template = anno is annotations.template or isinstance( + anno, annotations.template + ) is_tensor_anno = anno is _TensorClass if not (is_template or is_tensor_anno): continue @@ -297,15 +316,21 @@ def _transform_func_arg( # directly — ndarray and field impls are both valid pass-by-reference arguments. 
if argument_type is _TensorClass: data = FunctionDefTransformer._unwrap_tensor(data) - _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) + _cache = getattr( + getattr(ctx, "global_context", None), "ndarray_to_any_array", None + ) promoted = _cache.get(id(data)) if _cache else None - ctx.create_variable(argument_name, promoted if promoted is not None else data) + ctx.create_variable( + argument_name, promoted if promoted is not None else data + ) return None if dataclasses.is_dataclass(argument_type): for field in dataclasses.fields(argument_type): flat_name = create_flat_name(argument_name, field.name) - data_child = FunctionDefTransformer._unwrap_tensor(getattr(data, field.name)) + data_child = FunctionDefTransformer._unwrap_tensor( + getattr(data, field.name) + ) if isinstance( data_child, ( @@ -317,11 +342,19 @@ def _transform_func_arg( ): # qd.Tensor struct fields skip check_matched (the Tensor class has no such method — it is # polymorphic). - if field.type is not _TensorClass and hasattr(field.type, "check_matched"): + if field.type is not _TensorClass and hasattr( + field.type, "check_matched" + ): field.type.check_matched(data_child.get_type(), field.name) - _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) + _cache = getattr( + getattr(ctx, "global_context", None), + "ndarray_to_any_array", + None, + ) promoted = _cache.get(id(data_child)) if _cache else None - ctx.create_variable(flat_name, promoted if promoted is not None else data_child) + ctx.create_variable( + flat_name, promoted if promoted is not None else data_child + ) elif dataclasses.is_dataclass(data_child): FunctionDefTransformer._transform_func_arg( ctx, @@ -338,9 +371,17 @@ def _transform_func_arg( # Ndarray arguments are passed by reference. 
if isinstance(argument_type, (ndarray_type.NdarrayType)): if not isinstance( - data, (_ndarray.ScalarNdarray, matrix.VectorNdarray, matrix.MatrixNdarray, any_array.AnyArray) + data, + ( + _ndarray.ScalarNdarray, + matrix.VectorNdarray, + matrix.MatrixNdarray, + any_array.AnyArray, + ), ): - raise QuadrantsSyntaxError(f"Argument {argument_name} of type {argument_type} is not recognized.") + raise QuadrantsSyntaxError( + f"Argument {argument_name} of type {argument_type} is not recognized." + ) argument_type.check_matched(data.get_type(), argument_name) ctx.create_variable(argument_name, data) return None @@ -350,7 +391,9 @@ def _transform_func_arg( # not here — data.arr is an Expr node during func compilation, not a real Ndarray. if isinstance(argument_type, buffer_view_type.BufferViewType): if not isinstance(data, BufferView): - raise QuadrantsSyntaxError(f"Argument {argument_name} expects a BufferView, got {type(data).__name__}") + raise QuadrantsSyntaxError( + f"Argument {argument_name} expects a BufferView, got {type(data).__name__}" + ) ctx.create_variable(argument_name, data) return None @@ -389,7 +432,9 @@ def _transform_func_arg( return None if id(argument_type) in primitive_types.type_ids: - ctx.create_variable(argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type))) + ctx.create_variable( + argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type)) + ) return None # Create a copy for non-template arguments, # so that they are passed by value. 
@@ -398,7 +443,9 @@ def _transform_func_arg( return None @staticmethod - def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: + def _transform_as_func( + ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments + ) -> None: # pylint: disable=import-outside-toplevel from quadrants.lang.kernel_impl import Func @@ -406,7 +453,9 @@ def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, ar assert ctx.py_args is not None for py_arg_i, py_arg in enumerate(ctx.py_args): argument = ctx.func.arg_metas_expanded[py_arg_i] - FunctionDefTransformer._transform_func_arg(ctx, argument.name, argument.annotation, py_arg) + FunctionDefTransformer._transform_func_arg( + ctx, argument.name, argument.annotation, py_arg + ) # deal with dataclasses for v in ctx.func.orig_arguments: @@ -446,7 +495,9 @@ def build_FunctionDef( FunctionDefTransformer._transform_as_func(ctx, node, args) if ctx.is_kernel: - FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) + FunctionDefTransformer._validate_stream_parallel_exclusivity( + node.body, ctx.global_vars + ) with ctx.variable_scope_guard(): build_stmts(ctx, node.body) @@ -467,10 +518,9 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo return True resolved = ASTResolver.resolve_value(func_node, global_vars) if resolved is not None: - return ( - getattr(resolved, "__name__", None) == "stream_parallel" - and getattr(resolved, "__module__", "").startswith("quadrants") - ) + return getattr(resolved, "__name__", None) == "stream_parallel" and getattr( + resolved, "__module__", "" + ).startswith("quadrants") if isinstance(func_node, ast.Attribute) and func_node.attr == "stream_parallel": return True if isinstance(func_node, ast.Name) and func_node.id == "stream_parallel": @@ -479,11 +529,20 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo @staticmethod def 
_is_docstring(stmt: ast.stmt, index: int) -> bool: - return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) + return ( + index == 0 + and isinstance(stmt, ast.Expr) + and isinstance(stmt.value, (ast.Constant, ast.Str)) + ) @staticmethod - def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: - if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): + def _validate_stream_parallel_exclusivity( + body: list[ast.stmt], global_vars: dict[str, Any] + ) -> None: + if not any( + FunctionDefTransformer._is_stream_parallel_with(s, global_vars) + for s in body + ): return for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): @@ -492,7 +551,9 @@ def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dic stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: ctx_expr = stmt.items[0].context_expr - if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): + if isinstance(ctx_expr, ast.Call) and isinstance( + ctx_expr.func, ast.Attribute + ): stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 28440602d0796fb0f15dfa176932c68ab499dc57 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 14:08:47 -0700 Subject: [PATCH 074/109] Fix black formatting in function_def_transformer.py (post-merge) Co-authored-by: Cursor --- .../function_def_transformer.py | 23 ++++--------------- 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 123767be55..debbd2efa9 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ 
b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -495,9 +495,7 @@ def build_FunctionDef( FunctionDefTransformer._transform_as_func(ctx, node, args) if ctx.is_kernel: - FunctionDefTransformer._validate_stream_parallel_exclusivity( - node.body, ctx.global_vars - ) + FunctionDefTransformer._validate_stream_parallel_exclusivity(node.body, ctx.global_vars) with ctx.variable_scope_guard(): build_stmts(ctx, node.body) @@ -529,20 +527,11 @@ def _is_stream_parallel_with(stmt: ast.stmt, global_vars: dict[str, Any]) -> boo @staticmethod def _is_docstring(stmt: ast.stmt, index: int) -> bool: - return ( - index == 0 - and isinstance(stmt, ast.Expr) - and isinstance(stmt.value, (ast.Constant, ast.Str)) - ) + return index == 0 and isinstance(stmt, ast.Expr) and isinstance(stmt.value, (ast.Constant, ast.Str)) @staticmethod - def _validate_stream_parallel_exclusivity( - body: list[ast.stmt], global_vars: dict[str, Any] - ) -> None: - if not any( - FunctionDefTransformer._is_stream_parallel_with(s, global_vars) - for s in body - ): + def _validate_stream_parallel_exclusivity(body: list[ast.stmt], global_vars: dict[str, Any]) -> None: + if not any(FunctionDefTransformer._is_stream_parallel_with(s, global_vars) for s in body): return for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): @@ -551,9 +540,7 @@ def _validate_stream_parallel_exclusivity( stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: ctx_expr = stmt.items[0].context_expr - if isinstance(ctx_expr, ast.Call) and isinstance( - ctx_expr.func, ast.Attribute - ): + if isinstance(ctx_expr, ast.Call) and isinstance(ctx_expr.func, ast.Attribute): stmt_desc += f"(with {ast.dump(ctx_expr.func)})" raise QuadrantsSyntaxError( "When using qd.stream_parallel(), all top-level statements " From 5903e499b25e83d4fd3930e28e6ab67c21033d1b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 1 May 2026 14:21:05 -0700 Subject: [PATCH 075/109] Run black -l 
120 on function_def_transformer.py (post-merge formatting) Co-authored-by: Cursor --- .../function_def_transformer.py | 77 +++++-------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index debbd2efa9..2878921709 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -56,9 +56,7 @@ def _decl_and_create_variable( assert this_arg_features is not None marker = this_arg_features[0] if marker == _TENSOR_T_NDARRAY_MARKER: - raw_element_type, ndim, needs_grad, boundary, layout = ( - this_arg_features[1:] - ) + raw_element_type, ndim, needs_grad, boundary, layout = this_arg_features[1:] return False, ( kernel_arguments.decl_ndarray_arg, ( @@ -77,9 +75,7 @@ def _decl_and_create_variable( assert ctx.global_vars is not None return True, ctx.global_vars.get(name) raise AssertionError(f"unknown qd.Tensor marker: {marker!r}") - if annotation == annotations.template or isinstance( - annotation, annotations.template - ): + if annotation == annotations.template or isinstance(annotation, annotations.template): if name in ctx.template_vars: return True, ctx.template_vars[name] assert ctx.global_vars is not None @@ -102,12 +98,8 @@ def _decl_and_create_variable( needs_grad, BoundaryMode(boundary), ) - offset = kernel_arguments.decl_scalar_arg( - primitive_types.i32, full_name + "_offset" - ) - size = kernel_arguments.decl_scalar_arg( - primitive_types.i32, full_name + "_size" - ) + offset = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_offset") + size = kernel_arguments.decl_scalar_arg(primitive_types.i32, full_name + "_size") return True, BufferView(arr, offset, size) if isinstance(annotation, ndarray_type.NdarrayType): assert this_arg_features is not None @@ -147,10 +139,7 @@ def 
_transform_kernel_arg( ctx.create_variable(argument_name, argument_type) for field_idx, field in enumerate(dataclasses.fields(argument_type)): flat_name = create_flat_name(argument_name, field.name) - if ( - pruning.enforcing - and flat_name not in pruning.used_vars_by_func_id[func_id] - ): + if pruning.enforcing and flat_name not in pruning.used_vars_by_func_id[func_id]: continue # if a field is a dataclass, then feed back into process_kernel_arg recursively if dataclasses.is_dataclass(field.type): @@ -188,9 +177,7 @@ def _transform_kernel_arg( ctx.create_variable(argument_name, obj) @staticmethod - def _transform_as_kernel( - ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments - ) -> None: + def _transform_as_kernel(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: assert ctx.func is not None assert ctx.arg_features is not None if node.returns is not None: @@ -239,9 +226,7 @@ def _walk_obj(obj, arg_idx, path): child = child._unwrap() if isinstance(child, _ndarray.Ndarray): _register_ndarray(child, arg_idx, (*path, field.name)) - elif dataclasses.is_dataclass(child) and not isinstance( - child, type - ): + elif dataclasses.is_dataclass(child) and not isinstance(child, type): _walk_obj(child, arg_idx, (*path, field.name)) else: for attr_name, attr_val in vars(obj).items(): @@ -265,9 +250,7 @@ def _register_ndarray(nd, arg_idx, attr_chain): element_type, ndim, name, needs_grad ) arr = any_array.AnyArray( - _qd_core.make_external_tensor_expr( - element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE - ), + _qd_core.make_external_tensor_expr(element_type, ndim, arg_id_vec, needs_grad, BoundaryMode.UNSAFE), _qd_layout=layout, ) cache[key] = arr @@ -276,9 +259,7 @@ def _register_ndarray(nd, arg_idx, attr_chain): assert ctx.py_args is not None for i, arg_meta in enumerate(ctx.func.arg_metas): anno = arg_meta.annotation - is_template = anno is annotations.template or isinstance( - anno, annotations.template 
- ) + is_template = anno is annotations.template or isinstance(anno, annotations.template) is_tensor_anno = anno is _TensorClass if not (is_template or is_tensor_anno): continue @@ -316,21 +297,15 @@ def _transform_func_arg( # directly — ndarray and field impls are both valid pass-by-reference arguments. if argument_type is _TensorClass: data = FunctionDefTransformer._unwrap_tensor(data) - _cache = getattr( - getattr(ctx, "global_context", None), "ndarray_to_any_array", None - ) + _cache = getattr(getattr(ctx, "global_context", None), "ndarray_to_any_array", None) promoted = _cache.get(id(data)) if _cache else None - ctx.create_variable( - argument_name, promoted if promoted is not None else data - ) + ctx.create_variable(argument_name, promoted if promoted is not None else data) return None if dataclasses.is_dataclass(argument_type): for field in dataclasses.fields(argument_type): flat_name = create_flat_name(argument_name, field.name) - data_child = FunctionDefTransformer._unwrap_tensor( - getattr(data, field.name) - ) + data_child = FunctionDefTransformer._unwrap_tensor(getattr(data, field.name)) if isinstance( data_child, ( @@ -342,9 +317,7 @@ def _transform_func_arg( ): # qd.Tensor struct fields skip check_matched (the Tensor class has no such method — it is # polymorphic). 
- if field.type is not _TensorClass and hasattr( - field.type, "check_matched" - ): + if field.type is not _TensorClass and hasattr(field.type, "check_matched"): field.type.check_matched(data_child.get_type(), field.name) _cache = getattr( getattr(ctx, "global_context", None), @@ -352,9 +325,7 @@ def _transform_func_arg( None, ) promoted = _cache.get(id(data_child)) if _cache else None - ctx.create_variable( - flat_name, promoted if promoted is not None else data_child - ) + ctx.create_variable(flat_name, promoted if promoted is not None else data_child) elif dataclasses.is_dataclass(data_child): FunctionDefTransformer._transform_func_arg( ctx, @@ -379,9 +350,7 @@ def _transform_func_arg( any_array.AnyArray, ), ): - raise QuadrantsSyntaxError( - f"Argument {argument_name} of type {argument_type} is not recognized." - ) + raise QuadrantsSyntaxError(f"Argument {argument_name} of type {argument_type} is not recognized.") argument_type.check_matched(data.get_type(), argument_name) ctx.create_variable(argument_name, data) return None @@ -391,9 +360,7 @@ def _transform_func_arg( # not here — data.arr is an Expr node during func compilation, not a real Ndarray. if isinstance(argument_type, buffer_view_type.BufferViewType): if not isinstance(data, BufferView): - raise QuadrantsSyntaxError( - f"Argument {argument_name} expects a BufferView, got {type(data).__name__}" - ) + raise QuadrantsSyntaxError(f"Argument {argument_name} expects a BufferView, got {type(data).__name__}") ctx.create_variable(argument_name, data) return None @@ -432,9 +399,7 @@ def _transform_func_arg( return None if id(argument_type) in primitive_types.type_ids: - ctx.create_variable( - argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type)) - ) + ctx.create_variable(argument_name, impl.expr_init_func(qd_ops.cast(data, argument_type))) return None # Create a copy for non-template arguments, # so that they are passed by value. 
@@ -443,9 +408,7 @@ def _transform_func_arg( return None @staticmethod - def _transform_as_func( - ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments - ) -> None: + def _transform_as_func(ctx: ASTTransformerFuncContext, node: ast.FunctionDef, args: ast.arguments) -> None: # pylint: disable=import-outside-toplevel from quadrants.lang.kernel_impl import Func @@ -453,9 +416,7 @@ def _transform_as_func( assert ctx.py_args is not None for py_arg_i, py_arg in enumerate(ctx.py_args): argument = ctx.func.arg_metas_expanded[py_arg_i] - FunctionDefTransformer._transform_func_arg( - ctx, argument.name, argument.annotation, py_arg - ) + FunctionDefTransformer._transform_func_arg(ctx, argument.name, argument.annotation, py_arg) # deal with dataclasses for v in ctx.func.orig_arguments: From 360adc8fad4e9709f51016fd131686f41679c64e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:28:58 -0700 Subject: [PATCH 076/109] Reject qd_stream on autodiff kernels Streams are not compatible with reverse-mode or forward-mode differentiation. The adstack sizer and Tape replay paths assume the default stream; rather than fixing every race, block the combination at the Python entry point with a clear error message. Co-authored-by: Cursor --- docs/source/user_guide/streams.md | 2 +- python/quadrants/lang/kernel.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 0f9dbf7496..b4b70b774b 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,6 +133,6 @@ qd_stream.destroy() ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. -- **Not compatible with autograd Tape.** Do not pass `qd_stream` to a kernel inside a `qd.ad.Tape` context. The backward replay does not preserve the stream, so gradient kernels would run on the default stream with no ordering guarantee. 
+- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. - **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. - **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. diff --git a/python/quadrants/lang/kernel.py b/python/quadrants/lang/kernel.py index eecf92631a..0b45a5816b 100644 --- a/python/quadrants/lang/kernel.py +++ b/python/quadrants/lang/kernel.py @@ -664,6 +664,11 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut @_shell_pop_print def __call__(self, *py_args, **kwargs) -> Any: qd_stream = kwargs.pop("qd_stream", None) + if qd_stream is not None and self.autodiff_mode != _NONE: + raise RuntimeError( + "qd_stream is not compatible with autodiff kernels. Streams cannot be used with " + "reverse-mode or forward-mode differentiation." + ) if qd_stream is not None and self.runtime.target_tape: raise RuntimeError( "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape " From e20fe99687dd0f2cfb78a7895414bd481d6f7fa6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:29:55 -0700 Subject: [PATCH 077/109] Revert adstack sizer stream_synchronize Autodiff+streams is now blocked at the Python level, so the adstack code path never runs on a non-default stream. Remove the unnecessary stream_synchronize we added in publish_adstack_metadata. 
Co-authored-by: Cursor --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 214c12de11..8326335dfb 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -941,12 +941,6 @@ std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInf runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, runtime_context_ptr_for_sizer, bytecode_dev_ptr); - // The sizer kernel runs on active_stream; drain it before reading the stride on the host. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().stream_synchronize(CUDAContext::get_instance().get_stream()); - } -#endif uint64_t stride_u64 = 0; copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); stride = static_cast(stride_u64); From e3c5f6f59461392be9b16ea76550b278649a8899 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 01:40:06 -0700 Subject: [PATCH 078/109] Reset llvm_runtime_executor.cpp to upstream Our branch had a stale copy of publish_adstack_metadata and ensure_adstack_heap that conflicted with upstream's refactor into ensure_adstack_heap_float / ensure_adstack_heap_int. Since autodiff is now blocked with streams at the Python level, we have no changes to make in this file. 
Co-authored-by: Cursor --- .../runtime/llvm/llvm_runtime_executor.cpp | 425 ------------------ 1 file changed, 425 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 8326335dfb..658c139c0f 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -606,431 +606,6 @@ void *LlvmRuntimeExecutor::get_runtime_temporaries_device_ptr() { return runtime_temporaries_cache_; } -// Publish the per-task adstack metadata into the LLVMRuntime struct and size the heap. The codegen path loads -// stride / offset / max_size from these fields at every `AdStack*` site (see `ensure_ad_stack_metadata_llvm` in -// codegen_llvm.cpp), so we must write them before every launch even for tasks where the compile-time and -// launch-time bounds agree. `evaluate_adstack_size_expr` is called only when the symbolic tree is available; the -// offline cache does not currently serialize `SizeExpr`, so cache hits fall back to `max_size_compile_time`. -std::size_t LlvmRuntimeExecutor::publish_adstack_metadata(const AdStackSizingInfo &ad_stack, - std::size_t num_threads, - LaunchContextBuilder *ctx, - void *device_runtime_context_ptr) { - const auto n_stacks = ad_stack.allocas.size(); - if (n_stacks == 0 || num_threads == 0) { - return 0; - } - auto align_up_8 = [](std::size_t n) -> std::size_t { return (n + 7u) & ~std::size_t{7u}; }; - // Allocate / grow the two device-side metadata arrays. Capacity is in u64 entries, kept at or above n_stacks. - // On GPU these buffers are written exclusively by the device-side sizer kernel (`runtime_eval_adstack_size_expr`); - // on CPU the host evaluator writes them directly via `std::memcpy`. Either way the pointers published into - // `runtime->adstack_offsets` / `adstack_max_sizes` stay stable across launches unless we grow here. 
- auto grow_to = [&](DeviceAllocationUnique &alloc, std::size_t capacity_u64) { - Device::AllocParams params{}; - params.size = capacity_u64 * sizeof(uint64_t); - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, "Failed to allocate {} bytes for adstack metadata array (err: {})", - params.size, int(res)); - alloc = std::make_unique(std::move(new_alloc)); - }; - if (n_stacks > adstack_metadata_capacity_) { - std::size_t new_cap = std::max(n_stacks, 2 * adstack_metadata_capacity_); - grow_to(adstack_offsets_alloc_, new_cap); - grow_to(adstack_max_sizes_alloc_, new_cap); - adstack_metadata_capacity_ = new_cap; - } - void *offsets_dev_ptr = get_device_alloc_info_ptr(*adstack_offsets_alloc_); - void *max_sizes_dev_ptr = get_device_alloc_info_ptr(*adstack_max_sizes_alloc_); - - auto copy_h2d = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - auto copy_d2h = [&](void *dst, const void *src, std::size_t bytes) { - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_device_to_host(dst, const_cast(src), bytes); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - std::memcpy(dst, src, bytes); - } - }; - - 
// Cache the runtime-field addresses on the first call; then publish the metadata-array pointers into the - // runtime struct. The stride field is written by the sizer on GPU and by this function on CPU, so we cache the - // address either way. - if (runtime_adstack_stride_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_metadata_field_ptrs", llvm_runtime_); - runtime_adstack_stride_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_offsets_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - runtime_adstack_max_sizes_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 2, result_buffer_cache_)); - } - copy_h2d(runtime_adstack_offsets_field_ptr_, &offsets_dev_ptr, sizeof(void *)); - copy_h2d(runtime_adstack_max_sizes_field_ptr_, &max_sizes_dev_ptr, sizeof(void *)); - - std::size_t stride = 0; - const bool is_gpu_llvm = (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu); - - // Host-eval fast path. The on-device sizer kernel exists to handle one specific leaf, `ExternalTensorRead`, - // whose ndarray data lives in GPU-private memory (`cudaMalloc` / `hipMalloc`, no UVA fallback) and thus - // cannot be touched from the host. Every other SizeExpr leaf - `Const`, `BoundVariable`, - // `ExternalTensorShape`, `FieldLoad` - is host-resolvable through the existing `evaluate_adstack_size_expr` - // path, so when the kernel's SizeExprs are all `ExternalTensorRead`-free we can skip the encode + bytecode - // h2d + sizer-kernel launch + d2h-stride pipeline entirely and write the metadata directly via `copy_h2d`. 
- // On CUDA the saved `cuMemcpyDtoH` for the per-launch stride readback is the dominant cost: every reverse- - // mode kernel launch in a 100-substep test paid one such synchronous DtoH each, and that compound stall - // accounted for the bulk of the GPU launch overhead under adstack mode. The condition is computed once per - // launch by scanning each stack's `nodes` vector for an `ExternalTensorRead` leaf; the scan is O(total - // SizeExpr nodes), well below the cost of the cheapest h2d / d2h on any LLVM GPU backend. - bool all_size_exprs_host_resolvable = true; - for (std::size_t i = 0; i < n_stacks && all_size_exprs_host_resolvable; ++i) { - if (i >= ad_stack.size_exprs.size()) { - continue; - } - for (const auto &node : ad_stack.size_exprs[i].nodes) { - if (static_cast(node.kind) == SizeExpr::Kind::ExternalTensorRead) { - all_size_exprs_host_resolvable = false; - break; - } - } - } - const bool use_host_eval = !is_gpu_llvm || all_size_exprs_host_resolvable; - if (use_host_eval) { - // CPU + GPU-without-ExternalTensorRead path: run the host evaluator directly. On CPU we use synchronous - // `copy_h2d` (just `std::memcpy` for that arch), but on CUDA / AMDGPU we ship the same payload through - // pinned-host memory via async `cuMemcpyHtoDAsync` / `hipMemcpyHtoDAsync` so the host returns immediately - // after queueing the copies on the default stream and the subsequent main-kernel launch (also on the - // default stream) stream-orders after the copies. The synchronous `cuMemcpyHtoD_v2` path used to block - // the host on every one of the three writes we issue per launch; with thousands of reverse-mode launches - // per `test_differentiable_rigid` run, those serial host stalls were a measurable fraction of wallclock. - // `FieldLoad` is serviced by `SNodeRwAccessorsBank` regardless of arch. 
- // Guard `program_impl_->program` lookups against the C++-only-tests setup where `program_impl_` itself is null; - // the on-device branch below already does this and falls back to `max_size_compile_time`. - Program *prog = (program_impl_ != nullptr) ? program_impl_->program : nullptr; - std::vector host_max_sizes(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - const SerializedSizeExpr *expr = (i < ad_stack.size_exprs.size()) ? &ad_stack.size_exprs[i] : nullptr; - int64_t v = -1; - if (expr != nullptr && !expr->nodes.empty() && prog != nullptr) { - v = evaluate_adstack_size_expr(*expr, prog, ctx); - } - if (v < 0) { - v = static_cast(ad_stack.allocas[i].max_size_compile_time); - } - host_max_sizes[i] = static_cast(std::max(v, 1)); - } - std::vector host_offsets(n_stacks); - for (std::size_t i = 0; i < n_stacks; ++i) { - host_offsets[i] = stride; - stride += align_up_8(sizeof(int64_t) + ad_stack.allocas[i].entry_size_bytes * host_max_sizes[i]); - } - uint64_t stride_u64 = static_cast(stride); - if (!is_gpu_llvm) { - copy_h2d(offsets_dev_ptr, host_offsets.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(max_sizes_dev_ptr, host_max_sizes.data(), n_stacks * sizeof(uint64_t)); - copy_h2d(runtime_adstack_stride_field_ptr_, &stride_u64, sizeof(uint64_t)); - } else { - // Three-block payload packed into the pinned-host scratch as `[stride_u64, offsets[n_stacks], - // max_sizes[n_stacks]]`. Three async DMAs land on the three target device addresses (the runtime - // struct's stride field, the offsets storage buffer, the max_sizes storage buffer) sourced from - // the corresponding offsets within the pinned scratch. The driver's H2D DMA engine reads from the - // pinned bytes at execution time, so we must not overwrite the scratch before all three copies - // have completed - hence the per-launch `event_record` after the last copy and the - // `event_synchronize` at the top of the next launch. 
The wait is typically a no-op because a few - // microseconds of small copies finish well before the host returns, dispatches the main kernel, - // and re-enters this function on the next launch. - const std::size_t header_bytes = sizeof(uint64_t); - const std::size_t array_bytes = n_stacks * sizeof(uint64_t); - const std::size_t total_bytes = header_bytes + 2 * array_bytes; - - auto wait_pending = [this]() { - if (!pinned_metadata_event_pending_) { - return; - } -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_synchronize(pinned_metadata_event_); - } -#endif - pinned_metadata_event_pending_ = false; - }; - - // Grow / first-allocate the pinned host scratch and the per-launch completion event. Doubling growth - // means the pinned alloc / free traffic is amortised to O(log peak_total_bytes) across a run. - if (total_bytes > pinned_metadata_scratch_capacity_) { - wait_pending(); - if (pinned_metadata_scratch_ != nullptr) { -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().mem_free_host(pinned_metadata_scratch_); - } -#endif - pinned_metadata_scratch_ = nullptr; - } - std::size_t new_capacity = std::max(total_bytes, 2 * pinned_metadata_scratch_capacity_); -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - // `hipHostMallocDefault == 0`. Coherent / portable / write-combined flags are intentionally not set; - // the workload is small payloads written linearly by the host and DMA-read by the GPU once. 
- AMDGPUDriver::get_instance().mem_alloc_host(&pinned_metadata_scratch_, new_capacity, 0u); - } -#endif - pinned_metadata_scratch_capacity_ = new_capacity; - } - if (pinned_metadata_event_ == nullptr) { - // `cuEventCreate` flag `0` (CU_EVENT_DEFAULT) means timing-enabled, which the driver costs us nothing - // to set up here and lets future profilers attach without re-creating the event. `hipEventCreateWithFlags` - // takes the same encoding. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - CUDADriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - AMDGPUDriver::get_instance().event_create(&pinned_metadata_event_, 0u); - } -#endif - } - // Block until any in-flight copies from the previous launch have finished pulling from the pinned scratch - // before we overwrite it. In steady state this is a no-op because the small DMAs finish well before the - // host loops back here; the wait exists only to defend against an unusual interleaving where the GPU - // queue is backlogged and the next launch enters this function before the previous launch's last copy - // has been consumed. - wait_pending(); - - auto *pinned = static_cast(pinned_metadata_scratch_); - pinned[0] = stride_u64; - std::memcpy(pinned + 1, host_offsets.data(), array_bytes); - std::memcpy(pinned + 1 + n_stacks, host_max_sizes.data(), array_bytes); - - // Queue the metadata copies on the same stream the subsequent main-kernel dispatch will run on, so the - // GPU stream-orders the copies before the kernel reads `adstack_max_sizes` etc. On CUDA the active - // stream is `CUDAContext::get_instance().get_stream()` - configurable via `set_stream`, defaults to the - // null stream - and `CUDAContext::launch` dispatches kernels on the same handle. AMDGPU has no - // public stream-selection API: `AMDGPUContext::launch` always passes `nullptr` to `hipLaunchKernel` - // (i.e. 
the default stream), so the copies match that. -#if defined(QD_WITH_CUDA) - if (config_.arch == Arch::cuda) { - void *active_stream = CUDAContext::get_instance().get_stream(); - CUDADriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, header_bytes, - active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, active_stream); - CUDADriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - CUDADriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif -#if defined(QD_WITH_AMDGPU) - if (config_.arch == Arch::amdgpu) { - void *active_stream = nullptr; // AMDGPUContext::launch always uses the default stream. - AMDGPUDriver::get_instance().memcpy_host_to_device_async(runtime_adstack_stride_field_ptr_, pinned, - header_bytes, active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(offsets_dev_ptr, pinned + 1, array_bytes, - active_stream); - AMDGPUDriver::get_instance().memcpy_host_to_device_async(max_sizes_dev_ptr, pinned + 1 + n_stacks, array_bytes, - active_stream); - AMDGPUDriver::get_instance().event_record(pinned_metadata_event_, active_stream); - } -#endif - pinned_metadata_event_pending_ = true; - } - } else { - // GPU (CUDA / AMDGPU): encode the SizeExpr trees into device bytecode, upload, launch the sizer runtime - // function, read back just the computed stride. The sizer kernel writes `adstack_max_sizes[]`, - // `adstack_offsets[]`, and `adstack_per_thread_stride` directly into the runtime struct and the metadata - // arrays above - no further host-writes to those fields are needed this launch. - // - // Why this architecture rather than host-eval: on CUDA / AMDGPU the ndarray data lives in GPU-private memory - // (plain `cudaMalloc` / `hipMalloc`, not managed / unified), so the host evaluator's `ExternalTensorRead` - // deref reads garbage. 
Moving the interpreter on-device keeps the pointer semantics intact - it reads the - // data pointer out of `ctx->arg_buffer` (which the kernel will read too) and dereferences it where the - // memory lives, with no migration / readback of the ndarray payload itself. - std::vector bytecode; - if (program_impl_ != nullptr && program_impl_->program != nullptr) { - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, program_impl_->program, ctx); - } else { - // No program attached (rare: C++-only tests that construct Program without a full runtime). Fall through - // to compile-time bounds by emitting an empty-tree bytecode - the device interpreter sees - // `root_node_idx == -1` for every stack and routes to `max_size_compile_time`. - bytecode = encode_adstack_size_expr_device_bytecode(ad_stack, nullptr, ctx); - } - // Grow the scratch buffer if the bytecode outgrew the cached capacity. Amortised doubling keeps the - // allocation traffic O(log max_bytecode_bytes) across a run. - const std::size_t bytecode_bytes = bytecode.size(); - if (bytecode_bytes > adstack_sizer_bytecode_capacity_) { - std::size_t new_cap = std::max(bytecode_bytes, 2 * adstack_sizer_bytecode_capacity_); - Device::AllocParams params{}; - params.size = new_cap; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack sizer bytecode scratch buffer (err: {})", params.size, - int(res)); - adstack_sizer_bytecode_alloc_ = std::make_unique(std::move(new_alloc)); - adstack_sizer_bytecode_capacity_ = new_cap; - } - void *bytecode_dev_ptr = get_device_alloc_info_ptr(*adstack_sizer_bytecode_alloc_); - copy_h2d(bytecode_dev_ptr, bytecode.data(), bytecode_bytes); - - // Invoke the device interpreter. 
On CUDA / AMDGPU `JITModule::call` launches this as a single-thread kernel - // on the default stream and stream-orders it before the subsequent main-kernel dispatch, so the writes we - // do here are visible by the time the user's kernel reads `adstack_max_sizes` etc. - // - // The sizer kernel dereferences `ctx->arg_buffer` on device (that's how it resolves `ExternalTensorRead` leaves - // against ndarray pointers the caller packed into the arg buffer). AMDGPU always stages a device-side copy of - // `RuntimeContext` because HIP has no UVA fallback and the host pointer faults with `hipErrorIllegalAddress`. CUDA - // stages the device copy only when the driver + kernel do not expose HMM / system-allocated memory (queried via - // `CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`): CUDA UVA covers pinned / CUDA-managed memory only, not the plain - // `std::make_unique()` backing, so a host pointer works on HMM-capable setups but faults otherwise - // (Turing without HMM, Windows, pre-535 Linux drivers) as `CUDA_ERROR_ILLEGAL_ADDRESS` at the next DtoH sync - // `illegal memory access ... while calling memcpy_device_to_host`. When the caller passes `nullptr` (HMM-capable - // CUDA) we fall back to the host pointer; the launcher gates the allocation so HMM-equipped setups pay no staging - // cost. - auto *const runtime_jit = get_runtime_jit_module(); - void *runtime_context_ptr_for_sizer = - device_runtime_context_ptr != nullptr ? 
device_runtime_context_ptr : static_cast(&ctx->get_context()); - runtime_jit->call("runtime_eval_adstack_size_expr", llvm_runtime_, - runtime_context_ptr_for_sizer, bytecode_dev_ptr); - - uint64_t stride_u64 = 0; - copy_d2h(&stride_u64, runtime_adstack_stride_field_ptr_, sizeof(uint64_t)); - stride = static_cast(stride_u64); - } - - std::size_t needed_bytes = stride * num_threads; - ensure_adstack_heap(needed_bytes); - return needed_bytes; -} - -void LlvmRuntimeExecutor::ensure_adstack_heap(std::size_t needed_bytes) { - if (needed_bytes == 0 || needed_bytes <= adstack_heap_size_) { - return; - } - // Amortized doubling keeps the number of re-allocations across a run bounded by log(peak_size). - std::size_t new_size = std::max(needed_bytes, std::size_t(2) * adstack_heap_size_); - - Device::AllocParams params{}; - params.size = new_size; - params.host_read = false; - params.host_write = false; - params.export_sharing = false; - params.usage = AllocUsage::Storage; - DeviceAllocation new_alloc; - RhiResult res = llvm_device()->allocate_memory(params, &new_alloc); - QD_ERROR_IF(res != RhiResult::success, - "Failed to allocate {} bytes for the adstack heap (err: {}). Consider lowering `ad_stack_size` or the " - "per-kernel reverse-mode adstack count.", - new_size, int(res)); - // `get_device_alloc_info_ptr` is the RHI-agnostic accessor that returns the raw host-visible - // pointer on CPU and the device-visible pointer on CUDA / AMDGPU (`get_memory_addr` is only - // implemented on the GPU devices, so we route through this helper instead). - void *new_ptr = get_device_alloc_info_ptr(new_alloc); - - auto new_guard = std::make_unique(std::move(new_alloc)); - - // Publish the new buffer pointer and size into the runtime struct. On CPU the runtime lives in host memory, - // so plain stores through the cached field pointers are correct. On CUDA / AMDGPU the runtime lives in device - // memory, so the host writes via the driver's host->device memcpy. 
The field-address query runs exactly once, - // on the first grow, and caches the two device pointers; every subsequent grow is just two 8-byte memcpys. - if (runtime_adstack_heap_buffer_field_ptr_ == nullptr) { - auto *const runtime_jit = get_runtime_jit_module(); - runtime_jit->call("runtime_get_adstack_heap_field_ptrs", llvm_runtime_); - runtime_adstack_heap_buffer_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id, result_buffer_cache_)); - runtime_adstack_heap_size_field_ptr_ = quadrants_union_cast_with_different_sizes( - fetch_result_uint64(quadrants_result_buffer_ret_value_id + 1, result_buffer_cache_)); - } - uint64 size_u64 = static_cast(new_size); - if (config_.arch == Arch::cuda) { -#if defined(QD_WITH_CUDA) - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, sizeof(void *)); - CUDADriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else if (config_.arch == Arch::amdgpu) { -#if defined(QD_WITH_AMDGPU) - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_buffer_field_ptr_, &new_ptr, - sizeof(void *)); - AMDGPUDriver::get_instance().memcpy_host_to_device(runtime_adstack_heap_size_field_ptr_, &size_u64, sizeof(uint64)); -#else - QD_NOT_IMPLEMENTED; -#endif - } else { - *reinterpret_cast(runtime_adstack_heap_buffer_field_ptr_) = new_ptr; - *reinterpret_cast(runtime_adstack_heap_size_field_ptr_) = size_u64; - } - - // Replace and release the old allocation. `DeviceAllocationGuard`'s destructor calls - // `llvm_device()->dealloc_memory`. The new slab has already been handed to `new_guard` above, so the move-assignment - // here is what destroys the *previous* guard - the new allocation is not the one being freed. Safety of the release - // depends on the backend: - // - CPU: host `std::free`. No GPU involved, always safe. 
- // - CUDA: `CudaDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=true)` -> - // `cuMemFree_v2`, which synchronizes with pending device work before returning. - // - AMDGPU: `AmdgpuDevice::dealloc_memory` routes through `DeviceMemoryPool::release(release_raw=false)` -> - // `CachingAllocator::release`, which pools the allocation *without* calling `hipFree` and *without* - // synchronizing. The physical memory stays mapped, so an in-flight kernel still holding the old base pointer - // keeps reading/writing valid storage. The cross-launch safety invariant for AMDGPU comes from - // `amdgpu::KernelLauncher::launch_llvm_kernel` ending with `hipFree(context_pointer)`, which synchronizes - // with all in-flight kernels launched during that call. By the time the *next* `launch_llvm_kernel` reaches - // `ensure_adstack_heap` and can destroy the previous guard, no GPU kernel from the prior call is still - // referencing the old slab. CUDA does not need this extra hop -- the `cuMemFree_v2` in the bullet above - // already syncs -- and the CUDA launcher correspondingly does not allocate a device-side `context_pointer` - // (it passes the `RuntimeContext` by host reference). 
- adstack_heap_alloc_ = std::move(new_guard); - adstack_heap_size_ = new_size; -} - void LlvmRuntimeExecutor::preallocate_runtime_memory() { if (preallocated_runtime_memory_allocs_ != nullptr) return; From f6fee4fbd2bcf3040b9edae6970294ed9daca671 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 02:21:09 -0700 Subject: [PATCH 079/109] Add test for qd_stream + autodiff kernel error guard Co-authored-by: Cursor --- tests/python/test_streams.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index 7f03703dac..db7588aaf7 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -258,6 +258,21 @@ def compute(): s.destroy() +@test_utils.test(arch=[qd.cuda]) +def test_stream_with_autodiff_kernel_raises(): + x = qd.field(qd.f32, shape=(), needs_grad=True) + loss = qd.field(qd.f32, shape=(), needs_grad=True) + + @qd.kernel + def compute(): + loss[None] = x[None] ** 2 + + s = qd.create_stream() + with pytest.raises(RuntimeError, match="not compatible with autodiff"): + compute.grad(qd_stream=s) + s.destroy() + + @test_utils.test(arch=[qd.cuda]) def test_stream_with_graph_raises(): N = 64 From 6e49c52d13f426dcac3c14b5b839059db2cb5839 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 03:12:15 -0700 Subject: [PATCH 080/109] Restore context_pointer free comment in AMDGPU kernel launcher The comment explains a non-obvious race condition: context_pointer must be freed directly (now via mem_free_async on active_stream) rather than through AMDGPUContext's deferred free list, because that list is drained by LlvmRuntimeExecutor::synchronize which can be called mid-launch. 
Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index ab34003cbd..42db3934dd 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -288,6 +288,11 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } else if (ctx.result_buffer_size > 0) { AMDGPUDriver::get_instance().stream_synchronize(active_stream); } + // Free the per-launch `RuntimeContext` on the active stream rather than through `AMDGPUContext`'s deferred free + // list. The deferred list is drained by `LlvmRuntimeExecutor::synchronize`, which is also called from + // `fetch_result_uint64` during `ensure_adstack_heap`'s field-pointer query -- that path would free + // `context_pointer` mid-launch, and HIP could recycle the address for the adstack heap allocated right after, + // clobbering the `RuntimeContext` the next task still reads from. AMDGPUDriver::get_instance().mem_free_async(context_pointer, active_stream); } From 9fd8b7b9d718948f09f1c4335bd7127946f20d16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 04:55:19 -0700 Subject: [PATCH 081/109] Extract stream/event methods from program.cpp into program_stream.cpp Move the 9 CUDA-only stream/event Program methods into a dedicated translation unit. The CMake glob on quadrants/program/* picks up the new file automatically. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 94 ------------------------ quadrants/program/program_stream.cpp | 103 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 94 deletions(-) create mode 100644 quadrants/program/program_stream.cpp diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 5abcd255b3..8f6fdb2186 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -20,11 +20,6 @@ #include "quadrants/codegen/llvm/struct_llvm.h" #endif -#ifdef QD_WITH_CUDA -#include "quadrants/rhi/cuda/cuda_driver.h" -#include "quadrants/rhi/cuda/cuda_context.h" -#endif - #ifdef QD_WITH_VULKAN #include "quadrants/runtime/program_impls/vulkan/vulkan_program.h" #include "quadrants/rhi/vulkan/vulkan_loader.h" @@ -494,93 +489,4 @@ void Program::enqueue_compute_op_lambda(std::functionenqueue_compute_op_lambda(op, image_refs); } -uint64 Program::stream_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *stream = nullptr; - CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); - return reinterpret_cast(stream); - } -#endif - return 0; -} - -void Program::stream_destroy(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::stream_synchronize(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::set_current_cuda_stream(uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - 
CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); - } -#endif -} - -uint64 Program::event_create() { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { - CUDAContext::get_instance().make_current(); - void *event = nullptr; - CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); - return reinterpret_cast(event); - } -#endif - return 0; -} - -void Program::event_destroy(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::event_record(uint64 event_handle, uint64 stream_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), - reinterpret_cast(stream_handle)); - } -#endif -} - -void Program::event_synchronize(uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); - } -#endif -} - -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { -#ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), - reinterpret_cast(event_handle), 0 /*flags*/); - } -#endif -} - } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp new file mode 100644 index 0000000000..b20252ddbc --- /dev/null +++ b/quadrants/program/program_stream.cpp @@ -0,0 +1,103 @@ +// Stream and event operations for the Program class. 
+// Extracted from program.cpp to keep backend-specific GPU stream/event +// lifecycle code separate from the core Program logic. + +#include "program.h" + +#ifdef QD_WITH_CUDA +#include "quadrants/rhi/cuda/cuda_driver.h" +#include "quadrants/rhi/cuda/cuda_context.h" +#endif + +namespace quadrants::lang { + +uint64 Program::stream_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *stream = nullptr; + CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); + return reinterpret_cast(stream); + } +#endif + return 0; +} + +void Program::stream_destroy(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::stream_synchronize(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && stream_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::set_current_cuda_stream(uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); + } +#endif +} + +uint64 Program::event_create() { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda) { + CUDAContext::get_instance().make_current(); + void *event = nullptr; + CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); + return reinterpret_cast(event); + } +#endif + return 0; +} + +void Program::event_destroy(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + 
CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::event_record(uint64 event_handle, uint64 stream_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), + reinterpret_cast(stream_handle)); + } +#endif +} + +void Program::event_synchronize(uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); + } +#endif +} + +void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +#ifdef QD_WITH_CUDA + if (compile_config().arch == Arch::cuda && event_handle != 0) { + CUDAContext::get_instance().make_current(); + CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), + reinterpret_cast(event_handle), 0 /*flags*/); + } +#endif +} + +} // namespace quadrants::lang From 9e6f865cfb29b78e5c99705b84e3a6a1bc80bc86 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:26:10 -0700 Subject: [PATCH 082/109] Introduce StreamManager delegate class for stream/event ops Move the CUDA stream/event logic into a StreamManager class (program_stream.h/.cpp). Program keeps its public API unchanged and delegates to stream_manager_ internally, so the pybind layer and Python code need no changes. 
Co-authored-by: Cursor --- quadrants/program/program.cpp | 1 + quadrants/program/program.h | 2 + quadrants/program/program_stream.cpp | 77 ++++++++++++++++++++-------- quadrants/program/program_stream.h | 31 +++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) create mode 100644 quadrants/program/program_stream.h diff --git a/quadrants/program/program.cpp b/quadrants/program/program.cpp index 8f6fdb2186..ff9901add5 100644 --- a/quadrants/program/program.cpp +++ b/quadrants/program/program.cpp @@ -60,6 +60,7 @@ Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { config = default_compile_config; config.arch = desired_arch; config.fit(); + stream_manager_ = StreamManager(config.arch); profiler = make_profiler(config.arch, config.kernel_profiler); if (arch_uses_llvm(config.arch)) { diff --git a/quadrants/program/program.h b/quadrants/program/program.h index fe2f30ca74..7fb6019026 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -21,6 +21,7 @@ #include "quadrants/program/kernel_profiler.h" #include "quadrants/program/snode_expr_utils.h" #include "quadrants/program/snode_rw_accessors_bank.h" +#include "quadrants/program/program_stream.h" #include "quadrants/program/context.h" #include "quadrants/struct/snode_tree.h" #include "quadrants/system/threading.h" @@ -338,6 +339,7 @@ class QD_DLL_EXPORT Program { private: CompileConfig compile_config_; + StreamManager stream_manager_{Arch::x64}; // re-initialized in constructor after arch is known uint64 ndarray_writer_counter_{0}; uint64 ndarray_reader_counter_{0}; diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b20252ddbc..442e0cfa8d 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ -// Stream and event operations for the Program class. -// Extracted from program.cpp to keep backend-specific GPU stream/event -// lifecycle code separate from the core Program logic. 
+// StreamManager implementation and Program delegation. +#include "program_stream.h" #include "program.h" #ifdef QD_WITH_CUDA @@ -11,9 +10,13 @@ namespace quadrants::lang { -uint64 Program::stream_create() { +// --------------------------------------------------------------------------- +// StreamManager +// --------------------------------------------------------------------------- + +uint64 StreamManager::create_stream() { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *stream = nullptr; CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/); @@ -23,36 +26,36 @@ uint64 Program::stream_create() { return 0; } -void Program::stream_destroy(uint64 stream_handle) { +void StreamManager::destroy_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } #endif } -void Program::stream_synchronize(uint64 stream_handle) { +void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda && stream_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } #endif } -void Program::set_current_cuda_stream(uint64 stream_handle) { +void StreamManager::set_current_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDAContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } #endif } -uint64 Program::event_create() { +uint64 StreamManager::create_event() { #ifdef QD_WITH_CUDA - if (compile_config().arch == 
Arch::cuda) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); void *event = nullptr; CUDADriver::get_instance().event_create(&event, 0x02 /*CU_EVENT_DISABLE_TIMING*/); @@ -62,18 +65,18 @@ uint64 Program::event_create() { return 0; } -void Program::event_destroy(uint64 event_handle) { +void StreamManager::destroy_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } #endif } -void Program::event_record(uint64 event_handle, uint64 stream_handle) { +void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -81,18 +84,18 @@ void Program::event_record(uint64 event_handle, uint64 stream_handle) { #endif } -void Program::event_synchronize(uint64 event_handle) { +void StreamManager::synchronize_event(uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } #endif } -void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { +void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #ifdef QD_WITH_CUDA - if (compile_config().arch == Arch::cuda && event_handle != 0) { + if (arch_ == Arch::cuda && event_handle != 0) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), 
reinterpret_cast(event_handle), 0 /*flags*/); @@ -100,4 +103,36 @@ void Program::stream_wait_event(uint64 stream_handle, uint64 event_handle) { #endif } +// --------------------------------------------------------------------------- +// Program delegation — keeps the pybind / Python API unchanged. +// --------------------------------------------------------------------------- + +uint64 Program::stream_create() { + return stream_manager_.create_stream(); +} +void Program::stream_destroy(uint64 h) { + stream_manager_.destroy_stream(h); +} +void Program::stream_synchronize(uint64 h) { + stream_manager_.synchronize_stream(h); +} +void Program::set_current_cuda_stream(uint64 h) { + stream_manager_.set_current_stream(h); +} +uint64 Program::event_create() { + return stream_manager_.create_event(); +} +void Program::event_destroy(uint64 h) { + stream_manager_.destroy_event(h); +} +void Program::event_record(uint64 eh, uint64 sh) { + stream_manager_.record_event(eh, sh); +} +void Program::event_synchronize(uint64 h) { + stream_manager_.synchronize_event(h); +} +void Program::stream_wait_event(uint64 sh, uint64 eh) { + stream_manager_.stream_wait_event(sh, eh); +} + } // namespace quadrants::lang diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h new file mode 100644 index 0000000000..ae6b7221d5 --- /dev/null +++ b/quadrants/program/program_stream.h @@ -0,0 +1,31 @@ +// StreamManager — manages CUDA stream and event lifecycle. +// Isolated from Program so that backend-specific GPU plumbing +// does not pollute the core Program interface. 
+ +#pragma once + +#include "quadrants/common/core.h" +#include "quadrants/util/lang_util.h" + +namespace quadrants::lang { + +class StreamManager { + public: + explicit StreamManager(Arch arch) : arch_(arch) {} + + uint64 create_stream(); + void destroy_stream(uint64 stream_handle); + void synchronize_stream(uint64 stream_handle); + void set_current_stream(uint64 stream_handle); + + uint64 create_event(); + void destroy_event(uint64 event_handle); + void record_event(uint64 event_handle, uint64 stream_handle); + void synchronize_event(uint64 event_handle); + void stream_wait_event(uint64 stream_handle, uint64 event_handle); + + private: + Arch arch_; +}; + +} // namespace quadrants::lang From 1c81322cbe0e418a6deaa765d877a505d29ced16 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:29:18 -0700 Subject: [PATCH 083/109] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From 84ba5b05b7d9d502eccace8f52e88ea9df0ccbc6 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:34:12 -0700 Subject: [PATCH 084/109] Fix clang-format in program_stream.h Co-authored-by: Cursor --- quadrants/program/program_stream.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index ae6b7221d5..54a8e88d0b 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -11,7 +11,8 @@ namespace 
quadrants::lang { class StreamManager { public: - explicit StreamManager(Arch arch) : arch_(arch) {} + explicit StreamManager(Arch arch) : arch_(arch) { + } uint64 create_stream(); void destroy_stream(uint64 stream_handle); From b1b4ee60b298aa3e7ea93903c3895dd5a59cf155 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 05:36:58 -0700 Subject: [PATCH 085/109] Remove Program wrapper methods, bind StreamManager directly via pybind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add stream_manager() accessor to Program. Update export_lang.cpp to call StreamManager methods through lambdas. Delete the 9 one-line delegation methods from Program — the declarations in program.h and definitions in program_stream.cpp are both gone. Co-authored-by: Cursor --- quadrants/program/program.h | 12 +++------- quadrants/program/program_stream.cpp | 33 ---------------------------- quadrants/python/export_lang.cpp | 18 +++++++-------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/quadrants/program/program.h b/quadrants/program/program.h index 7fb6019026..600533f1cf 100644 --- a/quadrants/program/program.h +++ b/quadrants/program/program.h @@ -320,15 +320,9 @@ class QD_DLL_EXPORT Program { return ndarrays_.size(); } - uint64 stream_create(); - void stream_destroy(uint64 stream_handle); - void stream_synchronize(uint64 stream_handle); - void set_current_cuda_stream(uint64 stream_handle); - uint64 event_create(); - void event_destroy(uint64 event_handle); - void event_record(uint64 event_handle, uint64 stream_handle); - void event_synchronize(uint64 event_handle); - void stream_wait_event(uint64 stream_handle, uint64 event_handle); + StreamManager &stream_manager() { + return stream_manager_; + } // TODO(zhanlue): Move these members and corresponding interfaces to // ProgramImpl Ideally, Program should serve as a pure interface class and all diff --git a/quadrants/program/program_stream.cpp 
b/quadrants/program/program_stream.cpp index 442e0cfa8d..b1c2429dd6 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -1,7 +1,6 @@ // StreamManager implementation and Program delegation. #include "program_stream.h" -#include "program.h" #ifdef QD_WITH_CUDA #include "quadrants/rhi/cuda/cuda_driver.h" @@ -103,36 +102,4 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) #endif } -// --------------------------------------------------------------------------- -// Program delegation — keeps the pybind / Python API unchanged. -// --------------------------------------------------------------------------- - -uint64 Program::stream_create() { - return stream_manager_.create_stream(); -} -void Program::stream_destroy(uint64 h) { - stream_manager_.destroy_stream(h); -} -void Program::stream_synchronize(uint64 h) { - stream_manager_.synchronize_stream(h); -} -void Program::set_current_cuda_stream(uint64 h) { - stream_manager_.set_current_stream(h); -} -uint64 Program::event_create() { - return stream_manager_.create_event(); -} -void Program::event_destroy(uint64 h) { - stream_manager_.destroy_event(h); -} -void Program::event_record(uint64 eh, uint64 sh) { - stream_manager_.record_event(eh, sh); -} -void Program::event_synchronize(uint64 h) { - stream_manager_.synchronize_event(h); -} -void Program::stream_wait_event(uint64 sh, uint64 eh) { - stream_manager_.stream_wait_event(sh, eh); -} - } // namespace quadrants::lang diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index 8cfdd78b5a..c46d40ac10 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -406,15 +406,15 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", &Program::stream_create) - 
.def("stream_destroy", &Program::stream_destroy) - .def("stream_synchronize", &Program::stream_synchronize) - .def("set_current_cuda_stream", &Program::set_current_cuda_stream) - .def("event_create", &Program::event_create) - .def("event_destroy", &Program::event_destroy) - .def("event_record", &Program::event_record) - .def("event_synchronize", &Program::event_synchronize) - .def("stream_wait_event", &Program::stream_wait_event) + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) From d3317f5cf00e4955095edefdeab68227426243c5 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 06:01:11 -0700 Subject: [PATCH 086/109] Fix AMDGPU branches in StreamManager: use arch_ member instead of compile_config() The base branch refactored stream/event methods from Program:: to StreamManager::, which stores the arch in arch_. 
Our AMDGPU branches still referenced compile_config().arch which is a Program method. Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 88288cc313..b4adc0226a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -28,7 +28,7 @@ uint64 StreamManager::create_stream() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); void *stream = nullptr; AMDGPUDriver::get_instance().stream_create(&stream, 0x1 /*HIP_STREAM_NON_BLOCKING*/); @@ -46,7 +46,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_destroy(reinterpret_cast(stream_handle)); } @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu && stream_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } @@ -76,7 +76,7 @@ void StreamManager::set_current_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUContext::get_instance().set_stream(reinterpret_cast(stream_handle)); } @@ -93,7 +93,7 @@ uint64 StreamManager::create_event() { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { 
AMDGPUContext::get_instance().make_current(); void *event = nullptr; AMDGPUDriver::get_instance().event_create(&event, 0x02 /*hipEventDisableTiming*/); @@ -111,7 +111,7 @@ void StreamManager::destroy_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_destroy(reinterpret_cast(event_handle)); } @@ -127,7 +127,7 @@ void StreamManager::record_event(uint64 event_handle, uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_record(reinterpret_cast(event_handle), reinterpret_cast(stream_handle)); @@ -143,7 +143,7 @@ void StreamManager::synchronize_event(uint64 event_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().event_synchronize(reinterpret_cast(event_handle)); } @@ -159,7 +159,7 @@ void StreamManager::stream_wait_event(uint64 stream_handle, uint64 event_handle) } #endif #ifdef QD_WITH_AMDGPU - if (compile_config().arch == Arch::amdgpu && event_handle != 0) { + if (arch_ == Arch::amdgpu && event_handle != 0) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_wait_event(reinterpret_cast(stream_handle), reinterpret_cast(event_handle), 0 /*flags*/); From 7e102672eab2ff2713c26cd90445566b81d57a53 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:18:07 -0700 Subject: [PATCH 087/109] Reflow comment in program_stream.h to 120-char width Co-authored-by: Cursor --- quadrants/program/program_stream.h | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/quadrants/program/program_stream.h b/quadrants/program/program_stream.h index 54a8e88d0b..69265c26b3 100644 --- a/quadrants/program/program_stream.h +++ b/quadrants/program/program_stream.h @@ -1,6 +1,5 @@ -// StreamManager — manages CUDA stream and event lifecycle. -// Isolated from Program so that backend-specific GPU plumbing -// does not pollute the core Program interface. +// StreamManager — manages CUDA stream and event lifecycle, isolated from Program so that backend-specific GPU +// plumbing does not pollute the core Program interface. #pragma once From 614c742cd9cfb0195ae32dedee09d4d7fd374949 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:19:08 -0700 Subject: [PATCH 088/109] Use captured prog_ref for all Stream/Event operations All methods on Stream and Event now resolve the Program through the captured weakref first, falling back to the current runtime only for externally-wrapped handles. Fixes a bug where destroy/synchronize/ record/wait would call into the wrong Program after qd.reset(). Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 063a2aeafc..85e7c1e86b 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -22,10 +22,16 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def synchronize(self): """Block until all operations on this stream complete.""" - prog = impl.get_runtime().prog - prog.stream_synchronize(self._handle) + self._prog().stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. 
@@ -33,8 +39,7 @@ def destroy(self): No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.stream_destroy(self._handle) + self._prog().stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -69,22 +74,26 @@ def __init__(self, handle: int, prog_ref: weakref.ref | None = None): def handle(self) -> int: return self._handle + def _prog(self): + if self._prog_ref is not None: + prog = self._prog_ref() + if prog is not None: + return prog + return impl.get_runtime().prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.event_record(self._handle, stream_handle) + self._prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" - prog = impl.get_runtime().prog stream_handle = qd_stream.handle if qd_stream is not None else 0 - prog.stream_wait_event(stream_handle, self._handle) + self._prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - prog = impl.get_runtime().prog - prog.event_synchronize(self._handle) + self._prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. @@ -92,8 +101,7 @@ def destroy(self): No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = impl.get_runtime().prog - prog.event_destroy(self._handle) + self._prog().event_destroy(self._handle) self._handle = 0 def __del__(self): From 3dad35ad4ad58bd92d034e0eb01a65a92705c897 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 09:31:35 -0700 Subject: [PATCH 089/109] Fix stale handle safety in Stream/Event after qd.reset() When _prog_ref is set but the weakref has expired (Program destroyed), _prog() now returns None instead of falling back to the current runtime. Active operations (synchronize, record, wait) raise RuntimeError; destroy silently no-ops and zeroes the handle. Also allow synchronize_stream(0) to sync the default stream in CUDA, matching cuStreamSynchronize(nullptr) semantics. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 41 ++++++++++++++++++---------- quadrants/program/program_stream.cpp | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 85e7c1e86b..5898cb434e 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -23,23 +23,28 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog def synchronize(self): """Block until all operations on this stream complete.""" - self._prog().stream_synchronize(self._handle) + prog = self._prog() + if prog is None: + raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") + prog.stream_synchronize(self._handle) def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. - No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). 
+ No-op if the owning Program has already been collected, or for streams wrapping external handles + (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - self._prog().stream_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): @@ -75,33 +80,41 @@ def handle(self) -> int: return self._handle def _prog(self): + """Resolve the owning Program, or None if the owner was collected.""" if self._prog_ref is not None: - prog = self._prog_ref() - if prog is not None: - return prog + return self._prog_ref() return impl.get_runtime().prog + def _require_prog(self): + prog = self._prog() + if prog is None: + raise RuntimeError("Event's owning Program has been destroyed (e.g. after qd.reset())") + return prog + def record(self, qd_stream: Stream | None = None): """Record this event on a stream. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().event_record(self._handle, stream_handle) + self._require_prog().event_record(self._handle, stream_handle) def wait(self, qd_stream: Stream | None = None): """Make a stream wait for this event. None means the default stream.""" stream_handle = qd_stream.handle if qd_stream is not None else 0 - self._prog().stream_wait_event(stream_handle, self._handle) + self._require_prog().stream_wait_event(stream_handle, self._handle) def synchronize(self): """Block the host until this event has been reached.""" - self._prog().event_synchronize(self._handle) + self._require_prog().event_synchronize(self._handle) def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). + No-op if the owning Program has already been collected, or for events wrapping external handles + (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - self._prog().event_destroy(self._handle) + prog = self._prog() + if prog is not None: + prog.event_destroy(self._handle) self._handle = 0 def __del__(self): diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index b1c2429dd6..8a7431532a 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -36,7 +36,7 @@ void StreamManager::destroy_stream(uint64 stream_handle) { void StreamManager::synchronize_stream(uint64 stream_handle) { #ifdef QD_WITH_CUDA - if (arch_ == Arch::cuda && stream_handle != 0) { + if (arch_ == Arch::cuda) { CUDAContext::get_instance().make_current(); CUDADriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } From bebc9040869cdbcdcf8094b80fb4c849f28f16ce Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:14:23 -0700 Subject: [PATCH 090/109] Extract stream/event pybind bindings into export_stream.cpp Move the 9 stream/event .def() bindings from export_lang.cpp into a new export_stream.cpp, following the existing export_math/export_misc pattern. Satisfies the feature-factorization check for the 1225-line export_lang.cpp. 
Co-authored-by: Cursor --- quadrants/python/export.h | 6 ++++++ quadrants/python/export_lang.cpp | 14 +++----------- quadrants/python/export_stream.cpp | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 quadrants/python/export_stream.cpp diff --git a/quadrants/python/export.h b/quadrants/python/export.h index 331c35b4b6..92736daedf 100644 --- a/quadrants/python/export.h +++ b/quadrants/python/export.h @@ -21,6 +21,10 @@ #include "quadrants/common/core.h" +namespace quadrants::lang { +class Program; +} // namespace quadrants::lang + namespace quadrants { namespace py = pybind11; @@ -33,4 +37,6 @@ void export_math(py::module &m); void export_misc(py::module &m); +void export_stream(py::module &m, py::class_ &program_class); + } // namespace quadrants diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index c46d40ac10..b3dc79bef5 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -314,8 +314,8 @@ void export_lang(py::module &m) { auto compiled_kernel_data = py::class_(m, "CompiledKernelData") .def("_debug_dump_to_string", &CompiledKernelData::debug_dump_to_string); - py::class_(m, "Program") - .def(py::init<>()) + auto program_class = py::class_(m, "Program"); + program_class.def(py::init<>()) .def( "ndarray_to_dlpack", [](Program *program, pybind11::object owner, Ndarray *ndarray, const std::vector &layout, @@ -406,20 +406,12 @@ void export_lang(py::module &m) { .def("compile_kernel", &Program::compile_kernel, py::return_value_policy::reference) .def("launch_kernel", &Program::launch_kernel) .def("get_device_caps", &Program::get_device_caps) - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) - .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) - .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) - 
.def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) - .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) - .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) - .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) - .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) - .def("stream_wait_event", [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }) .def("get_graph_cache_size", &Program::get_graph_cache_size) .def("get_graph_cache_used_on_last_call", &Program::get_graph_cache_used_on_last_call) .def("get_num_offloaded_tasks_on_last_call", &Program::get_num_offloaded_tasks_on_last_call) .def("get_graph_num_nodes_on_last_call", &Program::get_graph_num_nodes_on_last_call) .def("get_graph_total_builds", &Program::get_graph_total_builds); + export_stream(m, program_class); py::class_(m, "CompileResult") .def_property_readonly( diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp new file mode 100644 index 0000000000..f3f2fad525 --- /dev/null +++ b/quadrants/python/export_stream.cpp @@ -0,0 +1,26 @@ +/******************************************************************************* + Copyright (c) The Quadrants Authors (2016- ). All Rights Reserved. + The use of this software is governed by the LICENSE file. 
+*******************************************************************************/ + +#include "quadrants/python/export.h" +#include "quadrants/program/program.h" + +namespace quadrants { + +void export_stream(py::module &m, py::class_ &program_class) { + using lang::Program; + program_class + .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) + .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) + .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) + .def("event_create", [](Program *p) { return p->stream_manager().create_event(); }) + .def("event_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_event(h); }) + .def("event_record", [](Program *p, uint64 eh, uint64 sh) { p->stream_manager().record_event(eh, sh); }) + .def("event_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_event(h); }) + .def("stream_wait_event", + [](Program *p, uint64 sh, uint64 eh) { p->stream_manager().stream_wait_event(sh, eh); }); +} + +} // namespace quadrants From b4450f7c1837e3fb603ddf267fb0a01a8f781154 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:19:23 -0700 Subject: [PATCH 091/109] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- a/quadrants/python/export_stream.cpp +++ b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", 
[](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From b6cd986a9f319a9d5d9c9c1dd5bce239feb1af97 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:21:49 -0700 Subject: [PATCH 092/109] Fix clang-format line break in CUDA kernel launcher Co-authored-by: Cursor --- quadrants/runtime/cuda/kernel_launcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index a97ba400d5..9558c57d66 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -141,8 +141,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = prepare_task(j, t); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {&ctx.get_context()}, {}); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); } for (auto &[sid, s] : stream_by_id) { From 3b09331daf736eb85d220bdd6760dd5e5e553bd2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:24:59 -0700 Subject: [PATCH 093/109] Fix clang-format in export_stream.cpp Co-authored-by: Cursor --- quadrants/python/export_stream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/quadrants/python/export_stream.cpp b/quadrants/python/export_stream.cpp index f3f2fad525..66b3c8a3d7 100644 --- a/quadrants/python/export_stream.cpp +++ 
b/quadrants/python/export_stream.cpp @@ -10,8 +10,7 @@ namespace quadrants { void export_stream(py::module &m, py::class_ &program_class) { using lang::Program; - program_class - .def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) + program_class.def("stream_create", [](Program *p) { return p->stream_manager().create_stream(); }) .def("stream_destroy", [](Program *p, uint64 h) { p->stream_manager().destroy_stream(h); }) .def("stream_synchronize", [](Program *p, uint64 h) { p->stream_manager().synchronize_stream(h); }) .def("set_current_cuda_stream", [](Program *p, uint64 h) { p->stream_manager().set_current_stream(h); }) From af4a30615f6e625fd47ef4cb30cb2259993e5df4 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 10:29:18 -0700 Subject: [PATCH 094/109] Skip coverage probes in stream_parallel exclusivity check; restore deleted comments The Linux build CI runs with QD_KERNEL_COVERAGE=1, which injects _qd_cov[probe_id] = 1 Assign nodes before each statement in the kernel body. _validate_stream_parallel_exclusivity was rejecting these probes as non-stream_parallel statements. Add _is_coverage_probe() to skip them. Also restores the 4 safety comments in CUDA kernel_launcher.cpp's prepare_task lambda that were flagged by the deleted-comments check, fixes clang-format line break, and reflows the symbol_resolver.py docstring to 120 characters. 
Co-authored-by: Cursor --- .../function_def_transformer.py | 13 ++++++++++ python/quadrants/lang/ast/symbol_resolver.py | 4 +-- quadrants/runtime/cuda/kernel_launcher.cpp | 25 +++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py index 123767be55..142694091f 100644 --- a/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py +++ b/python/quadrants/lang/ast/ast_transformers/function_def_transformer.py @@ -535,6 +535,17 @@ def _is_docstring(stmt: ast.stmt, index: int) -> bool: and isinstance(stmt.value, (ast.Constant, ast.Str)) ) + @staticmethod + def _is_coverage_probe(stmt: ast.stmt) -> bool: + if not isinstance(stmt, ast.Assign) or len(stmt.targets) != 1: + return False + target = stmt.targets[0] + return ( + isinstance(target, ast.Subscript) + and isinstance(target.value, ast.Name) + and target.value.id.startswith("_qd_cov") + ) + @staticmethod def _validate_stream_parallel_exclusivity( body: list[ast.stmt], global_vars: dict[str, Any] @@ -547,6 +558,8 @@ def _validate_stream_parallel_exclusivity( for i, stmt in enumerate(body): if FunctionDefTransformer._is_docstring(stmt, i): continue + if FunctionDefTransformer._is_coverage_probe(stmt): + continue if not FunctionDefTransformer._is_stream_parallel_with(stmt, global_vars): stmt_desc = f"{type(stmt).__name__}" if isinstance(stmt, ast.With) and stmt.items: diff --git a/python/quadrants/lang/ast/symbol_resolver.py b/python/quadrants/lang/ast/symbol_resolver.py index f95373a463..c2b4fcaffe 100644 --- a/python/quadrants/lang/ast/symbol_resolver.py +++ b/python/quadrants/lang/ast/symbol_resolver.py @@ -60,8 +60,8 @@ def resolve_to(node, wanted, scope): def resolve_value(node, scope): """Resolve an AST Name/Attribute node to a Python object. 
- Same traversal as resolve_to but returns the resolved object (or None) - instead of comparing against a wanted value. + Same traversal as resolve_to but returns the resolved object (or None) instead of comparing against a wanted + value. """ if isinstance(node, ast.Name): return scope.get(node.id) if isinstance(scope, dict) else None diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 658b2089d9..b08af6733e 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -81,8 +81,19 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = task.grid_dim; if (!task.ad_stack.allocas.empty()) { std::size_t n = resolve_num_threads(task.ad_stack, executor); + // Pass the device-side `RuntimeContext` pointer through to the adstack sizer kernel. Without it the sizer + // launches with a host pointer and the next DtoH sync trips `CUDA_ERROR_ILLEGAL_ADDRESS ... + // memcpy_device_to_host` on GPUs whose driver + kernel cannot coherently access pageable host memory (the HMM + // capability gated below in `launch_llvm_kernel`). `nullptr` on HMM-capable setups keeps + // `publish_adstack_metadata`'s host-pointer fast path. executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.ad_stack.bound_expr.has_value()) { + // Reducer length is the gating ndarray's full flat element count, not `n`: the lazy row-claim atomic-rmw + // fires once per LCA execution, and `gpu_parallel_struct_for` / `gpu_parallel_range_for` grid-stride (`i += + // grid_dim()`) so a single dispatched thread can hit the LCA many times across one launch when the logical + // loop span exceeds the (capped) concurrent thread count. Walking the reducer over the full ndarray length + // keeps `bound_row_capacities[task_index]` consistent with the total claim count, which the codegen-emitted + // bounds clamp reads. 
Mirrors the CPU launcher's `bound_count_length` derivation. std::size_t bound_count_length = n; if (task.ad_stack.bound_expr->field_source_kind == StaticAdStackBoundExpr::FieldSourceKind::NdArray && !task.ad_stack.bound_expr->ndarray_arg_id.empty() && task.ad_stack.bound_expr->ndarray_ndim > 0 && @@ -92,6 +103,11 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, std::vector indices = task.ad_stack.bound_expr->ndarray_arg_id; indices.push_back(TypeFactory::SHAPE_POS_IN_NDARRAY); indices.push_back(axis); + // get_struct_arg_host (NOT get_struct_arg): `launch_llvm_kernel` above has already swapped + // `ctx_->arg_buffer` to a device pointer, so a plain `get_struct_arg` here would dereference device + // memory from the host - SIGSEGV / CUDA_ERROR_ILLEGAL_ADDRESS on drivers without HMM, garbage + // `flat_len` on HMM-capable setups. The host backing buffer (`arg_buffer_`) stays host-resident across + // the swap and holds the same shape entries, so the host-safe variant is byte-equivalent here. flat_len *= int64_t(ctx.get_struct_arg_host(indices)); } bound_count_length = static_cast(std::max(0, flat_len)); @@ -100,6 +116,11 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, device_context_ptr); executor->ensure_per_task_float_heap_post_reducer(task_index, task.ad_stack, n); } + // Floor division (not ceiling): the heap-row count `n` resolved by `resolve_num_threads` floors at + // `kAdStackMaxConcurrentThreads`, so dispatching `cap_blocks * block_dim` threads must not exceed that count. + // Ceiling division would over-dispatch by `block_dim - 1` threads when `block_dim` does not divide + // `kAdStackMaxConcurrentThreads` evenly (e.g. `block_dim=192`: `ceil(65536/192)*192 = 65664`), and threads + // with `linear_thread_idx >= 65536` would index past the heap end. 
if (task.block_dim > 0) { const std::size_t cap_blocks = std::max(1u, kAdStackMaxConcurrentThreads / static_cast(task.block_dim)); @@ -143,8 +164,8 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, int effective_grid_dim = prepare_task(j, t); CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {&ctx.get_context()}, {}); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, + {}); } for (auto &[sid, s] : stream_by_id) { From e8d9cf0413588ddfd1c51967407d53d8c657136e Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:18:08 -0700 Subject: [PATCH 095/109] Allow synchronizing the default AMDGPU stream (handle 0) The stream_handle != 0 guard made synchronize_stream a no-op for the default stream on AMDGPU, unlike the CUDA path. HIP supports hipStreamSynchronize(nullptr), so remove the guard to match CUDA semantics. 
Co-authored-by: Cursor --- quadrants/program/program_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quadrants/program/program_stream.cpp b/quadrants/program/program_stream.cpp index 31fb12e76d..9686a86332 100644 --- a/quadrants/program/program_stream.cpp +++ b/quadrants/program/program_stream.cpp @@ -61,7 +61,7 @@ void StreamManager::synchronize_stream(uint64 stream_handle) { } #endif #ifdef QD_WITH_AMDGPU - if (arch_ == Arch::amdgpu && stream_handle != 0) { + if (arch_ == Arch::amdgpu) { AMDGPUContext::get_instance().make_current(); AMDGPUDriver::get_instance().stream_synchronize(reinterpret_cast(stream_handle)); } From 48c3922acac9a7959cda3fbec90aaa4cdbabbb1a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 12:20:34 -0700 Subject: [PATCH 096/109] Fall back to current runtime for Stream/Event destroy after reset When the owning Program has been collected (e.g. after qd.reset()), destroy() and __del__ now fall back to the current runtime's Program to free the underlying CUDA resource. This is safe because CUDAContext is a singleton, so stream/event handles remain valid across Programs. Prevents resource leaks in create/reset cycles. Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 42 ++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 5898cb434e..6187b6f9c4 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -35,21 +35,34 @@ def synchronize(self): raise RuntimeError("Stream's owning Program has been destroyed (e.g. after qd.reset())") prog.stream_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA stream handle remains valid. 
+ """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the stream. Safe to call multiple times. - No-op if the owning Program has already been collected, or for streams wrapping external handles - (created via Stream(ptr) without a prog_ref). + No-op for streams wrapping external handles (created via Stream(ptr) without a prog_ref). """ if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.stream_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.stream_destroy(self._handle) @@ -105,21 +118,34 @@ def synchronize(self): """Block the host until this event has been reached.""" self._require_prog().event_synchronize(self._handle) + def _destroy_prog(self): + """Resolve a Program for resource cleanup. + + Falls back to the current runtime when the owner has been collected, which is safe because + CUDAContext is a singleton so the CUDA event handle remains valid. + """ + prog = self._prog() + if prog is None: + try: + return impl.get_runtime().prog + except Exception: + return None + return prog + def destroy(self): """Explicitly destroy the event. Safe to call multiple times. - No-op if the owning Program has already been collected, or for events wrapping external handles - (created via Event(ptr) without a prog_ref). + No-op for events wrapping external handles (created via Event(ptr) without a prog_ref). 
""" if self._handle != 0 and self._prog_ref is not None: - prog = self._prog() + prog = self._destroy_prog() if prog is not None: prog.event_destroy(self._handle) self._handle = 0 def __del__(self): if self._handle != 0 and self._prog_ref is not None: - prog = self._prog_ref() + prog = self._destroy_prog() if prog is not None: try: prog.event_destroy(self._handle) From 44ee707afa655e728ebf452d0e4102de3e75da7f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sat, 2 May 2026 15:04:03 -0700 Subject: [PATCH 097/109] Reflow _destroy_prog docstrings to 120-char width Co-authored-by: Cursor --- python/quadrants/lang/stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/quadrants/lang/stream.py b/python/quadrants/lang/stream.py index 6187b6f9c4..e87816568c 100644 --- a/python/quadrants/lang/stream.py +++ b/python/quadrants/lang/stream.py @@ -38,8 +38,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA stream handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA stream handle remains valid. """ prog = self._prog() if prog is None: @@ -121,8 +121,8 @@ def synchronize(self): def _destroy_prog(self): """Resolve a Program for resource cleanup. - Falls back to the current runtime when the owner has been collected, which is safe because - CUDAContext is a singleton so the CUDA event handle remains valid. + Falls back to the current runtime when the owner has been collected, which is safe because CUDAContext is a + singleton so the CUDA event handle remains valid. 
""" prog = self._prog() if prog is None: From ac4b825074b40d65ee8dc367510d7d3d6557c59b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 3 May 2026 02:53:16 -0700 Subject: [PATCH 098/109] Guard stream-parallel cleanup with exception safety Wrap the launch+synchronize section in try/catch so that acquired streams are returned to the pool and active_stream is restored even when a launch or stream_synchronize throws. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 28 +++++++++++++------- quadrants/runtime/cuda/kernel_launcher.cpp | 28 +++++++++++++------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 96ac406902..8cdd9fa3c1 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -141,17 +141,25 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, } } - for (size_t j = group_start; j < i; j++) { - const auto &t = offloaded_tasks[j]; - int effective_grid_dim = prepare_task(j, t); - AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - amdgpu_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, - {(void *)&context_pointer}, {arg_size}); - } + try { + for (size_t j = group_start; j < i; j++) { + const auto &t = offloaded_tasks[j]; + int effective_grid_dim = prepare_task(j, t); + AMDGPUContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); + QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); + amdgpu_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {(void *)&context_pointer}, {arg_size}); + } - for (auto &[sid, s] : stream_by_id) { - AMDGPUDriver::get_instance().stream_synchronize(s); + for (auto &[sid, s] : 
stream_by_id) { + AMDGPUDriver::get_instance().stream_synchronize(s); + } + } catch (...) { + for (auto &[sid, s] : stream_by_id) { + AMDGPUContext::get_instance().release_stream(s); + } + AMDGPUContext::get_instance().set_stream(active_stream); + throw; } for (auto &[sid, s] : stream_by_id) { AMDGPUContext::get_instance().release_stream(s); diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 0a0d6faafc..17a04067a4 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -160,17 +160,25 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, } } - for (size_t j = group_start; j < i; j++) { - const auto &t = offloaded_tasks[j]; - int effective_grid_dim = prepare_task(j, t); - CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); - QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); - cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, {&ctx.get_context()}, - {}); - } + try { + for (size_t j = group_start; j < i; j++) { + const auto &t = offloaded_tasks[j]; + int effective_grid_dim = prepare_task(j, t); + CUDAContext::get_instance().set_stream(stream_by_id[t.stream_parallel_group_id]); + QD_TRACE("Launching kernel {}<<<{}, {}>>>", t.name, effective_grid_dim, t.block_dim); + cuda_module->launch(t.name, effective_grid_dim, t.block_dim, t.dynamic_shared_array_bytes, + {&ctx.get_context()}, {}); + } - for (auto &[sid, s] : stream_by_id) { - CUDADriver::get_instance().stream_synchronize(s); + for (auto &[sid, s] : stream_by_id) { + CUDADriver::get_instance().stream_synchronize(s); + } + } catch (...) 
{ + for (auto &[sid, s] : stream_by_id) { + CUDAContext::get_instance().release_stream(s); + } + CUDAContext::get_instance().set_stream(active_stream); + throw; } for (auto &[sid, s] : stream_by_id) { CUDAContext::get_instance().release_stream(s); From 65d5cb92d0d8fa8cd82bf7b26eb7744e76d40c98 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 3 May 2026 03:37:40 -0700 Subject: [PATCH 099/109] Restore explanatory comments removed during stream-parallel refactor The prepare_task lambda extraction dropped several non-obvious comments explaining adstack gate roles, lazy-claim buffer rationale, device-side reducer mechanics, shape-entry unit-stability, and grid-dim capping rationale. Restore them. Co-authored-by: Cursor --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 +++ quadrants/runtime/cuda/kernel_launcher.cpp | 24 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 8cdd9fa3c1..e74152927b 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -61,6 +61,9 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const bool any_lazy_task = std::any_of(offloaded_tasks.begin(), offloaded_tasks.end(), [](const OffloadedTask &t) { return t.ad_stack.bound_expr.has_value(); }); if (any_lazy_task) { + // Allocate / reset the per-kernel lazy-claim arrays once before the first task. See the matching CPU launcher + // block for rationale; on AMDGPU the same memcpy_host_to_device path through the cached field pointers publishes + // the cleared counter and UINT32_MAX-defaulted capacity arrays. 
executor->publish_adstack_lazy_claim_buffers(offloaded_tasks.size()); } diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 17a04067a4..4653ddb55a 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -70,9 +70,18 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // Two gates govern the per-launch adstack publish work, both opt-in by the kernel's IR shape. Forward-only kernels // skip both gates and pay zero adstack overhead; reverse-mode kernels without a captured `bound_expr` skip the // lazy-claim block, paying the per-task `publish_adstack_metadata` only. + // - `any_adstack`: at least one task has an `AdStackAllocaStmt`. Gates the per-task `publish_adstack_metadata` + // call (sets per-thread stride for the codegen heap-base addressing). + // - `any_lazy_task`: at least one task has a captured `bound_expr` (the codegen routes such tasks through the + // lazy LCA-block atomic-rmw row claim, which reads `runtime->adstack_row_counters[task_id]` and + // `runtime->adstack_bound_row_capacities[task_id]`). Gates `publish_adstack_lazy_claim_buffers` and the + // per-task reducer dispatch + DtoH heap sizing. const bool any_lazy_task = std::any_of(offloaded_tasks.begin(), offloaded_tasks.end(), [](const OffloadedTask &t) { return t.ad_stack.bound_expr.has_value(); }); if (any_lazy_task) { + // Allocate / reset the per-kernel lazy-claim arrays once before the first task. See the matching CPU launcher + // block for rationale; on CUDA the same memcpy_host_to_device path through the cached field pointers publishes + // the cleared counter and UINT32_MAX-defaulted capacity arrays. executor->publish_adstack_lazy_claim_buffers(offloaded_tasks.size()); } @@ -88,6 +97,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // `publish_adstack_metadata`'s host-pointer fast path. 
executor->publish_adstack_metadata(task.ad_stack, n, &ctx, device_context_ptr); if (task.ad_stack.bound_expr.has_value()) { + // Device-side reducer for tasks with a captured ndarray-backed `bound_expr`: a single-thread CUDA kernel + // walks the gating ndarray, counts gate-passing threads, writes the count into + // `runtime->adstack_bound_row_capacities[task_index]`. The codegen-emitted clamp at the float LCA-block + // claim site reads it back. Tasks without a captured gate keep the UINT32_MAX default and the clamp stays + // inert. + // // Reducer length is the gating ndarray's full flat element count, not `n`: the lazy row-claim atomic-rmw // fires once per LCA execution, and `gpu_parallel_struct_for` / `gpu_parallel_range_for` grid-stride (`i += // grid_dim()`) so a single dispatched thread can hit the LCA many times across one launch when the logical @@ -98,6 +113,9 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, if (task.ad_stack.bound_expr->field_source_kind == StaticAdStackBoundExpr::FieldSourceKind::NdArray && !task.ad_stack.bound_expr->ndarray_arg_id.empty() && task.ad_stack.bound_expr->ndarray_ndim > 0 && ctx.args_type != nullptr) { + // Length = product of shape entries via `args_type`. See `runtime/cpu/kernel_launcher.cpp` for the + // unit-stability rationale; `array_runtime_sizes` carries different units depending on the dispatch entry + // point and would undercount by `sizeof(elem)`x for `qd.ndarray` arguments. int64_t flat_len = 1; for (int axis = 0; axis < task.ad_stack.bound_expr->ndarray_ndim; ++axis) { std::vector indices = task.ad_stack.bound_expr->ndarray_arg_id; @@ -119,6 +137,12 @@ void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, // dispatched-threads worst case on sparse-grid workloads. 
executor->ensure_per_task_float_heap_post_reducer(task_index, task.ad_stack, n, &ctx); } + // For adstack-bearing tasks, dispatch at most `kAdStackMaxConcurrentThreads` (matching the heap row count + // resolved above). The runtime's grid-strided loop (`gpu_parallel_struct_for` / `gpu_parallel_range_for`, + // `quadrants/runtime/llvm/runtime_module/runtime.cpp`) walks the full element list / range with + // `i += grid_dim()`, so a smaller grid completes the same workload sequentially per slot. Tasks without an + // adstack keep the codegen-emitted `task.grid_dim` (saturating_grid_dim) for max throughput. + // // Floor division (not ceiling): the heap-row count `n` resolved by `resolve_num_threads` floors at // `kAdStackMaxConcurrentThreads`, so dispatching `cap_blocks * block_dim` threads must not exceed that count. // Ceiling division would over-dispatch by `block_dim - 1` threads when `block_dim` does not divide From b5554ca6267ce618b70490c4b77436a906a7314b Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 5 May 2026 01:13:15 -0700 Subject: [PATCH 100/109] Fix clang-format line length in kernel launchers --- quadrants/runtime/amdgpu/kernel_launcher.cpp | 3 ++- quadrants/runtime/cuda/kernel_launcher.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index 79ef2ef740..aede51b290 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -324,7 +324,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } int arg_size = sizeof(RuntimeContext *); if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), active_stream); + AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), + active_stream); } void *context_pointer 
= launcher_ctx.runtime_context_dev_ptr; AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index fb68dfcbb4..0f5aa38fba 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -401,7 +401,8 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), + active_stream); } device_context_ptr = launcher_ctx.runtime_context_dev_ptr; CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), From 8a7cdd795633e825e52521e2966f1873a2ccd7c7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 5 May 2026 02:26:16 -0700 Subject: [PATCH 101/109] Use default stream for persistent buffer alloc/free Persistent scratch buffers (result_buffer, arg_buffer, runtime_context) must use nullptr (default stream) for malloc_async/mem_free_async so the operations serialize with all non-blocking streams. Using active_stream caused use-after-free when the active stream changed between launches. 
--- quadrants/runtime/amdgpu/kernel_launcher.cpp | 11 +++++------ quadrants/runtime/cuda/kernel_launcher.cpp | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/quadrants/runtime/amdgpu/kernel_launcher.cpp b/quadrants/runtime/amdgpu/kernel_launcher.cpp index aede51b290..88e64ce7ac 100644 --- a/quadrants/runtime/amdgpu/kernel_launcher.cpp +++ b/quadrants/runtime/amdgpu/kernel_launcher.cpp @@ -230,10 +230,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx const std::size_t needed_result = std::max(ctx.result_buffer_size, sizeof(uint64)); if (needed_result > persistent_result_buffer_capacity_) { if (persistent_result_buffer_dev_ptr_ != nullptr) { - AMDGPUDriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, active_stream); + AMDGPUDriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, nullptr); } const std::size_t new_cap = std::max(needed_result, 2 * persistent_result_buffer_capacity_); - AMDGPUDriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, active_stream); + AMDGPUDriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, nullptr); persistent_result_buffer_capacity_ = new_cap; } device_result_buffer = static_cast(persistent_result_buffer_dev_ptr_); @@ -311,10 +311,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx if (ctx.arg_buffer_size > 0) { if (ctx.arg_buffer_size > launcher_ctx.arg_buffer_capacity) { if (launcher_ctx.arg_buffer_dev_ptr != nullptr) { - AMDGPUDriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, active_stream); + AMDGPUDriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, nullptr); } const std::size_t new_cap = std::max(ctx.arg_buffer_size, 2 * launcher_ctx.arg_buffer_capacity); - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, active_stream); + 
AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, nullptr); launcher_ctx.arg_buffer_capacity = new_cap; } device_arg_buffer = static_cast(launcher_ctx.arg_buffer_dev_ptr); @@ -324,8 +324,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx } int arg_size = sizeof(RuntimeContext *); if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), - active_stream); + AMDGPUDriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), nullptr); } void *context_pointer = launcher_ctx.runtime_context_dev_ptr; AMDGPUDriver::get_instance().memcpy_host_to_device_async(context_pointer, &ctx.get_context(), sizeof(RuntimeContext), diff --git a/quadrants/runtime/cuda/kernel_launcher.cpp b/quadrants/runtime/cuda/kernel_launcher.cpp index 0f5aa38fba..1e44305024 100644 --- a/quadrants/runtime/cuda/kernel_launcher.cpp +++ b/quadrants/runtime/cuda/kernel_launcher.cpp @@ -278,10 +278,10 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx const std::size_t needed_result = std::max(ctx.result_buffer_size, sizeof(uint64)); if (needed_result > persistent_result_buffer_capacity_) { if (persistent_result_buffer_dev_ptr_ != nullptr) { - CUDADriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, active_stream); + CUDADriver::get_instance().mem_free_async(persistent_result_buffer_dev_ptr_, nullptr); } const std::size_t new_cap = std::max(needed_result, 2 * persistent_result_buffer_capacity_); - CUDADriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, active_stream); + CUDADriver::get_instance().malloc_async(&persistent_result_buffer_dev_ptr_, new_cap, nullptr); persistent_result_buffer_capacity_ = new_cap; } device_result_buffer = static_cast(persistent_result_buffer_dev_ptr_); @@ -367,10 +367,10 @@ void 
KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx if (ctx.arg_buffer_size > 0) { if (ctx.arg_buffer_size > launcher_ctx.arg_buffer_capacity) { if (launcher_ctx.arg_buffer_dev_ptr != nullptr) { - CUDADriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, active_stream); + CUDADriver::get_instance().mem_free_async(launcher_ctx.arg_buffer_dev_ptr, nullptr); } const std::size_t new_cap = std::max(ctx.arg_buffer_size, 2 * launcher_ctx.arg_buffer_capacity); - CUDADriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.arg_buffer_dev_ptr, new_cap, nullptr); launcher_ctx.arg_buffer_capacity = new_cap; } device_arg_buffer = static_cast(launcher_ctx.arg_buffer_dev_ptr); @@ -401,8 +401,7 @@ void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx void *device_context_ptr = nullptr; if (needs_sizer_device_ctx) { if (launcher_ctx.runtime_context_dev_ptr == nullptr) { - CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), - active_stream); + CUDADriver::get_instance().malloc_async(&launcher_ctx.runtime_context_dev_ptr, sizeof(RuntimeContext), nullptr); } device_context_ptr = launcher_ctx.runtime_context_dev_ptr; CUDADriver::get_instance().memcpy_host_to_device_async(device_context_ptr, &ctx.get_context(), From 594bb8a782f924ee5e5ed937a07ec6dd1fb75ff1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:28:54 -0700 Subject: [PATCH 102/109] Update streams doc: rename fill_a/fill_b, remove redundant synchronize Rename fill_a/fill_b to some_func1/some_func2 in explicit stream examples. Remove redundant synchronize() from context manager example since destroy() already waits for in-flight work. 
--- docs/source/user_guide/streams.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 26ea154321..7158647a18 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -63,8 +63,8 @@ For cases that require manual control — such as launching separate kernels on s1 = qd.create_stream() s2 = qd.create_stream() -fill_a(qd_stream=s1) -fill_b(qd_stream=s2) +some_func1(qd_stream=s1) +some_func2(qd_stream=s2) s1.synchronize() s2.synchronize() @@ -115,9 +115,8 @@ Streams and events support `with` blocks for automatic cleanup: ```python with qd.create_stream() as s: - fill_a(qd_stream=s) - s.synchronize() -# s.destroy() called automatically + some_func1(qd_stream=s) +# s.destroy() called automatically — waits for in-flight work ``` ### PyTorch interop (CUDA) From bfa9ff977c62caff137f207808edfeb0a8c25895 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:29:44 -0700 Subject: [PATCH 103/109] Remove incorrect claim about data corruption without stream management --- docs/source/user_guide/streams.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 7158647a18..6ff231f7f1 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -121,7 +121,7 @@ with qd.create_stream() as s: ### PyTorch interop (CUDA) -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption. +When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. 
#### Running Quadrants kernels on PyTorch's stream From c38f53e9b4dd7a8d14c06c713f165c91c07763ff Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:30:25 -0700 Subject: [PATCH 104/109] Remove PyTorch interop section from streams doc --- docs/source/user_guide/streams.md | 36 ------------------------------- 1 file changed, 36 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 6ff231f7f1..ab0625235d 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -119,42 +119,6 @@ with qd.create_stream() as s: # s.destroy() called automatically — waits for in-flight work ``` -### PyTorch interop (CUDA) - -When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. - -#### Running Quadrants kernels on PyTorch's stream - -```python -import torch -from quadrants.lang.stream import Stream - -torch_stream_ptr = torch.cuda.current_stream().cuda_stream -stream = Stream(torch_stream_ptr) - -physics_kernel(qd_stream=stream) -observations = compute_obs_tensor() # PyTorch op on the same stream -apply_actions_kernel(qd_stream=stream) -``` - -Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream. - -#### Running PyTorch operations on a Quadrants stream - -```python -qd_stream = qd.create_stream() -torch_stream = torch.cuda.ExternalStream(qd_stream.handle) - -with torch.cuda.stream(torch_stream): - physics_kernel(qd_stream=qd_stream) - observations = compute_obs_tensor() - apply_actions_kernel(qd_stream=qd_stream) - -qd_stream.destroy() -``` - -`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly. - ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. 
From c8d779261c0561f5a74b55a52846de3262138320 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:32:21 -0700 Subject: [PATCH 105/109] Move sync behavior notes out of Limitations into own section --- docs/source/user_guide/streams.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index ab0625235d..6ea695e476 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -119,9 +119,12 @@ with qd.create_stream() as s: # s.destroy() called automatically — waits for in-flight work ``` +## Synchronization notes + +- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. +- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. `stream_parallel` handles this automatically. + ## Limitations - **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. - **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. -- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for. -- **No automatic synchronization with explicit streams.** When using explicit streams, you are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input. `stream_parallel` handles this automatically. 
From 8ef3a0bd0578e4d2c36283f908129f7b8a843536 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:34:45 -0700 Subject: [PATCH 106/109] Revert qd.sync() to default-stream-only synchronization context_synchronize/device_synchronize waits on all streams, which contradicts the documented behavior that qd.sync() only waits on the default stream. stream_parallel already synchronizes its pooled streams before returning, so a global barrier is unnecessary. --- quadrants/runtime/llvm/llvm_runtime_executor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/quadrants/runtime/llvm/llvm_runtime_executor.cpp b/quadrants/runtime/llvm/llvm_runtime_executor.cpp index 02b2b3d8a8..663e8ffbb0 100644 --- a/quadrants/runtime/llvm/llvm_runtime_executor.cpp +++ b/quadrants/runtime/llvm/llvm_runtime_executor.cpp @@ -201,15 +201,13 @@ Program *LlvmRuntimeExecutor::get_program() const { void LlvmRuntimeExecutor::synchronize() { if (config_.arch == Arch::cuda) { #if defined(QD_WITH_CUDA) - CUDAContext::get_instance().make_current(); - CUDADriver::get_instance().context_synchronize(); + CUDADriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No CUDA support"); #endif } else if (config_.arch == Arch::amdgpu) { #if defined(QD_WITH_AMDGPU) - AMDGPUContext::get_instance().make_current(); - AMDGPUDriver::get_instance().device_synchronize(); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); #else QD_ERROR("No AMDGPU support"); #endif From cf09b26df9ef729e6d6cfa85f886836b1e025090 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 07:50:23 -0700 Subject: [PATCH 107/109] Clarify that qd_stream is implicit in any @qd.kernel call Address review comment: make explicit that qd_stream is a special keyword argument handled by the @qd.kernel decorator, not something the user declares in the kernel signature. 
--- docs/source/user_guide/streams.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 6ea695e476..37d8967eeb 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -59,12 +59,19 @@ For cases that require manual control — such as launching separate kernels on ### Creating and using streams +Any `@qd.kernel` function accepts a special `qd_stream` keyword argument — you do not need to declare it in the kernel signature. The `@qd.kernel` decorator handles it automatically. + ```python +@qd.kernel +def my_kernel(): + for i in range(N): + a[i] = i + s1 = qd.create_stream() s2 = qd.create_stream() -some_func1(qd_stream=s1) -some_func2(qd_stream=s2) +my_kernel(qd_stream=s1) +my_kernel(qd_stream=s2) s1.synchronize() s2.synchronize() @@ -73,7 +80,7 @@ s1.destroy() s2.destroy() ``` -Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. +Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes. ### Events From 5f36533bc5fead1ca433675f013bd54898a5e563 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 08:05:16 -0700 Subject: [PATCH 108/109] Note that graph/autodiff + qd_stream raises RuntimeError --- docs/source/user_guide/streams.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/streams.md b/docs/source/user_guide/streams.md index 37d8967eeb..a8db331bcc 100644 --- a/docs/source/user_guide/streams.md +++ b/docs/source/user_guide/streams.md @@ -133,5 +133,5 @@ with qd.create_stream() as s: ## Limitations -- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`. 
-- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context. +- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True` (if you do, a `RuntimeError` will be raised). +- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context (if you do, a `RuntimeError` will be raised). From b298d92eb5d1c4b6777c2d10d555288b665eb2a3 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 7 May 2026 09:14:20 -0700 Subject: [PATCH 109/109] Add tests for build_With error branches Cover all 5 error paths in ASTTransformer.build_With: multiple context managers, with-as syntax, non-call expression, non-stream_parallel context manager, and stream_parallel inside @qd.func. --- tests/python/test_streams.py | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tests/python/test_streams.py b/tests/python/test_streams.py index fbf7abb155..b89a3b4a42 100644 --- a/tests/python/test_streams.py +++ b/tests/python/test_streams.py @@ -538,3 +538,111 @@ def parallel_fill( qd.sync() assert np.allclose(a.to_numpy(), v), f"iteration {iteration}" assert np.allclose(b.to_numpy(), v * 2.0), f"iteration {iteration}" + + +@test_utils.test() +def test_with_multiple_context_managers_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="single context manager"): + + @qd.kernel + def bad(): + with qd.stream_parallel(), qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_as_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, 
match="with .* as"): + + @qd.kernel + def bad(): + with qd.stream_parallel() as s: + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_non_call_expression_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + dummy = qd.stream_parallel + + with pytest.raises(QuadrantsSyntaxError, match="requires a call expression"): + + @qd.kernel + def bad(): + with dummy: + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_with_non_stream_parallel_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + def other_ctx(): + pass + + with pytest.raises(QuadrantsSyntaxError, match="only supports qd.stream_parallel"): + + @qd.kernel + def bad(): + with other_ctx(): + for i in range(N): + a[i] = 1.0 + + bad() + + +@test_utils.test() +def test_stream_parallel_in_func_rejected(): + import pytest + + from quadrants.lang.exception import QuadrantsSyntaxError + + N = 64 + a = qd.field(qd.f32, shape=(N,)) + + with pytest.raises(QuadrantsSyntaxError, match="only be used inside @qd.kernel"): + + @qd.func + def helper(): + with qd.stream_parallel(): + for i in range(N): + a[i] = 1.0 + + @qd.kernel + def bad(): + helper() + + bad()