feat(cuda.core): cu12 fallback for prefetch_batch (N3)

rparolin · claude · rparolin · commit d75a7bd49c3e · 2026-05-01T10:04:41.000-07:00
Per Leo's review on PR #1775 (_managed_memory_ops.pyx:228), raising NotImplementedError from prefetch_batch on cu12 forces users to write their own loop. The CUDA driver semantics of cuMemPrefetchBatchAsync are equivalent to issuing one cuMemPrefetchAsync call per range; the batched form is just more efficient because the batching happens at the driver level. On cu12 builds (where cuMemPrefetchBatchAsync is not exposed), _do_batch_prefetch now falls back to a loop that calls _do_single_prefetch, and therefore cuMemPrefetchAsync, once per buffer. The single-range path (_do_single_prefetch) already works on cu12 via the IF/ELSE split inside it. Note that this fallback applies only to prefetch_batch: discard_batch and discard_prefetch_batch keep raising NotImplementedError on cu12 because the driver has no single-range cuMemDiscardAsync or cuMemDiscardAndPrefetchAsync to fall back to. The test skips for cuMemPrefetchBatchAsync unavailability were dropped from TestPrefetchBatch.test_same_location and TestPrefetchBatch.test_per_buffer_location, since the fallback path now runs on cu12 builds too. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -280,10 +280,11 @@ def prefetch_batch(buffers, locations, *, stream):
     stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
         Stream for the asynchronous prefetch (keyword-only).
 
-    Raises
-    ------
-    NotImplementedError
-        On a CUDA 12 build of ``cuda.core``.
+    Notes
+    -----
+    On a CUDA 12 build, falls back to a Python-level loop calling
+    ``cuMemPrefetchAsync`` per buffer (no batched driver entry point on
+    CUDA 12). CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly.
     """
     cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch")
     cdef Py_ssize_t n = len(bufs)
@@ -364,9 +365,13 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
     IF CUDA_CORE_BUILD_MAJOR >= 13:
         _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync)
     ELSE:
-        raise NotImplementedError(
-            "batched prefetch requires a CUDA 13 build of cuda.core"
-        )
+        # cu12 has no cuMemPrefetchBatchAsync; loop per-range.
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        cdef Py_ssize_t n = len(bufs)
+        for i in range(n):
+            buf = <Buffer>bufs[i]
+            _do_single_prefetch(buf, locs[i], s)
 
 
 def discard_prefetch_batch(buffers, locations, *, stream):
diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py
@@ -349,8 +349,6 @@ def test_same_location(self, init_cuda):
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
-        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
-            pytest.skip("cuMemPrefetchBatchAsync unavailable")
         device.set_current()
         mr = create_managed_memory_resource_or_skip()
         bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)]
@@ -372,8 +370,6 @@ def test_per_buffer_location(self, init_cuda):
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
-        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
-            pytest.skip("cuMemPrefetchBatchAsync unavailable")
         device.set_current()
         mr = create_managed_memory_resource_or_skip()
         bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]