2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/1-bug_report.yml
@@ -39,7 +39,7 @@ body:
- CUB
- libcu++
- CUDA Experimental (cudax)
- cuda.coop (Python)
- cuda.coop.numba_cuda (Python)
- cuda.compute (Python)
- General CCCL
- Infrastructure
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/2-feature_request.yml
@@ -23,7 +23,7 @@ body:
- CUB
- libcu++
- CUDA Experimental (cudax)
- cuda.coop (Python)
- cuda.coop.numba_cuda (Python)
- cuda.compute (Python)
- General CCCL
- Infrastructure
2 changes: 1 addition & 1 deletion .github/release.yml
@@ -7,7 +7,7 @@ changelog:
- title: 📚 Libcudacxx
labels:
- "libcu++"
- title: 🤝 cuda.coop
- title: 🤝 cuda.coop.numba_cuda
labels:
- "cuda.coop"
- title: ⚡ cuda.compute
4 changes: 2 additions & 2 deletions AGENTS.md
@@ -206,7 +206,7 @@ Supported versions: `3.10`, `3.11`, `3.12`, `3.13`
### Modules

* **cuda.compute** — Device-level algorithms, iterators, custom GPU types
* **cuda.coop** — Block/warp-level primitives
* **cuda.coop.numba_cuda** — Block/warp-level primitives for Numba CUDA
* **cuda.cccl.headers** — Programmatic access to headers

### Installation
@@ -245,7 +245,7 @@ Requirements:
import cuda.compute
result = cuda.compute.reduce_into(input_array, output_scalar, init_val, binary_op)

from cuda import coop
import cuda.coop.numba_cuda as coop
@cuda.jit
def kernel(data):
coop.block.reduce(data, binary_op)
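The AGENTS.md hunk above moves callers from `from cuda import coop` to `import cuda.coop.numba_cuda as coop`. As a rough sketch of what a downstream migration script could look like (the helper name and rewrite patterns here are hypothetical, not part of this PR), a regex-based import rewriter:

```python
import re

# Hypothetical migration helper (not part of this PR): rewrites the old
# cuda.coop import spellings shown in the diff to the new
# cuda.coop.numba_cuda spellings, preserving indentation.
_REWRITES = [
    # "from cuda import coop"  ->  "import cuda.coop.numba_cuda as coop"
    (re.compile(r"^(\s*)from cuda import coop\s*$", re.MULTILINE),
     r"\1import cuda.coop.numba_cuda as coop"),
    # "from cuda.coop import block, warp"
    #   ->  "from cuda.coop.numba_cuda import block, warp"
    (re.compile(r"^(\s*)from cuda\.coop import (.+)$", re.MULTILINE),
     r"\1from cuda.coop.numba_cuda import \2"),
]


def migrate_imports(source: str) -> str:
    """Apply each rewrite pattern in order and return the migrated source."""
    for pattern, replacement in _REWRITES:
        source = pattern.sub(replacement, source)
    return source
```

For example, `migrate_imports("from cuda import coop")` yields `"import cuda.coop.numba_cuda as coop"`; lines that already use the new path, or import unrelated modules, pass through unchanged.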
2 changes: 1 addition & 1 deletion ci/bench.template.yaml
@@ -38,7 +38,7 @@ benchmarks:
# Examples:
# - 'compute/reduce/sum\.py'
# - 'compute/transform/.*\.py'
# - 'coop/bench_warp_reduce\.py'
# - 'coop/numba_cuda/bench_warp_reduce\.py'

# Select GPUs. These are limited and shared, be intentional and conservative.
gpus:
2 changes: 1 addition & 1 deletion ci/bench.yaml
@@ -38,7 +38,7 @@ benchmarks:
# Examples:
# - 'compute/reduce/sum\.py'
# - 'compute/transform/.*\.py'
# - 'coop/bench_warp_reduce\.py'
# - 'coop/numba_cuda/bench_warp_reduce\.py'

# Select GPUs. These are limited and shared, be intentional and conservative.
gpus:
2 changes: 1 addition & 1 deletion ci/bench/README.md
@@ -71,7 +71,7 @@ For Python benchmarks, `compare_paths.sh`:
Python filters are regex patterns matched against relative paths under `python/cuda_cccl/benchmarks/`, for example:
- `compute/reduce/sum\.py` — single benchmark
- `compute/transform/.*\.py` — all transform benchmarks
- `coop/.*\.py` — all coop benchmarks
- `coop/numba_cuda/.*\.py` — all coop benchmarks

## Artifacts

2 changes: 1 addition & 1 deletion ci/matrix.yaml
@@ -507,7 +507,7 @@ jobs:
# Python:
build_py_wheel: { name: "Build cuda.cccl", gpu: false, invoke: { prefix: 'build_cuda_cccl'} }
test_py_headers: { name: "Test cuda.cccl.headers", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_headers'} }
test_py_coop: { name: "Test cuda.coop", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
test_py_coop: { name: "Test cuda.coop.numba_cuda", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }

2 changes: 1 addition & 1 deletion ci/test_cuda_coop_python.sh
@@ -27,4 +27,4 @@ python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]"

# Run tests for coop module
cd "/home/coder/cccl/python/cuda_cccl/tests/"
python -m pytest -n auto -v coop/
python -m pytest -n auto -v coop/numba_cuda/
2 changes: 1 addition & 1 deletion ci/windows/build_cuda_cccl_python.ps1
@@ -1,6 +1,6 @@
<#
.SYNOPSIS
Build Python cuda-cccl wheels for the cuda.compute and cuda.coop packages
Build Python cuda-cccl wheels for the cuda.compute and cuda.coop.numba_cuda packages
on Windows.

.DESCRIPTION
2 changes: 1 addition & 1 deletion ci/windows/test_cuda_coop_python.ps1
@@ -20,6 +20,6 @@ ${wheelPath} = Get-CudaCcclWheel

Push-Location (Join-Path (Get-RepoRoot) "python/cuda_cccl/tests")
try {
& $python -m pytest -n auto -v coop/
& $python -m pytest -n auto -v coop/numba_cuda/
}
finally { Pop-Location }
4 changes: 2 additions & 2 deletions cub/cub/block/block_radix_sort.cuh
@@ -150,12 +150,12 @@ CUB_NAMESPACE_BEGIN
//!
//! .. code-block:: python
//!
//! from cuda import coop
//! import cuda.coop.numba_cuda as coop
//! from pynvjitlink import patch
//! patch.patch_numba_linker(lto=True)
//!
//! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
//! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
//! block_radix_sort = coop.block.make_radix_sort_keys(numba.int32, 128, 4)
//! temp_storage_bytes = block_radix_sort.temp_storage_bytes
//!
//! @cuda.jit(link=block_radix_sort.files)
16 changes: 8 additions & 8 deletions docs/python/coop.rst
@@ -1,27 +1,27 @@
.. _cccl-python-coop:

``cuda.coop``: Cooperative Algorithms
=====================================
``cuda.coop.numba_cuda``: Cooperative Algorithms
================================================

The ``cuda.coop`` library provides cooperative algorithms that operate
The ``cuda.coop.numba_cuda`` library provides cooperative algorithms that operate
at the level of blocks and warps. It is designed to be used within
`Numba CUDA kernels <https://numba.readthedocs.io/en/stable/cuda/kernels.html>`_.

Here's an example showing how to use the ``cuda.coop`` library to
Here's an example showing how to use the ``cuda.coop.numba_cuda`` library to
perform block-level reduction within a Numba CUDA kernel.

.. literalinclude:: ../../python/cuda_cccl/tests/coop/examples/block/reduce.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/examples/block/reduce.py
:language: python
:pyobject: custom_reduce_example
:caption: Block-level reduction example. `View complete source on GitHub <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/coop/examples/block/reduce.py>`__
:caption: Block-level reduction example. `View complete source on GitHub <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/coop/numba_cuda/examples/block/reduce.py>`__

Example Collections
-------------------

For complete runnable examples and more advanced usage patterns, see our
full collection of `examples <https://github.com/NVIDIA/CCCL/tree/main/python/cuda_cccl/tests/coop/examples>`_.
full collection of `examples <https://github.com/NVIDIA/CCCL/tree/main/python/cuda_cccl/tests/coop/numba_cuda/examples>`_.

API Reference
-------------

- :ref:`cuda_coop-module`
- :ref:`cuda_coop_numba_cuda-module`
12 changes: 6 additions & 6 deletions docs/python/coop_api.rst
@@ -1,19 +1,19 @@
.. _cuda_coop-module:
.. _cuda_coop_numba_cuda-module:

``cuda.coop`` API Reference
===========================
``cuda.coop.numba_cuda`` API Reference
======================================

.. warning::
``cuda.coop`` is in public beta.
``cuda.coop.numba_cuda`` is in public beta.
The API is subject to change without notice.

.. automodule:: cuda.coop.warp
.. automodule:: cuda.coop.numba_cuda.warp
:members:
:undoc-members:
:imported-members:


.. automodule:: cuda.coop.block
.. automodule:: cuda.coop.numba_cuda.block
:members:
:undoc-members:
:imported-members:
2 changes: 1 addition & 1 deletion docs/python/index.rst
@@ -11,7 +11,7 @@ abstractions for CUDA Python developers.
* :doc:`cuda.compute <compute/index>` — Composable device-level primitives for building
custom parallel algorithms, without writing CUDA kernels directly.

* :doc:`cuda.coop <coop>` — Cooperative block- and warp-level algorithms for
* :doc:`cuda.coop.numba_cuda <coop>` — Cooperative block- and warp-level algorithms for
writing highly efficient CUDA kernels with `Numba CUDA <https://nvidia.github.io/numba-cuda/>`_.

These libraries expose the generic, highly-optimized algorithms from the
2 changes: 1 addition & 1 deletion docs/python/resources.rst
@@ -7,7 +7,7 @@ Examples
For recipes and patterns, see our examples:

* ``cuda.compute`` `examples <https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/compute/examples>`_
* ``cuda.coop`` `examples <https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/examples>`_
* ``cuda.coop.numba_cuda`` `examples <https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/numba_cuda/examples>`_

CUB and Thrust Documentation
----------------------------
4 changes: 2 additions & 2 deletions python/cuda_cccl/README.md
@@ -6,7 +6,7 @@ provides a Pythonic interface to the
It provides the following modules:

- **`cuda.compute`** - Device-level parallel algorithms (reduce, scan, sort, etc.) and iterators
- **`cuda.coop`** - Block and warp-level cooperative primitives for custom CUDA kernels
- **`cuda.coop.numba_cuda`** - Block and warp-level cooperative primitives for custom CUDA kernels

## Installation

@@ -46,4 +46,4 @@ For complete documentation, examples, and API reference, visit:

- **Full Documentation**: [nvidia.github.io/cccl/python](https://nvidia.github.io/cccl/python)
- **Repository**: [github.com/NVIDIA/cccl](https://github.com/NVIDIA/cccl)
- **Examples**: [github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/compute/examples](https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/compute/examples) and [github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/examples](https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/examples)
- **Examples**: [github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/compute/examples](https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/compute/examples) and [github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/numba_cuda/examples](https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/coop/numba_cuda/examples)
@@ -9,7 +9,7 @@
from numba.core.extending import intrinsic

import cuda.bindings.driver as driver
from cuda import coop
import cuda.coop.numba_cuda as coop
from cuda.core import Device


9 changes: 0 additions & 9 deletions python/cuda_cccl/cuda/cccl/cooperative/__init__.py

This file was deleted.

24 changes: 0 additions & 24 deletions python/cuda_cccl/cuda/cccl/cooperative/experimental/__init__.py

This file was deleted.

7 changes: 3 additions & 4 deletions python/cuda_cccl/cuda/coop/__init__.py
@@ -1,8 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from . import block, warp
from ._types import StatefulFunction
"""Cooperative algorithm backends for CUDA Python DSLs."""

__all__ = ["block", "warp", "StatefulFunction"]
__all__: list[str] = []
8 changes: 8 additions & 0 deletions python/cuda_cccl/cuda/coop/numba_cuda/__init__.py
@@ -0,0 +1,8 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from . import block, warp
from ._types import StatefulFunction

__all__ = ["block", "warp", "StatefulFunction"]
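The two `__init__.py` hunks above split the package: the parent `cuda.coop` becomes a shell that exports nothing, while the Numba CUDA backend lives in the `numba_cuda` subpackage and must be imported explicitly. A toy, self-contained illustration of that layout (the `coop_demo`/`numba_backend` names are invented for the demo, not the real package names):

```python
import importlib
import pathlib
import sys
import tempfile

# Build a throwaway package on disk mirroring the parent/backend split.
root = pathlib.Path(tempfile.mkdtemp())
backend_dir = root / "coop_demo" / "numba_backend"
backend_dir.mkdir(parents=True)
(root / "coop_demo" / "__init__.py").write_text(
    '"""Parent package: backends are opt-in imports."""\n'
    "__all__: list[str] = []\n"
)
(backend_dir / "__init__.py").write_text(
    '__all__ = ["block"]\nblock = "block-level primitives"\n'
)

sys.path.insert(0, str(root))
parent = importlib.import_module("coop_demo")

# Importing the parent alone does NOT pull in the backend.
assert not hasattr(parent, "numba_backend")

# The backend is loaded only when asked for by its full dotted path.
backend = importlib.import_module("coop_demo.numba_backend")
assert backend.block == "block-level primitives"
```

The design choice this mirrors: keeping the parent's `__all__` empty means `import cuda.coop` stays cheap and side-effect-free, and each DSL backend pays its import cost only when a user opts in.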
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""
cuda.coop._scan_op
cuda.coop.numba_cuda._scan_op
=============================

This module implements the ``ScanOp`` class and related functions.
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""
cuda.coop.block_exchange
cuda.coop.numba_cuda.block_exchange
====================================

This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -126,9 +126,9 @@ def kernel(thread_data):
:param block_exchange_type: Exchange mode to perform. Currently, only
:py:attr:`StripedToBlocked` is supported.
:param dtype: Data type of input and output values.
:type dtype: :py:class:`cuda.coop._typing.DtypeType`
:type dtype: :py:class:`cuda.coop.numba_cuda._typing.DtypeType`
:param threads_per_block: Number of threads in the block.
:type threads_per_block: :py:class:`cuda.coop._typing.DimType`
:type threads_per_block: :py:class:`cuda.coop.numba_cuda._typing.DimType`
:param items_per_thread: Number of items owned by each thread.
:type items_per_thread: int
:param warp_time_slicing: Whether to use warp time-slicing. If true,
@@ -143,7 +143,7 @@

:raises ValueError: If ``items_per_thread`` is less than 1.

:returns: An :py:class:`cuda.coop._types.Invocable` object
:returns: An :py:class:`cuda.coop.numba_cuda._types.Invocable` object
representing the specialized kernel callable from a Numba JIT'd
CUDA kernel.

Expand Down
@@ -80,7 +80,7 @@ def make_load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
128 integer items by 32 threads, with each thread handling
4 integers.

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_load_store_api.py
:language: python
:dedent:
:start-after: example-begin imports
@@ -89,7 +89,7 @@ def make_load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
The following snippet shows how to invoke the returned
``block_load`` and ``block_store`` primitives:

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_load_store_api.py
:language: python
:dedent:
:start-after: example-begin load_store
@@ -182,7 +182,7 @@ def make_store(dtype, threads_per_block, items_per_thread=1, algorithm="direct")
128 integer items by 32 threads, with each thread handling
4 integers.

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_load_store_api.py
:language: python
:dedent:
:start-after: example-begin imports
@@ -191,7 +191,7 @@ def make_store(dtype, threads_per_block, items_per_thread=1, algorithm="direct")
The following snippet shows how to invoke the returned
``block_load`` and ``block_store`` primitives:

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_load_store_api.py
:language: python
:dedent:
:start-after: example-begin load_store
@@ -44,7 +44,7 @@ def make_merge_sort_keys(
:ref:`blocked arrangement <flexible-data-arrangement>` across
128 threads where each thread owns 4 consecutive keys.

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_merge_sort_api.py
:language: python
:dedent:
:start-after: example-begin imports
@@ -53,7 +53,7 @@
The following snippet shows how to invoke the returned
``block_merge_sort`` primitive:

.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
.. literalinclude:: ../../python/cuda_cccl/tests/coop/numba_cuda/test_block_merge_sort_api.py
:language: python
:dedent:
:start-after: example-begin merge-sort