Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 49 additions & 5 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,47 @@ jobs:
path = pathlib.Path("pyproject.toml")
text = path.read_text()

def remove_toml_array(text, key):
    """Drop a multi-line TOML array assignment (``key = [ ... ]``) from text.

    Scans line by line; when a line beginning with ``key = [`` is found, that
    line and every following line are discarded until the square brackets
    balance out again. All other lines are kept verbatim, with their original
    line endings preserved.

    Note: brackets are counted textually, so a string value containing an
    unbalanced ``[`` or ``]`` would skew the count — fine for the simple
    dependency arrays this script targets.
    """
    kept = []
    remaining = iter(text.splitlines(keepends=True))
    opener = f"{key} = ["
    for line in remaining:
        if line.startswith(opener):
            # Consume the array body until brackets balance (or input ends).
            balance = line.count("[") - line.count("]")
            while balance > 0:
                follow = next(remaining, None)
                if follow is None:
                    break
                balance += follow.count("[") - follow.count("]")
            continue
        kept.append(line)
    return "".join(kept)

# Rename package
text = text.replace(
'name = "rapids-singlecell"',
f'name = "rapids-singlecell-cu{cuda}"',
)
# Rename matching extra to "rapids", remove the other
text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
# Remove the other CUDA extra line entirely
lines = text.splitlines(keepends=True)
text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
text = text.replace(f'rapids-cu{cuda} = [', 'rapids = [')
text = remove_toml_array(text, f"rapids-cu{other}")

# librmm is needed at build time because CMake links the CUDA
# extension against librmm. Add the matching wheel to the isolated
# PEP 517 build requirements after selecting the CUDA package variant.
for dep in (
f' "librmm-cu{other}>=25.10",\n',
f' "rmm-cu{other}>=25.10",\n',
):
text = text.replace(dep, "")
rmm_build_req = f' "librmm-cu{cuda}>=25.10",\n'
build_system_text = text.split("[project]", 1)[0]
if f'"librmm-cu{cuda}>=25.10"' not in build_system_text:
text = text.replace(
']\nbuild-backend = "scikit_build_core.build"',
f'{rmm_build_req}]\nbuild-backend = "scikit_build_core.build"',
1,
)

# Set CUDA architectures (replace "native" with CI target archs)
text = text.replace(
Expand All @@ -96,6 +127,7 @@ jobs:

- name: Sanity check pyproject.toml
run: |
python3 -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"
grep -E "name|rapids|CUDA_ARCH" pyproject.toml

- name: Build CUDA manylinux image
Expand All @@ -116,11 +148,23 @@ jobs:
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
PATH=/usr/local/cuda/bin:$PATH
CIBW_BEFORE_BUILD: >
rm -f build/.librmm_dir &&
mkdir -p build &&
python -m pip install -U pip
scikit-build-core cmake ninja nanobind
librmm-cu${{ matrix.cuda_major }} &&
RMM_ROOT=$(python -c "import librmm; print(librmm.__path__[0])") &&
LOG_ROOT=$(python -c "import rapids_logger; print(rapids_logger.__path__[0])") &&
echo "[rsc-build] librmm=$RMM_ROOT" &&
echo "[rsc-build] rapids_logger=$LOG_ROOT" &&
ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
ldconfig &&
python -c "import librmm; print(librmm.__path__[0])" > build/.librmm_dir &&
echo "[rsc-build] marker=$(cat build/.librmm_dir)"
CIBW_TEST_SKIP: "*"
CIBW_TEST_COMMAND: ""
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} -w {dest_dir} {wheel}"
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
CIBW_BUILD_VERBOSITY: "1"

- uses: actions/upload-artifact@v4
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ coverage.xml
.claude/
.codex
CLAUDE.md
# NOTE: duplicate ".codex" entry — already ignored above

# tmp_scripts
tmp_scripts/
/benchmarks/
103 changes: 103 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,104 @@ if (RSC_BUILD_EXTENSIONS)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)
set(RSC_RMM_HINTS)
set(RSC_RAPIDS_CMAKE_PREFIXES)
set(RSC_CCCL_HINTS)
set(RSC_RAPIDS_LOGGER_HINTS)
set(RSC_NVTX3_HINTS)
# Collect RAPIDS-related CMake hint paths from one Python installation prefix.
#
# Arguments:
#   _rsc_prefix — a Python environment root (e.g. conda env, virtualenv, or a
#     prefix derived from PATH); empty values are ignored.
#
# Globs the site-packages layouts that the librmm, rapids_logger and nvidia
# wheels install, and appends every match to the caller-scope hint lists:
#   RSC_RMM_HINTS, RSC_RAPIDS_CMAKE_PREFIXES, RSC_CCCL_HINTS,
#   RSC_RAPIDS_LOGGER_HINTS, RSC_NVTX3_HINTS.
#
# NOTE: deliberately a macro, not a function — the list(APPEND ...) calls
# must mutate the caller's variables directly. file(GLOB ...) yields an empty
# list when nothing matches, so missing wheels simply contribute no hints.
macro(_rsc_collect_rapids_python_prefix _rsc_prefix)
if (NOT "${_rsc_prefix}" STREQUAL "")
# rmm's package config (rmm-config.cmake) shipped inside the librmm wheel.
file(GLOB _rsc_rmm_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/rmm")
# Generic prefixes added to CMAKE_PREFIX_PATH by the caller.
file(GLOB _rsc_rapids_prefixes
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64"
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids"
"${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64"
"${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib"
)
# CCCL package config — may live under librmm's bundled rapids dir or under
# the nvidia CUDA wheels, depending on how the env was assembled.
file(GLOB _rsc_cccl_dirs
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids/cmake/cccl"
"${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib/cmake/cccl"
)
file(GLOB _rsc_rapids_logger_dirs "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64/cmake/rapids_logger")
file(GLOB _rsc_nvtx3_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/nvtx3")
# Append into the caller's hint lists (macro semantics: no PARENT_SCOPE needed).
list(APPEND RSC_RMM_HINTS ${_rsc_rmm_dirs})
list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_rapids_prefixes})
list(APPEND RSC_CCCL_HINTS ${_rsc_cccl_dirs})
list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_rapids_logger_dirs})
list(APPEND RSC_NVTX3_HINTS ${_rsc_nvtx3_dirs})
endif()
endmacro()
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import importlib.util, pathlib; spec = importlib.util.find_spec('librmm'); print(pathlib.Path(spec.origin).parent / 'lib64' / 'cmake' / 'rmm' if spec else '')"
OUTPUT_VARIABLE RSC_PYTHON_RMM_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
if (RSC_PYTHON_RMM_DIR AND EXISTS "${RSC_PYTHON_RMM_DIR}/rmm-config.cmake")
list(APPEND RSC_RMM_HINTS "${RSC_PYTHON_RMM_DIR}")
endif()
# Wheel builds install librmm/rapids_logger into the isolated build env and
# write build/.librmm_dir from CIBW_BEFORE_BUILD. publish.yml also symlinks
# those shared libraries into /usr/local/lib so auditwheel can see and exclude
# them instead of bundling RAPIDS runtime libraries into the wheel.
if(DEFINED ENV{RSC_LIBRMM_DIR} AND EXISTS "$ENV{RSC_LIBRMM_DIR}/lib64/cmake/rmm/rmm-config.cmake")
set(_rsc_librmm_marker "$ENV{RSC_LIBRMM_DIR}")
elseif(EXISTS "${CMAKE_SOURCE_DIR}/build/.librmm_dir")
file(READ "${CMAKE_SOURCE_DIR}/build/.librmm_dir" _rsc_librmm_marker)
string(STRIP "${_rsc_librmm_marker}" _rsc_librmm_marker)
else()
set(_rsc_librmm_marker "")
endif()
if(NOT "${_rsc_librmm_marker}" STREQUAL "" AND EXISTS "${_rsc_librmm_marker}/lib64/cmake/rmm/rmm-config.cmake")
file(GLOB _rsc_marker_rmm_dirs "${_rsc_librmm_marker}/lib64/cmake/rmm")
file(GLOB _rsc_marker_rapids_prefixes
"${_rsc_librmm_marker}/lib64"
"${_rsc_librmm_marker}/lib64/rapids"
"${_rsc_librmm_marker}/../rapids_logger/lib64"
)
file(GLOB _rsc_marker_cccl_dirs
"${_rsc_librmm_marker}/lib64/rapids/cmake/cccl"
)
file(GLOB _rsc_marker_rapids_logger_dirs "${_rsc_librmm_marker}/../rapids_logger/lib64/cmake/rapids_logger")
file(GLOB _rsc_marker_nvtx3_dirs "${_rsc_librmm_marker}/lib64/cmake/nvtx3")
list(APPEND RSC_RMM_HINTS ${_rsc_marker_rmm_dirs})
list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_marker_rapids_prefixes})
list(APPEND RSC_CCCL_HINTS ${_rsc_marker_cccl_dirs})
list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_marker_rapids_logger_dirs})
list(APPEND RSC_NVTX3_HINTS ${_rsc_marker_nvtx3_dirs})
endif()
foreach(_rsc_python_prefix IN ITEMS "${Python_ROOT_DIR}" "${Python3_ROOT_DIR}")
_rsc_collect_rapids_python_prefix("${_rsc_python_prefix}")
endforeach()
foreach(_rsc_env_prefix IN ITEMS "$ENV{CONDA_PREFIX}" "$ENV{VIRTUAL_ENV}")
_rsc_collect_rapids_python_prefix("${_rsc_env_prefix}")
endforeach()
string(REPLACE ":" ";" _rsc_path_entries "$ENV{PATH}")
foreach(_rsc_path_entry IN LISTS _rsc_path_entries)
get_filename_component(_rsc_path_prefix "${_rsc_path_entry}/.." ABSOLUTE)
_rsc_collect_rapids_python_prefix("${_rsc_path_prefix}")
endforeach()
if (RSC_RAPIDS_CMAKE_PREFIXES)
list(APPEND CMAKE_PREFIX_PATH ${RSC_RAPIDS_CMAKE_PREFIXES})
if (RSC_CCCL_HINTS)
list(GET RSC_CCCL_HINTS 0 _rsc_cccl_dir)
set(CCCL_DIR "${_rsc_cccl_dir}" CACHE PATH "Path to CCCL package config" FORCE)
endif()
if (RSC_RAPIDS_LOGGER_HINTS)
list(GET RSC_RAPIDS_LOGGER_HINTS 0 _rsc_rapids_logger_dir)
set(rapids_logger_DIR "${_rsc_rapids_logger_dir}" CACHE PATH "Path to rapids_logger package config" FORCE)
endif()
if (RSC_NVTX3_HINTS)
list(GET RSC_NVTX3_HINTS 0 _rsc_nvtx3_dir)
set(nvtx3_DIR "${_rsc_nvtx3_dir}" CACHE PATH "Path to nvtx3 package config" FORCE)
endif()
endif()
if (RSC_RMM_HINTS)
find_package(rmm CONFIG REQUIRED HINTS ${RSC_RMM_HINTS})
else()
find_package(rmm CONFIG REQUIRED)
endif()
message(STATUS "Using RMM for CUDA extension scratch allocations")
message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
Expand Down Expand Up @@ -85,6 +183,11 @@ if (RSC_BUILD_EXTENSIONS)
add_nb_cuda_module(_hvg_cuda src/rapids_singlecell/_cuda/hvg/hvg.cu)
add_nb_cuda_module(_kde_cuda src/rapids_singlecell/_cuda/kde/kde.cu)
add_nb_cuda_module(_wilcoxon_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
target_sources(_wilcoxon_cuda PRIVATE src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_rmm.cu)
target_link_libraries(_wilcoxon_cuda PRIVATE rmm::rmm)
add_nb_cuda_module(_wilcoxon_sparse_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_sparse.cu)
target_sources(_wilcoxon_sparse_cuda PRIVATE src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_rmm.cu)
target_link_libraries(_wilcoxon_sparse_cuda PRIVATE rmm::rmm)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
Expand Down
25 changes: 22 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ requires = [
"scikit-build-core>=0.10",
"nanobind>=2.0.0",
"setuptools-scm>=8",
# librmm headers/CMake config are needed at build time for Wilcoxon.
# Generic isolated source builds default to CUDA 12. CUDA wheel builds
# rewrite this to the matching cu12/cu13 package; CUDA 13 source builds
# should build in an existing RAPIDS env with --no-build-isolation.
"librmm-cu12>=25.10",
]
build-backend = "scikit_build_core.build"

Expand Down Expand Up @@ -32,8 +37,22 @@ dependencies = [
]

[project.optional-dependencies]
rapids-cu13 = [ "cupy-cuda13x", "cudf-cu13>=25.10", "cuml-cu13>=25.10", "cugraph-cu13>=25.10", "cuvs-cu13>=25.10" ]
rapids-cu12 = [ "cupy-cuda12x", "cudf-cu12>=25.10", "cuml-cu12>=25.10", "cugraph-cu12>=25.10", "cuvs-cu12>=25.10" ]
rapids-cu13 = [
"cupy-cuda13x",
"cudf-cu13>=25.10",
"cuml-cu13>=25.10",
"cugraph-cu13>=25.10",
"cuvs-cu13>=25.10",
"librmm-cu13>=25.10",
]
rapids-cu12 = [
"cupy-cuda12x",
"cudf-cu12>=25.10",
"cuml-cu12>=25.10",
"cugraph-cu12>=25.10",
"cuvs-cu12>=25.10",
"librmm-cu12>=25.10",
]

doc = [
"sphinx>=4.5.0",
Expand Down Expand Up @@ -150,7 +169,7 @@ sdist.include = [ "src/rapids_singlecell/_version.py" ]
# Use abi3audit to catch issues with Limited API wheels
[tool.cibuildwheel.linux]
repair-wheel-command = [
"auditwheel repair --exclude libcublas.so.12 --exclude libcublas.so.13 --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 --exclude libcudart.so.12 --exclude libcudart.so.13 -w {dest_dir} {wheel}",
"auditwheel repair --exclude libcublas.so.12 --exclude libcublas.so.13 --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 --exclude libcudart.so.12 --exclude libcudart.so.13 --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}",
"pipx run abi3audit --strict --report {wheel}",
]
[tool.cibuildwheel.macos]
Expand Down
7 changes: 7 additions & 0 deletions src/rapids_singlecell/_cuda/nb_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ using gpu_array = nb::ndarray<T, Device>;
template <typename T, typename Device, typename Contig>
using gpu_array_contig = nb::ndarray<T, Device, Contig>;

// Host (NumPy) array aliases.
// These bind against host-resident NumPy arrays (nb::numpy), in contrast to
// the gpu_array aliases above which target device memory.

// 1-D host array of element type T.
template <typename T>
using host_array = nb::ndarray<T, nb::numpy, nb::ndim<1>>;

// 2-D host array of element type T.
template <typename T>
using host_array_2d = nb::ndarray<T, nb::numpy, nb::ndim<2>>;

// Register bindings for both regular CUDA and managed-memory arrays.
// Usage:
// template <typename Device>
Expand Down
Loading
Loading