Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 49 additions & 5 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,47 @@ jobs:
path = pathlib.Path("pyproject.toml")
text = path.read_text()

def remove_toml_array(text, key):
    """Drop a multi-line TOML array assignment (``key = [ ... ]``) from text.

    Scans line by line; when a line beginning with ``key = [`` is found, that
    line and every following line are discarded until the square brackets
    balance out again. All other lines are kept verbatim, with their original
    line endings preserved.

    Note: brackets are counted textually, so a string value containing an
    unbalanced ``[`` or ``]`` would skew the count — fine for the simple
    dependency arrays this script targets.
    """
    kept = []
    remaining = iter(text.splitlines(keepends=True))
    opener = f"{key} = ["
    for line in remaining:
        if line.startswith(opener):
            # Consume the array body until brackets balance (or input ends).
            balance = line.count("[") - line.count("]")
            while balance > 0:
                follow = next(remaining, None)
                if follow is None:
                    break
                balance += follow.count("[") - follow.count("]")
            continue
        kept.append(line)
    return "".join(kept)

# Rename package
text = text.replace(
'name = "rapids-singlecell"',
f'name = "rapids-singlecell-cu{cuda}"',
)
# Rename matching extra to "rapids", remove the other
text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
# Remove the other CUDA extra line entirely
lines = text.splitlines(keepends=True)
text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
text = text.replace(f'rapids-cu{cuda} = [', 'rapids = [')
text = remove_toml_array(text, f"rapids-cu{other}")

# librmm is needed at build time because CMake links the CUDA
# extension against librmm. Add the matching wheel to the isolated
# PEP 517 build requirements after selecting the CUDA package variant.
for dep in (
f' "librmm-cu{other}>=25.10",\n',
f' "rmm-cu{other}>=25.10",\n',
):
text = text.replace(dep, "")
rmm_build_req = f' "librmm-cu{cuda}>=25.10",\n'
build_system_text = text.split("[project]", 1)[0]
if f'"librmm-cu{cuda}>=25.10"' not in build_system_text:
text = text.replace(
']\nbuild-backend = "scikit_build_core.build"',
f'{rmm_build_req}]\nbuild-backend = "scikit_build_core.build"',
1,
)

# Set CUDA architectures (replace "native" with CI target archs)
text = text.replace(
Expand All @@ -96,6 +127,7 @@ jobs:

- name: Sanity check pyproject.toml
run: |
python3 -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"
grep -E "name|rapids|CUDA_ARCH" pyproject.toml

- name: Build CUDA manylinux image
Expand All @@ -116,11 +148,23 @@ jobs:
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
PATH=/usr/local/cuda/bin:$PATH
CIBW_BEFORE_BUILD: >
rm -f build/.librmm_dir &&
mkdir -p build &&
python -m pip install -U pip
scikit-build-core cmake ninja nanobind
librmm-cu${{ matrix.cuda_major }} &&
RMM_ROOT=$(python -c "import librmm; print(librmm.__path__[0])") &&
LOG_ROOT=$(python -c "import rapids_logger; print(rapids_logger.__path__[0])") &&
echo "[rsc-build] librmm=$RMM_ROOT" &&
echo "[rsc-build] rapids_logger=$LOG_ROOT" &&
ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
ldconfig &&
python -c "import librmm; print(librmm.__path__[0])" > build/.librmm_dir &&
echo "[rsc-build] marker=$(cat build/.librmm_dir)"
CIBW_TEST_SKIP: "*"
CIBW_TEST_COMMAND: ""
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} -w {dest_dir} {wheel}"
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
CIBW_BUILD_VERBOSITY: "1"

- uses: actions/upload-artifact@v4
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ coverage.xml
.claude/
.codex
CLAUDE.md
# NOTE: duplicate ".codex" entry — already ignored above

# tmp_scripts
tmp_scripts/
/benchmarks/
103 changes: 103 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,104 @@ if (RSC_BUILD_EXTENSIONS)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)
set(RSC_RMM_HINTS)
set(RSC_RAPIDS_CMAKE_PREFIXES)
set(RSC_CCCL_HINTS)
set(RSC_RAPIDS_LOGGER_HINTS)
set(RSC_NVTX3_HINTS)
# Collect RAPIDS-related CMake hint paths from one Python installation prefix.
#
# Arguments:
#   _rsc_prefix — a Python environment root (e.g. conda env, virtualenv, or a
#     prefix derived from PATH); empty values are ignored.
#
# Globs the site-packages layouts that the librmm, rapids_logger and nvidia
# wheels install, and appends every match to the caller-scope hint lists:
#   RSC_RMM_HINTS, RSC_RAPIDS_CMAKE_PREFIXES, RSC_CCCL_HINTS,
#   RSC_RAPIDS_LOGGER_HINTS, RSC_NVTX3_HINTS.
#
# NOTE: deliberately a macro, not a function — the list(APPEND ...) calls
# must mutate the caller's variables directly. file(GLOB ...) yields an empty
# list when nothing matches, so missing wheels simply contribute no hints.
macro(_rsc_collect_rapids_python_prefix _rsc_prefix)
if (NOT "${_rsc_prefix}" STREQUAL "")
# rmm's package config (rmm-config.cmake) shipped inside the librmm wheel.
file(GLOB _rsc_rmm_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/rmm")
# Generic prefixes added to CMAKE_PREFIX_PATH by the caller.
file(GLOB _rsc_rapids_prefixes
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64"
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids"
"${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64"
"${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib"
)
# CCCL package config — may live under librmm's bundled rapids dir or under
# the nvidia CUDA wheels, depending on how the env was assembled.
file(GLOB _rsc_cccl_dirs
"${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids/cmake/cccl"
"${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib/cmake/cccl"
)
file(GLOB _rsc_rapids_logger_dirs "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64/cmake/rapids_logger")
file(GLOB _rsc_nvtx3_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/nvtx3")
# Append into the caller's hint lists (macro semantics: no PARENT_SCOPE needed).
list(APPEND RSC_RMM_HINTS ${_rsc_rmm_dirs})
list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_rapids_prefixes})
list(APPEND RSC_CCCL_HINTS ${_rsc_cccl_dirs})
list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_rapids_logger_dirs})
list(APPEND RSC_NVTX3_HINTS ${_rsc_nvtx3_dirs})
endif()
endmacro()
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import importlib.util, pathlib; spec = importlib.util.find_spec('librmm'); print(pathlib.Path(spec.origin).parent / 'lib64' / 'cmake' / 'rmm' if spec else '')"
OUTPUT_VARIABLE RSC_PYTHON_RMM_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
if (RSC_PYTHON_RMM_DIR AND EXISTS "${RSC_PYTHON_RMM_DIR}/rmm-config.cmake")
list(APPEND RSC_RMM_HINTS "${RSC_PYTHON_RMM_DIR}")
endif()
# Wheel builds install librmm/rapids_logger into the isolated build env and
# write build/.librmm_dir from CIBW_BEFORE_BUILD. publish.yml also symlinks
# those shared libraries into /usr/local/lib so auditwheel can see and exclude
# them instead of bundling RAPIDS runtime libraries into the wheel.
if(DEFINED ENV{RSC_LIBRMM_DIR} AND EXISTS "$ENV{RSC_LIBRMM_DIR}/lib64/cmake/rmm/rmm-config.cmake")
set(_rsc_librmm_marker "$ENV{RSC_LIBRMM_DIR}")
elseif(EXISTS "${CMAKE_SOURCE_DIR}/build/.librmm_dir")
file(READ "${CMAKE_SOURCE_DIR}/build/.librmm_dir" _rsc_librmm_marker)
string(STRIP "${_rsc_librmm_marker}" _rsc_librmm_marker)
else()
set(_rsc_librmm_marker "")
endif()
if(NOT "${_rsc_librmm_marker}" STREQUAL "" AND EXISTS "${_rsc_librmm_marker}/lib64/cmake/rmm/rmm-config.cmake")
file(GLOB _rsc_marker_rmm_dirs "${_rsc_librmm_marker}/lib64/cmake/rmm")
file(GLOB _rsc_marker_rapids_prefixes
"${_rsc_librmm_marker}/lib64"
"${_rsc_librmm_marker}/lib64/rapids"
"${_rsc_librmm_marker}/../rapids_logger/lib64"
)
file(GLOB _rsc_marker_cccl_dirs
"${_rsc_librmm_marker}/lib64/rapids/cmake/cccl"
)
file(GLOB _rsc_marker_rapids_logger_dirs "${_rsc_librmm_marker}/../rapids_logger/lib64/cmake/rapids_logger")
file(GLOB _rsc_marker_nvtx3_dirs "${_rsc_librmm_marker}/lib64/cmake/nvtx3")
list(APPEND RSC_RMM_HINTS ${_rsc_marker_rmm_dirs})
list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_marker_rapids_prefixes})
list(APPEND RSC_CCCL_HINTS ${_rsc_marker_cccl_dirs})
list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_marker_rapids_logger_dirs})
list(APPEND RSC_NVTX3_HINTS ${_rsc_marker_nvtx3_dirs})
endif()
foreach(_rsc_python_prefix IN ITEMS "${Python_ROOT_DIR}" "${Python3_ROOT_DIR}")
_rsc_collect_rapids_python_prefix("${_rsc_python_prefix}")
endforeach()
foreach(_rsc_env_prefix IN ITEMS "$ENV{CONDA_PREFIX}" "$ENV{VIRTUAL_ENV}")
_rsc_collect_rapids_python_prefix("${_rsc_env_prefix}")
endforeach()
string(REPLACE ":" ";" _rsc_path_entries "$ENV{PATH}")
foreach(_rsc_path_entry IN LISTS _rsc_path_entries)
get_filename_component(_rsc_path_prefix "${_rsc_path_entry}/.." ABSOLUTE)
_rsc_collect_rapids_python_prefix("${_rsc_path_prefix}")
endforeach()
if (RSC_RAPIDS_CMAKE_PREFIXES)
list(APPEND CMAKE_PREFIX_PATH ${RSC_RAPIDS_CMAKE_PREFIXES})
if (RSC_CCCL_HINTS)
list(GET RSC_CCCL_HINTS 0 _rsc_cccl_dir)
set(CCCL_DIR "${_rsc_cccl_dir}" CACHE PATH "Path to CCCL package config" FORCE)
endif()
if (RSC_RAPIDS_LOGGER_HINTS)
list(GET RSC_RAPIDS_LOGGER_HINTS 0 _rsc_rapids_logger_dir)
set(rapids_logger_DIR "${_rsc_rapids_logger_dir}" CACHE PATH "Path to rapids_logger package config" FORCE)
endif()
if (RSC_NVTX3_HINTS)
list(GET RSC_NVTX3_HINTS 0 _rsc_nvtx3_dir)
set(nvtx3_DIR "${_rsc_nvtx3_dir}" CACHE PATH "Path to nvtx3 package config" FORCE)
endif()
endif()
if (RSC_RMM_HINTS)
find_package(rmm CONFIG REQUIRED HINTS ${RSC_RMM_HINTS})
else()
find_package(rmm CONFIG REQUIRED)
endif()
message(STATUS "Using RMM for CUDA extension scratch allocations")
message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
Expand Down Expand Up @@ -85,6 +183,11 @@ if (RSC_BUILD_EXTENSIONS)
add_nb_cuda_module(_hvg_cuda src/rapids_singlecell/_cuda/hvg/hvg.cu)
add_nb_cuda_module(_kde_cuda src/rapids_singlecell/_cuda/kde/kde.cu)
add_nb_cuda_module(_wilcoxon_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
target_sources(_wilcoxon_cuda PRIVATE src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_rmm.cu)
target_link_libraries(_wilcoxon_cuda PRIVATE rmm::rmm)
add_nb_cuda_module(_wilcoxon_sparse_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_sparse.cu)
target_sources(_wilcoxon_sparse_cuda PRIVATE src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_rmm.cu)
target_link_libraries(_wilcoxon_sparse_cuda PRIVATE rmm::rmm)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
Expand Down
25 changes: 22 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ requires = [
"scikit-build-core>=0.10",
"nanobind>=2.0.0",
"setuptools-scm>=8",
# librmm headers/CMake config are needed at build time for Wilcoxon.
# Generic isolated source builds default to CUDA 12. CUDA wheel builds
# rewrite this to the matching cu12/cu13 package; CUDA 13 source builds
# should build in an existing RAPIDS env with --no-build-isolation.
"librmm-cu12>=25.10",
]
build-backend = "scikit_build_core.build"

Expand Down Expand Up @@ -32,8 +37,22 @@ dependencies = [
]

[project.optional-dependencies]
rapids-cu13 = [ "cupy-cuda13x", "cudf-cu13>=25.10", "cuml-cu13>=25.10", "cugraph-cu13>=25.10", "cuvs-cu13>=25.10" ]
rapids-cu12 = [ "cupy-cuda12x", "cudf-cu12>=25.10", "cuml-cu12>=25.10", "cugraph-cu12>=25.10", "cuvs-cu12>=25.10" ]
rapids-cu13 = [
"cupy-cuda13x",
"cudf-cu13>=25.10",
"cuml-cu13>=25.10",
"cugraph-cu13>=25.10",
"cuvs-cu13>=25.10",
"librmm-cu13>=25.10",
]
rapids-cu12 = [
"cupy-cuda12x",
"cudf-cu12>=25.10",
"cuml-cu12>=25.10",
"cugraph-cu12>=25.10",
"cuvs-cu12>=25.10",
"librmm-cu12>=25.10",
]

doc = [
"sphinx>=4.5.0",
Expand Down Expand Up @@ -150,7 +169,7 @@ sdist.include = [ "src/rapids_singlecell/_version.py" ]
# Use abi3audit to catch issues with Limited API wheels
[tool.cibuildwheel.linux]
repair-wheel-command = [
"auditwheel repair --exclude libcublas.so.12 --exclude libcublas.so.13 --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 --exclude libcudart.so.12 --exclude libcudart.so.13 -w {dest_dir} {wheel}",
"auditwheel repair --exclude libcublas.so.12 --exclude libcublas.so.13 --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 --exclude libcudart.so.12 --exclude libcudart.so.13 --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}",
"pipx run abi3audit --strict --report {wheel}",
]
[tool.cibuildwheel.macos]
Expand Down
7 changes: 7 additions & 0 deletions src/rapids_singlecell/_cuda/nb_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ using gpu_array = nb::ndarray<T, Device>;
template <typename T, typename Device, typename Contig>
using gpu_array_contig = nb::ndarray<T, Device, Contig>;

// Host (NumPy) array aliases.
// These bind against host-resident NumPy arrays (nb::numpy), in contrast to
// the gpu_array aliases above which target device memory.

// 1-D host array of element type T.
template <typename T>
using host_array = nb::ndarray<T, nb::numpy, nb::ndim<1>>;

// 2-D host array of element type T.
template <typename T>
using host_array_2d = nb::ndarray<T, nb::numpy, nb::ndim<2>>;

// Register bindings for both regular CUDA and managed-memory arrays.
// Usage:
// template <typename Device>
Expand Down
Loading
Loading