From ce82f8f00bf58e822640ffd99af9deb144ea18c1 Mon Sep 17 00:00:00 2001
From: Kevin Boyd
Date: Thu, 7 May 2026 11:28:24 -0400
Subject: [PATCH 1/4] Add Python build/pytest CI workflow on colossus runner

Mirrors the cpp-build-test workflow's container + runner setup, then
builds the Python package via pip + scikit-build and runs the pytest
suite (excluding the `long` substructure integration tests, matching
pip-build.yml's smoke job). Single matrix entry pinned to py3.13 / rdkit
2025.9.2, the latest CUDA 12.x pair tracked in conda-build.yml.
---
 .github/workflows/python-build-test.yml | 95 +++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 .github/workflows/python-build-test.yml

diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml
new file mode 100644
index 00000000..78687f93
--- /dev/null
+++ b/.github/workflows/python-build-test.yml
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Python Build & Test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: python-build-test-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-and-test:
+    name: py${{ matrix.python }} rdkit${{ matrix.rdkit }}
+    runs-on: colossus
+    container:
+      image: nvcr.io/nvidia/cuda:12.6.3-devel-ubuntu22.04
+      options: --gpus all
+    timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Latest python + rdkit pair from the conda-build matrix that targets
+          # CUDA 12.x. Bump these together when conda-build.yml's matrix moves.
+          - python: "3.13"
+            rdkit: "2025.9.2"
+
+    steps:
+      - name: Verify GPU access
+        shell: bash
+        run: |
+          set -xeuo pipefail
+          nvidia-smi
+
+      - name: Check out source tree
+        uses: actions/checkout@v4
+
+      - name: Install conda + native dependencies
+        shell: bash
+        run: bash admin/ci/setup_dependencies.sh ${{ matrix.python }} ${{ matrix.rdkit }}
+
+      - name: Install Python build & test deps
+        shell: bash
+        run: |
+          set -xeuo pipefail
+          . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
+          python -m pip install --upgrade pip
+          python -m pip install \
+            "scikit-build>=0.18" \
+            "numpy>=1.23" \
+            "torch>=2.1" \
+            triton \
+            pandas \
+            psutil \
+            optuna
+
+      - name: Build & install nvmolkit
+        shell: bash
+        env:
+          CUDA_HOME: /usr/local/cuda
+          NVMOLKIT_CUDA_TARGET_MODE: native
+        run: |
+          set -xeo pipefail
+          . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
+          export CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)"
+          python -m pip install . --no-deps --no-build-isolation -v
+
+      - name: Run pytest
+        shell: bash
+        working-directory: /tmp
+        run: |
+          set -xeo pipefail
+          . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
+          python -m pytest "${GITHUB_WORKSPACE}/nvmolkit/tests" --tb=short -k 'not long'

From 4e73042f7677e28e1aa3922c1564e748af5844ba Mon Sep 17 00:00:00 2001
From: Kevin Boyd
Date: Thu, 7 May 2026 12:50:45 -0400
Subject: [PATCH 2/4] Pin pip torch <2.5 for V100 runner

PyPI torch wheels from 2.5+ dropped sm_70 from their default arch list,
which produces cudaErrorNoKernelImageForDevice on the colossus V100.
---
 .github/workflows/python-build-test.yml | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml
index 78687f93..4e8e7c13 100644
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@@ -66,15 +66,34 @@ jobs:
           set -xeuo pipefail
           . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
           python -m pip install --upgrade pip
+          # The colossus runner is a V100 (sm_70). PyPI torch wheels from 2.5+
+          # dropped sm_70 from their default arch list, so anything newer hits
+          # cudaErrorNoKernelImageForDevice on this runner. Pin to the last
+          # 2.4.x release that still ships sm_70.
           python -m pip install \
             "scikit-build>=0.18" \
             "numpy>=1.23" \
-            "torch>=2.1" \
+            "torch>=2.1,<2.5" \
             triton \
             pandas \
             psutil \
             optuna
 
+      - name: Report torch CUDA arch coverage
+        shell: bash
+        run: |
+          set -xeuo pipefail
+          . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
+          python - <<'PY'
+          import torch
+          print("torch", torch.__version__)
+          print("torch.version.cuda", torch.version.cuda)
+          print("torch.cuda.get_arch_list()", torch.cuda.get_arch_list())
+          for i in range(torch.cuda.device_count()):
+              props = torch.cuda.get_device_properties(i)
+              print(f"device[{i}]", props.name, f"sm_{props.major}{props.minor}")
+          PY
+
       - name: Build & install nvmolkit
         shell: bash
         env:

From 72f89cdb53ad4db4c0fc2f25f577efc812f4a217 Mon Sep 17 00:00:00 2001
From: Kevin Boyd
Date: Thu, 7 May 2026 13:16:14 -0400
Subject: [PATCH 3/4] Drop CI matrix back to py3.12 for V100 runner

py3.13 torch wheels start at 2.5, which dropped sm_70 from default
arches. Use py3.12 + rdkit 2025.3.6 instead.
---
 .github/workflows/python-build-test.yml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml
index 4e8e7c13..345a6217 100644
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@@ -42,9 +42,11 @@ jobs:
       matrix:
         include:
           # Latest python + rdkit pair from the conda-build matrix that targets
-          # CUDA 12.x. Bump these together when conda-build.yml's matrix moves.
-          - python: "3.13"
-            rdkit: "2025.9.2"
+          # CUDA 12.x AND has pip torch wheels covering sm_70 (the colossus
+          # runner is a V100). Python 3.13 needs torch >=2.5, which dropped
+          # sm_70 from its default arch list, so we stay on 3.12 for now.
+          - python: "3.12"
+            rdkit: "2025.3.6"
 
     steps:
       - name: Verify GPU access
@@ -66,10 +68,10 @@ jobs:
           set -xeuo pipefail
           . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
           python -m pip install --upgrade pip
-          # The colossus runner is a V100 (sm_70). PyPI torch wheels from 2.5+
-          # dropped sm_70 from their default arch list, so anything newer hits
-          # cudaErrorNoKernelImageForDevice on this runner. Pin to the last
-          # 2.4.x release that still ships sm_70.
+          # torch is pinned <2.5 because 2.5+ dropped sm_70 from its default
+          # arch list and the colossus runner is a V100. Bumping the python
+          # matrix entry past 3.12 will require a torch source on this runner
+          # (e.g. conda-forge) since 3.13 wheels start at 2.5.
           python -m pip install \
             "scikit-build>=0.18" \
             "numpy>=1.23" \

From ae7f21865df21dfbf60bcef7face9994b2b892f2 Mon Sep 17 00:00:00 2001
From: Kevin Boyd
Date: Thu, 7 May 2026 14:03:26 -0400
Subject: [PATCH 4/4] Use cu126 PyTorch channel for V100 + cudaGraphAddNode

The default PyPI torch wheel ships an older bundled libcudart that
predates cudaGraphAddNode (added in CUDA 12.4), and the cu128/cu129
channels dropped sm_70.
---
 .github/workflows/python-build-test.yml | 38 +++++++++----------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml
index 345a6217..51358fb2 100644
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@@ -42,11 +42,9 @@ jobs:
       matrix:
         include:
           # Latest python + rdkit pair from the conda-build matrix that targets
-          # CUDA 12.x AND has pip torch wheels covering sm_70 (the colossus
-          # runner is a V100). Python 3.13 needs torch >=2.5, which dropped
-          # sm_70 from its default arch list, so we stay on 3.12 for now.
-          - python: "3.12"
-            rdkit: "2025.3.6"
+          # CUDA 12.x. Bump these together when conda-build.yml's matrix moves.
+          - python: "3.13"
+            rdkit: "2025.9.2"
 
     steps:
       - name: Verify GPU access
@@ -68,34 +66,24 @@ jobs:
           set -xeuo pipefail
           . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
           python -m pip install --upgrade pip
-          # torch is pinned <2.5 because 2.5+ dropped sm_70 from its default
-          # arch list and the colossus runner is a V100. Bumping the python
-          # matrix entry past 3.12 will require a torch source on this runner
-          # (e.g. conda-forge) since 3.13 wheels start at 2.5.
+          # Torch must come from the cu126 channel: the default PyPI wheel
+          # bundles a libcudart from an older CUDA minor that predates
+          # cudaGraphAddNode (added in CUDA 12.4) and gets picked up by the
+          # dynamic linker ahead of the container's libcudart, breaking
+          # nvmolkit's clustering module. cu126 also keeps sm_70 in the wheel,
+          # which the cu128 / cu129 channels have dropped — required for the
+          # V100 colossus runner.
+          python -m pip install \
+            --index-url https://download.pytorch.org/whl/cu126 \
+            torch
           python -m pip install \
             "scikit-build>=0.18" \
             "numpy>=1.23" \
-            "torch>=2.1,<2.5" \
             triton \
             pandas \
             psutil \
             optuna
 
-      - name: Report torch CUDA arch coverage
-        shell: bash
-        run: |
-          set -xeuo pipefail
-          . /usr/local/anaconda/etc/profile.d/conda.sh && conda activate base
-          python - <<'PY'
-          import torch
-          print("torch", torch.__version__)
-          print("torch.version.cuda", torch.version.cuda)
-          print("torch.cuda.get_arch_list()", torch.cuda.get_arch_list())
-          for i in range(torch.cuda.device_count()):
-              props = torch.cuda.get_device_properties(i)
-              print(f"device[{i}]", props.name, f"sm_{props.major}{props.minor}")
-          PY
-
       - name: Build & install nvmolkit
         shell: bash
         env: