NVIDIA
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 29 additions & 10 deletions
diff --git a/cuda_bindings/tests/nvml/test_pynvml.py b/cuda_bindings/tests/nvml/test_pynvml.py
Lines changed: 8 additions & 2 deletions
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
Lines changed: 41 additions & 34 deletions
@@ -40,6 +40,7 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       skip: ${{ steps.get-should-skip.outputs.skip }}
+      doc-only: ${{ steps.get-should-skip.outputs.doc_only }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
@@ -50,11 +51,16 @@ jobs:
         run: |
           set -euxo pipefail
           if ${{ startsWith(github.ref_name, 'pull-request/') }}; then
-            skip="$(gh pr view "$(grep -Po '(\d+)$' <<< '${{ github.ref_name }}')" --json title --jq '.title | contains("[no-ci]")')"
+            # Read the ref name from the runner-provided environment variable
+            # instead of interpolating the expression into the script text,
+            # so a crafted branch name cannot inject shell code.
+            pr_number="$(grep -Po '(\d+)$' <<< "${GITHUB_REF_NAME}")"
+            pr_title="$(gh pr view "${pr_number}" --json title --jq '.title')"
+            # Literal substring tests on the PR title. The quoted part of the
+            # pattern is matched verbatim, so the brackets need no escaping,
+            # and there is no echo|grep pipeline to interact with `pipefail`
+            # or to mis-handle a title that starts with an echo option.
+            if [[ "${pr_title}" == *"[no-ci]"* ]]; then skip=true; else skip=false; fi
+            if [[ "${pr_title}" == *"[doc-only]"* ]]; then doc_only=true; else doc_only=false; fi
           else
             skip=false
+            doc_only=false
           fi
           echo "skip=${skip}" >> "$GITHUB_OUTPUT"
+          echo "doc_only=${doc_only}" >> "$GITHUB_OUTPUT"
 
   # WARNING: make sure all of the build jobs are in sync
   build-linux-64:
@@ -86,7 +92,7 @@ jobs:
         host-platform:
           - linux-aarch64
     name: Build ${{ matrix.host-platform }}, CUDA ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
-    if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }}
+    if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) && !fromJSON(needs.should-skip.outputs.doc-only) }}
     secrets: inherit
     uses: ./.github/workflows/build-wheel.yml
     with:
@@ -105,7 +111,7 @@ jobs:
         host-platform:
           - win-64
     name: Build ${{ matrix.host-platform }}, CUDA ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
-    if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }}
+    if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) && !fromJSON(needs.should-skip.outputs.doc-only) }}
     secrets: inherit
     uses: ./.github/workflows/build-wheel.yml
     with:
@@ -121,11 +127,12 @@ jobs:
         host-platform:
           - linux-64
     name: Test ${{ matrix.host-platform }}
-    if: ${{ github.repository_owner == 'nvidia' }}
+    if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.doc-only) }}
     permissions:
       contents: read  # This is required for actions/checkout
     needs:
       - ci-vars
+      - should-skip
       - build-linux-64
     secrets: inherit
     uses: ./.github/workflows/test-wheel-linux.yml
@@ -142,6 +149,8 @@ jobs:
         host-platform:
           - linux-aarch64
     name: Test ${{ matrix.host-platform }}
+    # Note: no doc-only check is needed here. This `if` contains no
+    # status-check function, so the implicit success() applies and the job
+    # is skipped whenever build-linux-aarch64 is skipped.
     if: ${{ github.repository_owner == 'nvidia' }}
     permissions:
       contents: read  # This is required for actions/checkout
@@ -162,6 +171,8 @@ jobs:
         host-platform:
           - win-64
     name: Test ${{ matrix.host-platform }}
+    # Note: no doc-only check is needed here. This `if` contains no
+    # status-check function, so the implicit success() applies and the job
+    # is skipped whenever build-windows is skipped.
     if: ${{ github.repository_owner == 'nvidia' }}
     permissions:
       contents: read  # This is required for actions/checkout
@@ -196,6 +207,7 @@ jobs:
     if: always()
     runs-on: ubuntu-latest
     needs:
+      - should-skip
       - test-linux-64
       - test-linux-aarch64
       - test-windows
@@ -219,11 +231,18 @@ jobs:
           # failing job(s) will timeout causing a cancellation here and the
           # build to succeed which we don't want (originally this was just
           # 'exit 0')
-          if ${{ needs.test-linux-64.result == 'cancelled' ||
-                 needs.test-linux-aarch64.result == 'cancelled' ||
-                 needs.test-windows.result == 'cancelled' ||
-                 needs.doc.result == 'cancelled' }}; then
+          #
+          # Note: When [doc-only] is in PR title, test jobs are intentionally
+          # skipped and should not cause failure.
+          doc_only=${{ needs.should-skip.outputs.doc-only }}
+          if ${{ needs.doc.result == 'cancelled' }}; then
             exit 1
-          else
-            exit 0
           fi
+          if [[ "${doc_only}" != "true" ]]; then
+            if ${{ needs.test-linux-64.result == 'cancelled' ||
+                   needs.test-linux-aarch64.result == 'cancelled' ||
+                   needs.test-windows.result == 'cancelled' }}; then
+              exit 1
+            fi
+          fi
+          exit 0
@@ -136,13 +136,19 @@ def test_device_get_p2p_status(handles, index):
 
 def test_device_get_power_usage(ngpus, handles):
+    # Queries the power usage of each of the `ngpus` device handles and checks
+    # that every reading is non-negative.
     for i in range(ngpus):
-        power_mwatts = nvml.device_get_power_usage(handles[i])
+        try:
+            power_mwatts = nvml.device_get_power_usage(handles[i])
+        except nvml.NotSupportedError:
+            # pytest.skip() raises, so the first unsupported device ends the
+            # whole test as "skipped"; later devices are not queried.
+            # NOTE(review): confirm this is intended on hosts with mixed GPUs.
+            pytest.skip("device_get_power_usage not supported")
         assert power_mwatts >= 0.0
 
 
 def test_device_get_total_energy_consumption(ngpus, handles):
     for i in range(ngpus):
-        energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
+        try:
+            energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
+        except nvml.NotSupportedError:
+            pytest.skip("device_get_total_energy_consumption not supported")
         for j in range(10):  # idle for 150 ms
             time.sleep(0.015)  # and check for increase every 15 ms
             energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
 
@@ -171,15 +171,15 @@ struct ContextBox {
 };
 }  // namespace
 
-ContextHandle create_context_handle_ref(CUcontext ctx) noexcept {
+// Stores `ctx` in a shared ContextBox and returns a handle that points at the
+// box's `resource` member while sharing ownership of the box.
+// No custom deleter is attached here. Not noexcept: std::make_shared
+// allocates and can throw std::bad_alloc.
+ContextHandle create_context_handle_ref(CUcontext ctx) {
     auto box = std::make_shared<const ContextBox>(ContextBox{ctx});
     return ContextHandle(box, &box->resource);
 }
 
 // Thread-local cache of primary contexts indexed by device ID
 static thread_local std::vector<ContextHandle> primary_context_cache;
 
-ContextHandle get_primary_context(int device_id) noexcept {
+ContextHandle get_primary_context(int device_id) {
     // Check thread-local cache
     if (static_cast<size_t>(device_id) < primary_context_cache.size()) {
         if (auto cached = primary_context_cache[device_id]) {
@@ -212,7 +212,7 @@ ContextHandle get_primary_context(int device_id) noexcept {
     return h;
 }
 
-ContextHandle get_current_context() noexcept {
+ContextHandle get_current_context() {
     GILReleaseGuard gil;
     CUcontext ctx = nullptr;
     if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) {
@@ -234,7 +234,7 @@ struct StreamBox {
 };
 }  // namespace
 
-StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept {
+StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
     GILReleaseGuard gil;
     CUstream stream;
     if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) {
@@ -252,12 +252,12 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int p
     return StreamHandle(box, &box->resource);
 }
 
-StreamHandle create_stream_handle_ref(CUstream stream) noexcept {
+// Stores `stream` in a shared StreamBox and returns a handle that points at
+// the box's `resource` member while sharing ownership of the box.
+// No custom deleter is attached here. Not noexcept: std::make_shared
+// allocates and can throw std::bad_alloc.
+StreamHandle create_stream_handle_ref(CUstream stream) {
     auto box = std::make_shared<const StreamBox>(StreamBox{stream});
     return StreamHandle(box, &box->resource);
 }
 
-StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) noexcept {
+StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
     if (!owner) {
         return create_stream_handle_ref(stream);
     }
@@ -281,12 +281,12 @@ StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) n
     return StreamHandle(box, &box->resource);
 }
 
-StreamHandle get_legacy_stream() noexcept {
+// Returns a handle for CU_STREAM_LEGACY. The handle is built once, on the
+// first call, as a function-local static; every call returns a copy of it.
+// If the first construction throws, the static stays uninitialized and the
+// next call tries again.
+StreamHandle get_legacy_stream() {
     static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY);
     return handle;
 }
 
-StreamHandle get_per_thread_stream() noexcept {
+// Returns a handle for CU_STREAM_PER_THREAD. The handle is built once, on the
+// first call, as a function-local static; every call returns a copy of it.
+// Note that the static is shared by all threads (it is not thread_local);
+// only the stream constant it wraps is the per-thread one.
+StreamHandle get_per_thread_stream() {
     static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD);
     return handle;
 }
@@ -301,7 +301,7 @@ struct EventBox {
 };
 }  // namespace
 
-EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept {
+EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) {
     GILReleaseGuard gil;
     CUevent event;
     if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
@@ -319,11 +319,11 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcep
     return EventHandle(box, &box->resource);
 }
 
-EventHandle create_event_handle_noctx(unsigned int flags) noexcept {
+// Convenience overload: forwards to create_event_handle with a
+// default-constructed (empty) ContextHandle and the given `flags`.
+EventHandle create_event_handle_noctx(unsigned int flags) {
     return create_event_handle(ContextHandle{}, flags);
 }
 
-EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) noexcept {
+EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
     GILReleaseGuard gil;
     CUevent event;
     if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
@@ -353,19 +353,24 @@ struct MemoryPoolBox {
 
 // Helper to clear peer access before destroying a memory pool.
 // Works around nvbug 5698116: recycled pool handles inherit peer access state.
-static void clear_mempool_peer_access(CUmemoryPool pool) {
-    int device_count = 0;
-    if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
-        return;
-    }
+// Must be noexcept since it's called from a shared_ptr deleter.
+static void clear_mempool_peer_access(CUmemoryPool pool) noexcept {
+    try {
+        int device_count = 0;
+        // Nothing to clear if the device count cannot be read or is not positive.
+        if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
+            return;
+        }
 
-    std::vector<CUmemAccessDesc> clear_access(device_count);
-    for (int i = 0; i < device_count; ++i) {
-        clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        clear_access[i].location.id = i;
-        clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
+        // One PROT_NONE descriptor per device ordinal 0..device_count-1.
+        // Constructing the vector allocates and may throw std::bad_alloc,
+        // which is why the body is wrapped in try/catch.
+        std::vector<CUmemAccessDesc> clear_access(device_count);
+        for (int i = 0; i < device_count; ++i) {
+            clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            clear_access[i].location.id = i;
+            clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
+        }
+        // The return code is deliberately ignored.
+        p_cuMemPoolSetAccess(pool, clear_access.data(), device_count);  // Best effort
+    } catch (...) {
+        // Swallow exceptions - this is best-effort cleanup in destructor context
     }
-    p_cuMemPoolSetAccess(pool, clear_access.data(), device_count);  // Best effort
 }
 
 static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
@@ -381,7 +386,7 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
     return MemoryPoolHandle(box, &box->resource);
 }
 
-MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
+MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) {
     GILReleaseGuard gil;
     CUmemoryPool pool;
     if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) {
@@ -390,12 +395,12 @@ MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
     return wrap_mempool_owned(pool);
 }
 
-MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) noexcept {
+// Stores `pool` in a shared MemoryPoolBox and returns a handle that points at
+// the box's `resource` member while sharing ownership of the box.
+// No custom deleter is attached here. Not noexcept: std::make_shared
+// allocates and can throw std::bad_alloc.
+MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) {
     auto box = std::make_shared<const MemoryPoolBox>(MemoryPoolBox{pool});
     return MemoryPoolHandle(box, &box->resource);
 }
 
-MemoryPoolHandle get_device_mempool(int device_id) noexcept {
+MemoryPoolHandle get_device_mempool(int device_id) {
     GILReleaseGuard gil;
     CUmemoryPool pool;
     if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) {
@@ -404,7 +409,7 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept {
     return create_mempool_handle_ref(pool);
 }
 
-MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) noexcept {
+MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) {
     GILReleaseGuard gil;
     CUmemoryPool pool;
     auto handle_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(fd));
@@ -448,7 +453,7 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) no
     get_box(h)->h_stream = std::move(h_stream);
 }
 
-DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) noexcept {
+DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) {
@@ -466,7 +471,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
     return DevicePtrHandle(box, &box->resource);
 }
 
-DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept {
+DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) {
@@ -484,7 +489,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexce
     return DevicePtrHandle(box, &box->resource);
 }
 
-DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
+DevicePtrHandle deviceptr_alloc(size_t size) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
@@ -502,7 +507,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
     return DevicePtrHandle(box, &box->resource);
 }
 
-DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
+DevicePtrHandle deviceptr_alloc_host(size_t size) {
     GILReleaseGuard gil;
     void* ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
@@ -520,12 +525,12 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
     return DevicePtrHandle(box, &box->resource);
 }
 
-DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) noexcept {
+// Stores `ptr` in a shared DevicePtrBox together with an empty StreamHandle
+// and returns a handle that points at the box's `resource` member.
+// No custom deleter is attached here. Not noexcept: std::make_shared
+// allocates and can throw std::bad_alloc.
+DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
     auto box = std::make_shared<DevicePtrBox>(DevicePtrBox{ptr, StreamHandle{}});
     return DevicePtrHandle(box, &box->resource);
 }
 
-DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) noexcept {
+DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
     if (!owner) {
         return deviceptr_create_ref(ptr);
     }
@@ -607,7 +612,7 @@ struct ExportDataKeyHash {
 static std::mutex ipc_ptr_cache_mutex;
 static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
 
-DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) noexcept {
+DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
 
@@ -639,14 +644,16 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
             new DevicePtrBox{ptr, h_stream},
             [h_pool, key](DevicePtrBox* b) {
                 GILReleaseGuard gil;
-                {
+                try {
                     std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
                     // Only erase if expired - avoids race where another thread
                     // replaced the entry with a new import before we acquired the lock.
                     auto it = ipc_ptr_cache.find(key);
                     if (it != ipc_ptr_cache.end() && it->second.expired()) {
                         ipc_ptr_cache.erase(it);
                     }
+                } catch (...) {
+                    // Cache cleanup is best-effort - swallow exceptions in destructor context
                 }
                 p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
                 delete b;