Skip to content

Commit ad31723

Browse files
authored
Merge branch 'main' into copilot/add-cuda-python-projects-link
2 parents ad817c3 + b8261e5 commit ad31723

7 files changed

Lines changed: 162 additions & 112 deletions

File tree

.github/workflows/ci.yml

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ jobs:
4040
runs-on: ubuntu-latest
4141
outputs:
4242
skip: ${{ steps.get-should-skip.outputs.skip }}
43+
doc-only: ${{ steps.get-should-skip.outputs.doc_only }}
4344
steps:
4445
- name: Checkout repository
4546
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
@@ -50,11 +51,16 @@ jobs:
5051
run: |
5152
set -euxo pipefail
5253
if ${{ startsWith(github.ref_name, 'pull-request/') }}; then
53-
skip="$(gh pr view "$(grep -Po '(\d+)$' <<< '${{ github.ref_name }}')" --json title --jq '.title | contains("[no-ci]")')"
54+
pr_number="$(grep -Po '(\d+)$' <<< '${{ github.ref_name }}')"
55+
pr_title="$(gh pr view "${pr_number}" --json title --jq '.title')"
56+
skip="$(echo "${pr_title}" | grep -q '\[no-ci\]' && echo true || echo false)"
57+
doc_only="$(echo "${pr_title}" | grep -q '\[doc-only\]' && echo true || echo false)"
5458
else
5559
skip=false
60+
doc_only=false
5661
fi
5762
echo "skip=${skip}" >> "$GITHUB_OUTPUT"
63+
echo "doc_only=${doc_only}" >> "$GITHUB_OUTPUT"
5864
5965
# WARNING: make sure all of the build jobs are in sync
6066
build-linux-64:
@@ -86,7 +92,7 @@ jobs:
8692
host-platform:
8793
- linux-aarch64
8894
name: Build ${{ matrix.host-platform }}, CUDA ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
89-
if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }}
95+
if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) && !fromJSON(needs.should-skip.outputs.doc-only) }}
9096
secrets: inherit
9197
uses: ./.github/workflows/build-wheel.yml
9298
with:
@@ -105,7 +111,7 @@ jobs:
105111
host-platform:
106112
- win-64
107113
name: Build ${{ matrix.host-platform }}, CUDA ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
108-
if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) }}
114+
if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.skip) && !fromJSON(needs.should-skip.outputs.doc-only) }}
109115
secrets: inherit
110116
uses: ./.github/workflows/build-wheel.yml
111117
with:
@@ -121,11 +127,12 @@ jobs:
121127
host-platform:
122128
- linux-64
123129
name: Test ${{ matrix.host-platform }}
124-
if: ${{ github.repository_owner == 'nvidia' }}
130+
if: ${{ github.repository_owner == 'nvidia' && !fromJSON(needs.should-skip.outputs.doc-only) }}
125131
permissions:
126132
contents: read # This is required for actions/checkout
127133
needs:
128134
- ci-vars
135+
- should-skip
129136
- build-linux-64
130137
secrets: inherit
131138
uses: ./.github/workflows/test-wheel-linux.yml
@@ -142,6 +149,8 @@ jobs:
142149
host-platform:
143150
- linux-aarch64
144151
name: Test ${{ matrix.host-platform }}
152+
# Note: No doc-only check needed here - if build-linux-aarch64 is skipped,
153+
# this job is automatically skipped due to the dependency.
145154
if: ${{ github.repository_owner == 'nvidia' }}
146155
permissions:
147156
contents: read # This is required for actions/checkout
@@ -162,6 +171,8 @@ jobs:
162171
host-platform:
163172
- win-64
164173
name: Test ${{ matrix.host-platform }}
174+
# Note: No doc-only check needed here - if build-windows is skipped,
175+
# this job is automatically skipped due to the dependency.
165176
if: ${{ github.repository_owner == 'nvidia' }}
166177
permissions:
167178
contents: read # This is required for actions/checkout
@@ -196,6 +207,7 @@ jobs:
196207
if: always()
197208
runs-on: ubuntu-latest
198209
needs:
210+
- should-skip
199211
- test-linux-64
200212
- test-linux-aarch64
201213
- test-windows
@@ -219,11 +231,18 @@ jobs:
219231
# failing job(s) will time out, causing a cancellation here and the
220232
# build to succeed which we don't want (originally this was just
221233
# 'exit 0')
222-
if ${{ needs.test-linux-64.result == 'cancelled' ||
223-
needs.test-linux-aarch64.result == 'cancelled' ||
224-
needs.test-windows.result == 'cancelled' ||
225-
needs.doc.result == 'cancelled' }}; then
234+
#
235+
# Note: When [doc-only] is in PR title, test jobs are intentionally
236+
# skipped and should not cause failure.
237+
doc_only=${{ needs.should-skip.outputs.doc-only }}
238+
if ${{ needs.doc.result == 'cancelled' }}; then
226239
exit 1
227-
else
228-
exit 0
229240
fi
241+
if [[ "${doc_only}" != "true" ]]; then
242+
if ${{ needs.test-linux-64.result == 'cancelled' ||
243+
needs.test-linux-aarch64.result == 'cancelled' ||
244+
needs.test-windows.result == 'cancelled' }}; then
245+
exit 1
246+
fi
247+
fi
248+
exit 0

cuda_bindings/tests/nvml/test_pynvml.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,19 @@ def test_device_get_p2p_status(handles, index):
136136

137137
def test_device_get_power_usage(ngpus, handles):
138138
for i in range(ngpus):
139-
power_mwatts = nvml.device_get_power_usage(handles[i])
139+
try:
140+
power_mwatts = nvml.device_get_power_usage(handles[i])
141+
except nvml.NotSupportedError:
142+
pytest.skip("device_get_power_usage not supported")
140143
assert power_mwatts >= 0.0
141144

142145

143146
def test_device_get_total_energy_consumption(ngpus, handles):
144147
for i in range(ngpus):
145-
energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
148+
try:
149+
energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
150+
except nvml.NotSupportedError:
151+
pytest.skip("device_get_total_energy_consumption not supported")
146152
for j in range(10): # idle for 150 ms
147153
time.sleep(0.015) # and check for increase every 15 ms
148154
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])

cuda_core/cuda/core/_cpp/resource_handles.cpp

Lines changed: 41 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,15 @@ struct ContextBox {
171171
};
172172
} // namespace
173173

174-
ContextHandle create_context_handle_ref(CUcontext ctx) noexcept {
174+
ContextHandle create_context_handle_ref(CUcontext ctx) {
175175
auto box = std::make_shared<const ContextBox>(ContextBox{ctx});
176176
return ContextHandle(box, &box->resource);
177177
}
178178

179179
// Thread-local cache of primary contexts indexed by device ID
180180
static thread_local std::vector<ContextHandle> primary_context_cache;
181181

182-
ContextHandle get_primary_context(int device_id) noexcept {
182+
ContextHandle get_primary_context(int device_id) {
183183
// Check thread-local cache
184184
if (static_cast<size_t>(device_id) < primary_context_cache.size()) {
185185
if (auto cached = primary_context_cache[device_id]) {
@@ -212,7 +212,7 @@ ContextHandle get_primary_context(int device_id) noexcept {
212212
return h;
213213
}
214214

215-
ContextHandle get_current_context() noexcept {
215+
ContextHandle get_current_context() {
216216
GILReleaseGuard gil;
217217
CUcontext ctx = nullptr;
218218
if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) {
@@ -234,7 +234,7 @@ struct StreamBox {
234234
};
235235
} // namespace
236236

237-
StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept {
237+
StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
238238
GILReleaseGuard gil;
239239
CUstream stream;
240240
if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) {
@@ -252,12 +252,12 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int p
252252
return StreamHandle(box, &box->resource);
253253
}
254254

255-
StreamHandle create_stream_handle_ref(CUstream stream) noexcept {
255+
StreamHandle create_stream_handle_ref(CUstream stream) {
256256
auto box = std::make_shared<const StreamBox>(StreamBox{stream});
257257
return StreamHandle(box, &box->resource);
258258
}
259259

260-
StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) noexcept {
260+
StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
261261
if (!owner) {
262262
return create_stream_handle_ref(stream);
263263
}
@@ -281,12 +281,12 @@ StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) n
281281
return StreamHandle(box, &box->resource);
282282
}
283283

284-
StreamHandle get_legacy_stream() noexcept {
284+
StreamHandle get_legacy_stream() {
285285
static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY);
286286
return handle;
287287
}
288288

289-
StreamHandle get_per_thread_stream() noexcept {
289+
StreamHandle get_per_thread_stream() {
290290
static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD);
291291
return handle;
292292
}
@@ -301,7 +301,7 @@ struct EventBox {
301301
};
302302
} // namespace
303303

304-
EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept {
304+
EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) {
305305
GILReleaseGuard gil;
306306
CUevent event;
307307
if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
@@ -319,11 +319,11 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcep
319319
return EventHandle(box, &box->resource);
320320
}
321321

322-
EventHandle create_event_handle_noctx(unsigned int flags) noexcept {
322+
EventHandle create_event_handle_noctx(unsigned int flags) {
323323
return create_event_handle(ContextHandle{}, flags);
324324
}
325325

326-
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) noexcept {
326+
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
327327
GILReleaseGuard gil;
328328
CUevent event;
329329
if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
@@ -353,19 +353,24 @@ struct MemoryPoolBox {
353353

354354
// Helper to clear peer access before destroying a memory pool.
355355
// Works around nvbug 5698116: recycled pool handles inherit peer access state.
356-
static void clear_mempool_peer_access(CUmemoryPool pool) {
357-
int device_count = 0;
358-
if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
359-
return;
360-
}
356+
// Must be noexcept since it's called from a shared_ptr deleter.
357+
static void clear_mempool_peer_access(CUmemoryPool pool) noexcept {
358+
try {
359+
int device_count = 0;
360+
if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
361+
return;
362+
}
361363

362-
std::vector<CUmemAccessDesc> clear_access(device_count);
363-
for (int i = 0; i < device_count; ++i) {
364-
clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
365-
clear_access[i].location.id = i;
366-
clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
364+
std::vector<CUmemAccessDesc> clear_access(device_count);
365+
for (int i = 0; i < device_count; ++i) {
366+
clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
367+
clear_access[i].location.id = i;
368+
clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
369+
}
370+
p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort
371+
} catch (...) {
372+
// Swallow exceptions - this is best-effort cleanup in destructor context
367373
}
368-
p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort
369374
}
370375

371376
static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
@@ -381,7 +386,7 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
381386
return MemoryPoolHandle(box, &box->resource);
382387
}
383388

384-
MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
389+
MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) {
385390
GILReleaseGuard gil;
386391
CUmemoryPool pool;
387392
if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) {
@@ -390,12 +395,12 @@ MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
390395
return wrap_mempool_owned(pool);
391396
}
392397

393-
MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) noexcept {
398+
MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) {
394399
auto box = std::make_shared<const MemoryPoolBox>(MemoryPoolBox{pool});
395400
return MemoryPoolHandle(box, &box->resource);
396401
}
397402

398-
MemoryPoolHandle get_device_mempool(int device_id) noexcept {
403+
MemoryPoolHandle get_device_mempool(int device_id) {
399404
GILReleaseGuard gil;
400405
CUmemoryPool pool;
401406
if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) {
@@ -404,7 +409,7 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept {
404409
return create_mempool_handle_ref(pool);
405410
}
406411

407-
MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) noexcept {
412+
MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) {
408413
GILReleaseGuard gil;
409414
CUmemoryPool pool;
410415
auto handle_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(fd));
@@ -448,7 +453,7 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) no
448453
get_box(h)->h_stream = std::move(h_stream);
449454
}
450455

451-
DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) noexcept {
456+
DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
452457
GILReleaseGuard gil;
453458
CUdeviceptr ptr;
454459
if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) {
@@ -466,7 +471,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
466471
return DevicePtrHandle(box, &box->resource);
467472
}
468473

469-
DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept {
474+
DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
470475
GILReleaseGuard gil;
471476
CUdeviceptr ptr;
472477
if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) {
@@ -484,7 +489,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexce
484489
return DevicePtrHandle(box, &box->resource);
485490
}
486491

487-
DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
492+
DevicePtrHandle deviceptr_alloc(size_t size) {
488493
GILReleaseGuard gil;
489494
CUdeviceptr ptr;
490495
if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
@@ -502,7 +507,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
502507
return DevicePtrHandle(box, &box->resource);
503508
}
504509

505-
DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
510+
DevicePtrHandle deviceptr_alloc_host(size_t size) {
506511
GILReleaseGuard gil;
507512
void* ptr;
508513
if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
@@ -520,12 +525,12 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
520525
return DevicePtrHandle(box, &box->resource);
521526
}
522527

523-
DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) noexcept {
528+
DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
524529
auto box = std::make_shared<DevicePtrBox>(DevicePtrBox{ptr, StreamHandle{}});
525530
return DevicePtrHandle(box, &box->resource);
526531
}
527532

528-
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) noexcept {
533+
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
529534
if (!owner) {
530535
return deviceptr_create_ref(ptr);
531536
}
@@ -607,7 +612,7 @@ struct ExportDataKeyHash {
607612
static std::mutex ipc_ptr_cache_mutex;
608613
static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
609614

610-
DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) noexcept {
615+
DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
611616
auto data = const_cast<CUmemPoolPtrExportData*>(
612617
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
613618

@@ -639,14 +644,16 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
639644
new DevicePtrBox{ptr, h_stream},
640645
[h_pool, key](DevicePtrBox* b) {
641646
GILReleaseGuard gil;
642-
{
647+
try {
643648
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
644649
// Only erase if expired - avoids race where another thread
645650
// replaced the entry with a new import before we acquired the lock.
646651
auto it = ipc_ptr_cache.find(key);
647652
if (it != ipc_ptr_cache.end() && it->second.expired()) {
648653
ipc_ptr_cache.erase(it);
649654
}
655+
} catch (...) {
656+
// Cache cleanup is best-effort - swallow exceptions, since this runs inside a deleter
650657
}
651658
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
652659
delete b;

0 commit comments

Comments
 (0)