Skip to content

Commit a6c4e20

Browse files
committed
Backport nvml tests
1 parent d310335 commit a6c4e20

9 files changed

Lines changed: 105 additions & 40 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
5+
# This package contains test helper utilities that may also be useful for other libraries outside of `cuda.bindings`,
6+
# such as `cuda.core`. These utilities are not part of the public API of `cuda.bindings` and may change without notice.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
5+
from contextlib import contextmanager
6+
7+
import pytest
8+
9+
from cuda.bindings import _nvml as nvml
10+
11+
12+
@contextmanager
def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None):
    """Wrap an NVML call that may be unsupported on older device architectures.

    The architecture of ``device`` is queried and compared against
    ``expected_device_arch``; one of three behaviors is then applied to the
    body of the ``with`` block:

    * Support is uncertain (``expected_device_arch`` is ``None`` or
      ``"HAS_INFOROM"``, or the device architecture is ``UNKNOWN``): the body
      may succeed or raise ``nvml.NotSupportedError``. On the error the
      current test is skipped.
    * The device is older than ``expected_device_arch``: the body must raise
      ``nvml.NotSupportedError`` (asserted with ``pytest.raises``), after
      which the current test is skipped.
    * Otherwise the body is expected to work and any exception propagates.

    Args:
        device: Device handle passed to ``nvml.device_get_architecture`` and
            ``nvml.device_get_name``.
        expected_device_arch: Oldest architecture on which the wrapped call is
            expected to be supported. May be an ``nvml.DeviceArch`` member,
            the string ``"FERMI"``, the string ``"HAS_INFOROM"``, or ``None``.
    """
    device_arch = nvml.device_get_architecture(device)

    if isinstance(expected_device_arch, nvml.DeviceArch):
        expected_device_arch_int = int(expected_device_arch)
    elif expected_device_arch == "FERMI":
        # NOTE(review): FERMI is passed as a string and mapped to 1 here,
        # presumably because DeviceArch has no FERMI member - confirm.
        expected_device_arch_int = 1
    else:
        expected_device_arch_int = 0

    if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
        # In this case, we don't /know/ if it will fail, but we are ok if it
        # does or does not.

        # TODO: There are APIs that are documented as supported only if the
        # device has an InfoROM, but I couldn't find a way to detect that. For
        # now, they are just handled as "possibly failing".

        try:
            yield
        except nvml.NotSupportedError:
            # The API call raised NotSupportedError, so we skip the test, but
            # don't fail it
            pytest.skip(
                f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
                f"on device '{nvml.device_get_name(device)}'"
            )
        # If the API call worked, just continue
    elif int(device_arch) < expected_device_arch_int:
        # In this case, we /know/ it will fail, and we want to assert that it does.
        with pytest.raises(nvml.NotSupportedError):
            yield
        # The above call was unsupported, so the rest of the test is skipped.
        # The expected architecture may be a plain string (e.g. "FERMI"), which
        # has no `.name`; fall back to the value itself in that case.
        expected_name = getattr(expected_device_arch, "name", expected_device_arch)
        pytest.skip(f"Unsupported before {expected_name}, got {nvml.device_get_name(device)}")
    else:
        # In this case, we /know/ it should work, and if it fails, the test should fail.
        yield

cuda_bindings/tests/nvml/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77

88
from cuda.bindings import _nvml as nvml
9+
from cuda.bindings._test_helpers.arch_check import unsupported_before # noqa: F401
910

1011

1112
class NVMLInitializer:

cuda_bindings/tests/nvml/test_compute_mode.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
from cuda.bindings import _nvml as nvml
1010

11+
from .conftest import unsupported_before
12+
1113
COMPUTE_MODES = [
1214
nvml.ComputeMode.COMPUTEMODE_DEFAULT,
1315
nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -17,18 +19,11 @@
1719

1820
@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
1921
def test_compute_mode_supported_nonroot(all_devices):
20-
skip_reasons = set()
2122
for device in all_devices:
22-
try:
23+
with unsupported_before(device, None):
2324
original_compute_mode = nvml.device_get_compute_mode(device)
24-
except nvml.NotSupportedError:
25-
skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
26-
continue
2725

2826
for cm in COMPUTE_MODES:
2927
with pytest.raises(nvml.NoPermissionError):
3028
nvml.device_set_compute_mode(device, cm)
3129
assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
32-
33-
if skip_reasons:
34-
pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_gpu.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from cuda.bindings import _nvml as nvml
77

88
from . import util
9+
from .conftest import unsupported_before
910

1011

1112
def test_gpu_get_module_id(nvml_init):
@@ -24,23 +25,14 @@ def test_gpu_get_module_id(nvml_init):
2425

2526

2627
def test_gpu_get_platform_info(all_devices):
27-
skip_reasons = set()
2828
for device in all_devices:
2929
if util.is_vgpu(device):
30-
skip_reasons.add(f"Not supported on vGPU device {device}")
31-
continue
30+
pytest.skip(f"Not supported on vGPU device {device}")
3231

33-
# TODO
34-
# if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
35-
# test_utils.skip_test("Not supported on chip before Blackwell")
32+
# Documentation says Blackwell or newer only, but this does seem to pass
33+
# on some newer GPUs.
3634

37-
try:
35+
with unsupported_before(device, None):
3836
platform_info = nvml.device_get_platform_info(device)
39-
except nvml.NotSupportedError:
40-
skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
41-
continue
42-
43-
assert isinstance(platform_info, (nvml.PlatformInfo_v2, nvml.PlatformInfo_v1))
4437

45-
if skip_reasons:
46-
pytest.skip(" ; ".join(skip_reasons))
38+
assert isinstance(platform_info, (nvml.PlatformInfo_v1, nvml.PlatformInfo_v2))

cuda_bindings/tests/nvml/test_init.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

44
import sys
5+
import warnings
56

67
import pytest
78

@@ -17,6 +18,23 @@ def assert_nvml_is_uninitialized():
1718
nvml.device_get_count_v2()
1819

1920

21+
def test_devices_are_the_same_architecture(all_devices):
    """Emit a warning (without failing) when the system mixes GPU architectures."""
    # The tests in this directory that use `unsupported_before` will generally
    # skip the entire test after the first device that isn't supported is found.
    # This means that if subsequent devices are of a different architecture,
    # they won't be tested properly. This tests for the (hopefully rare) case
    # where a system has devices of different architectures and produces a warning.

    all_arches = {nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices}

    if len(all_arches) > 1:
        # Set iteration order is arbitrary; sort the names so the warning text
        # is stable from run to run.
        arch_names = ", ".join(sorted(arch.name for arch in all_arches))
        warnings.warn(  # noqa: B028
            f"System has devices of multiple architectures ({arch_names}). "
            " Some tests may be skipped unexpectedly",
            UserWarning,
        )
36+
37+
2038
@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
2139
def test_init_ref_count():
2240
"""

cuda_bindings/tests/nvml/test_nvlink.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ def test_nvlink_get_link_count(all_devices):
1111
"""
1212
for device in all_devices:
1313
fields = nvml.FieldValue(1)
14-
fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
14+
fields[0].field_id = nvml.FieldId.DEV_NVLINK_LINK_COUNT
1515
value = nvml.device_get_field_values(device, fields)[0]
1616
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
1717
f"Unexpected return {value.nvml_return} for link count field query"
1818
)
1919

2020
# Use the alternative argument to device_get_field_values
21-
value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
21+
value = nvml.device_get_field_values(device, [nvml.FieldId.DEV_NVLINK_LINK_COUNT])[0]
2222
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
2323
f"Unexpected return {value.nvml_return} for link count field query"
2424
)

cuda_bindings/tests/nvml/test_pynvml.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from cuda.bindings import _nvml as nvml
1212

1313
from . import util
14+
from .conftest import unsupported_before
1415

1516
XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."
1617

@@ -67,7 +68,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
6768
def test_device_get_memory_affinity(handles, scope):
6869
size = 1024
6970
for handle in handles:
70-
node_set = nvml.device_get_memory_affinity(handle, size, scope)
71+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
72+
node_set = nvml.device_get_memory_affinity(handle, size, scope)
7173
assert node_set is not None
7274
assert len(node_set) == size
7375

@@ -77,7 +79,8 @@ def test_device_get_memory_affinity(handles, scope):
7779
def test_device_get_cpu_affinity_within_scope(handles, scope):
7880
size = 1024
7981
for handle in handles:
80-
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
82+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
83+
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
8184
assert cpu_set is not None
8285
assert len(cpu_set) == size
8386

@@ -137,22 +140,22 @@ def test_device_get_p2p_status(handles, index):
137140

138141
def test_device_get_power_usage(ngpus, handles):
139142
for i in range(ngpus):
140-
try:
143+
# Note: documentation says this is supported on Fermi or newer,
144+
# but in practice it fails on some later architectures.
145+
with unsupported_before(handles[i], None):
141146
power_mwatts = nvml.device_get_power_usage(handles[i])
142-
except nvml.NotSupportedError:
143-
pytest.skip("device_get_power_usage not supported")
144147
assert power_mwatts >= 0.0
145148

146149

147150
def test_device_get_total_energy_consumption(ngpus, handles):
148151
for i in range(ngpus):
149-
try:
152+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
150153
energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
151-
except nvml.NotSupportedError:
152-
pytest.skip("device_get_total_energy_consumption not supported")
154+
153155
for j in range(10): # idle for 150 ms
154156
time.sleep(0.015) # and check for increase every 15 ms
155-
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
157+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
158+
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
156159
assert energy_mjoules2 >= energy_mjoules1
157160
if energy_mjoules2 > energy_mjoules1:
158161
break
@@ -183,7 +186,8 @@ def test_device_get_memory_info(ngpus, handles):
183186

184187
def test_device_get_utilization_rates(ngpus, handles):
185188
for i in range(ngpus):
186-
urate = nvml.device_get_utilization_rates(handles[i])
189+
with unsupported_before(handles[i], "FERMI"):
190+
urate = nvml.device_get_utilization_rates(handles[i])
187191
assert urate.gpu >= 0
188192
assert urate.memory >= 0
189193

@@ -240,7 +244,8 @@ def test_device_get_utilization_rates(ngpus, handles):
240244

241245
def test_device_get_pcie_throughput(ngpus, handles):
242246
for i in range(ngpus):
243-
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
247+
with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
248+
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
244249
assert tx_bytes_tp >= 0
245250
rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
246251
assert rx_bytes_tp >= 0
@@ -272,10 +277,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
272277
def test_device_get_nvlink_capability(ngpus, handles, cap_type):
273278
for i in range(ngpus):
274279
for j in range(nvml.NVLINK_MAX_LINKS):
275-
try:
280+
# By the documentation, this should be supported on PASCAL or newer,
281+
# but this also seems to fail on newer.
282+
with unsupported_before(handles[i], None):
276283
cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
277-
except nvml.NotSupportedError:
278-
pytest.skip("NVLink capability not supported")
279284
assert cap >= 0
280285

281286

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
from cuda.pathfinder._version import __version__ # noqa: F401
5-
64
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
75
from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
86
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
97
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
108
SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES, # noqa: F401
119
)
10+
from cuda.pathfinder._version import __version__ # noqa: F401

0 commit comments

Comments
 (0)