Backport nvml tests

mdboom · mdboom · commit a6c4e2052eb8 · 2026-01-28T08:49:52.000-05:00
diff --git a/cuda_bindings/cuda/bindings/_test_helpers/__init__.py b/cuda_bindings/cuda/bindings/_test_helpers/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+# This package contains test helper utilities that may also be useful for other libraries outside of `cuda.bindings`,
+# such as `cuda.core`. These utilities are not part of the public API of `cuda.bindings` and may change without notice.
diff --git a/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py b/cuda_bindings/cuda/bindings/_test_helpers/arch_check.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+from contextlib import contextmanager
+
+import pytest
+
+from cuda.bindings import _nvml as nvml
+
+
+@contextmanager
+def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None):
+    """Run the wrapped NVML call, expecting NotSupportedError on devices older than `expected_device_arch`.
+
+    Skips the test if unsupported; with `None`, "HAS_INFOROM" or an UNKNOWN device, support is not predictable.
+    """
+    device_arch = nvml.device_get_architecture(device)
+
+    if isinstance(expected_device_arch, nvml.DeviceArch):
+        expected_device_arch_int = int(expected_device_arch)
+    elif expected_device_arch == "FERMI":
+        expected_device_arch_int = 1
+    else:
+        expected_device_arch_int = 0
+
+    if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
+        # We don't /know/ if it will fail: if it does we skip the test, if it works we just continue.
+        # TODO: There are APIs that are documented as supported only if the
+        # device has an InfoROM, but I couldn't find a way to detect that.  For
+        # now, they are just handled as "possibly failing".
+        try:
+            yield
+        except nvml.NotSupportedError:
+            # The API call raised NotSupportedError, so we skip the test, but don't fail it
+            pytest.skip(
+                f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
+                f"on device '{nvml.device_get_name(device)}'"
+            )
+    elif int(device_arch) < expected_device_arch_int:
+        # In this case, we /know/ it will fail, and we want to assert that it does.
+        with pytest.raises(nvml.NotSupportedError):
+            yield
+        # Unsupported as expected, so skip the rest; a str arch (e.g. "FERMI") has no `.name`.
+        expected_name = getattr(expected_device_arch, "name", expected_device_arch)
+        pytest.skip(f"Unsupported before {expected_name}, got {nvml.device_get_name(device)}")
+    else:
+        # In this case, we /know/ it should work, and if it fails, the test should fail.
+        yield
diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py
@@ -6,6 +6,7 @@
 import pytest
 
 from cuda.bindings import _nvml as nvml
+from cuda.bindings._test_helpers.arch_check import unsupported_before  # noqa: F401
 
 
 class NVMLInitializer:
diff --git a/cuda_bindings/tests/nvml/test_compute_mode.py b/cuda_bindings/tests/nvml/test_compute_mode.py
@@ -8,6 +8,8 @@
 
 from cuda.bindings import _nvml as nvml
 
+from .conftest import unsupported_before
+
 COMPUTE_MODES = [
     nvml.ComputeMode.COMPUTEMODE_DEFAULT,
     nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -17,18 +19,11 @@
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
 def test_compute_mode_supported_nonroot(all_devices):
-    skip_reasons = set()
     for device in all_devices:
-        try:
+        with unsupported_before(device, None):
             original_compute_mode = nvml.device_get_compute_mode(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
-            continue
 
         for cm in COMPUTE_MODES:
             with pytest.raises(nvml.NoPermissionError):
                 nvml.device_set_compute_mode(device, cm)
             assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
-
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))
diff --git a/cuda_bindings/tests/nvml/test_gpu.py b/cuda_bindings/tests/nvml/test_gpu.py
@@ -6,6 +6,7 @@
 from cuda.bindings import _nvml as nvml
 
 from . import util
+from .conftest import unsupported_before
 
 
 def test_gpu_get_module_id(nvml_init):
@@ -24,23 +25,14 @@ def test_gpu_get_module_id(nvml_init):
 
 
 def test_gpu_get_platform_info(all_devices):
-    skip_reasons = set()
     for device in all_devices:
         if util.is_vgpu(device):
-            skip_reasons.add(f"Not supported on vGPU device {device}")
-            continue
+            pytest.skip(f"Not supported on vGPU device {device}")
 
-        # TODO
-        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
-        #     test_utils.skip_test("Not supported on chip before Blackwell")
+        # Documentation says Blackwell or newer only, but this does seem to pass
+        # on some older GPUs.
 
-        try:
+        with unsupported_before(device, None):
             platform_info = nvml.device_get_platform_info(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
-            continue
-
-        assert isinstance(platform_info, (nvml.PlatformInfo_v2, nvml.PlatformInfo_v1))
 
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))
+        assert isinstance(platform_info, (nvml.PlatformInfo_v1, nvml.PlatformInfo_v2))
diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import sys
+import warnings
 
 import pytest
 
@@ -17,6 +18,23 @@ def assert_nvml_is_uninitialized():
         nvml.device_get_count_v2()
 
 
+def test_devices_are_the_same_architecture(all_devices):
+    """Warn when the system mixes device architectures.
+
+    Tests using `unsupported_before` generally skip the entire test after the
+    first unsupported device, so devices of a different architecture that come
+    later would not be tested properly.  This (hopefully rare) case is reported.
+    """
+    all_arches = {nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices}
+
+    if len(all_arches) > 1:
+        warnings.warn(  # noqa: B028
+            f"System has devices of multiple architectures ({', '.join(sorted(x.name for x in all_arches))}). "
+            " Some tests may be skipped unexpectedly",
+            UserWarning,
+        )
+
+
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
 def test_init_ref_count():
     """
diff --git a/cuda_bindings/tests/nvml/test_nvlink.py b/cuda_bindings/tests/nvml/test_nvlink.py
@@ -11,14 +11,14 @@ def test_nvlink_get_link_count(all_devices):
     """
     for device in all_devices:
         fields = nvml.FieldValue(1)
-        fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
+        fields[0].field_id = nvml.FieldId.DEV_NVLINK_LINK_COUNT
         value = nvml.device_get_field_values(device, fields)[0]
         assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
             f"Unexpected return {value.nvml_return} for link count field query"
         )
 
         # Use the alternative argument to device_get_field_values
-        value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
+        value = nvml.device_get_field_values(device, [nvml.FieldId.DEV_NVLINK_LINK_COUNT])[0]
         assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
             f"Unexpected return {value.nvml_return} for link count field query"
         )
diff --git a/cuda_bindings/tests/nvml/test_pynvml.py b/cuda_bindings/tests/nvml/test_pynvml.py
@@ -11,6 +11,7 @@
 from cuda.bindings import _nvml as nvml
 
 from . import util
+from .conftest import unsupported_before
 
 XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."
 
@@ -67,7 +68,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
 def test_device_get_memory_affinity(handles, scope):
     size = 1024
     for handle in handles:
-        node_set = nvml.device_get_memory_affinity(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            node_set = nvml.device_get_memory_affinity(handle, size, scope)
         assert node_set is not None
         assert len(node_set) == size
 
@@ -77,7 +79,8 @@ def test_device_get_memory_affinity(handles, scope):
 def test_device_get_cpu_affinity_within_scope(handles, scope):
     size = 1024
     for handle in handles:
-        cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
         assert cpu_set is not None
         assert len(cpu_set) == size
 
@@ -137,22 +140,22 @@ def test_device_get_p2p_status(handles, index):
 
 def test_device_get_power_usage(ngpus, handles):
     for i in range(ngpus):
-        try:
+        # Note: documentation says this is supported on Fermi or newer,
+        # but in practice it fails on some later architectures.
+        with unsupported_before(handles[i], None):
             power_mwatts = nvml.device_get_power_usage(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_power_usage not supported")
         assert power_mwatts >= 0.0
 
 
 def test_device_get_total_energy_consumption(ngpus, handles):
     for i in range(ngpus):
-        try:
+        with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
             energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_total_energy_consumption not supported")
+
         for j in range(10):  # idle for 150 ms
             time.sleep(0.015)  # and check for increase every 15 ms
-            energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
+            with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
+                energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
             assert energy_mjoules2 >= energy_mjoules1
             if energy_mjoules2 > energy_mjoules1:
                 break
@@ -183,7 +186,8 @@ def test_device_get_memory_info(ngpus, handles):
 
 def test_device_get_utilization_rates(ngpus, handles):
     for i in range(ngpus):
-        urate = nvml.device_get_utilization_rates(handles[i])
+        with unsupported_before(handles[i], "FERMI"):
+            urate = nvml.device_get_utilization_rates(handles[i])
         assert urate.gpu >= 0
         assert urate.memory >= 0
 
@@ -240,7 +244,8 @@ def test_device_get_utilization_rates(ngpus, handles):
 
 def test_device_get_pcie_throughput(ngpus, handles):
     for i in range(ngpus):
-        tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
+        with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
+            tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
         assert tx_bytes_tp >= 0
         rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
         assert rx_bytes_tp >= 0
@@ -272,10 +277,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
 def test_device_get_nvlink_capability(ngpus, handles, cap_type):
     for i in range(ngpus):
         for j in range(nvml.NVLINK_MAX_LINKS):
-            try:
+            # By the documentation, this should be supported on PASCAL or newer,
+            # but this also seems to fail on newer.
+            with unsupported_before(handles[i], None):
                 cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
-            except nvml.NotSupportedError:
-                pytest.skip("NVLink capability not supported")
             assert cap >= 0
 
 
diff --git a/cuda_pathfinder/cuda/pathfinder/__init__.py b/cuda_pathfinder/cuda/pathfinder/__init__.py
@@ -1,11 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.pathfinder._version import __version__  # noqa: F401
-
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError as DynamicLibNotFoundError
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL as LoadedDL
 from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import load_nvidia_dynamic_lib as load_nvidia_dynamic_lib
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LIBNAMES as SUPPORTED_NVIDIA_LIBNAMES,  # noqa: F401
 )
+from cuda.pathfinder._version import __version__  # noqa: F401