NVIDIA
diff --git a/‎.gitattributes‎
Lines changed: 3 additions & 1 deletion b/‎.gitattributes‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 3 additions & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎cuda_bindings/cuda/bindings/_test_helpers/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎cuda_bindings/cuda/bindings/_test_helpers/__init__.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cuda_bindings/cuda/bindings/_test_helpers/arch_check.py‎
Lines changed: 48 additions & 0 deletions b/‎cuda_bindings/cuda/bindings/_test_helpers/arch_check.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎cuda_bindings/tests/nvml/conftest.py‎
Lines changed: 1 addition & 0 deletions b/‎cuda_bindings/tests/nvml/conftest.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cuda_bindings/tests/nvml/test_compute_mode.py‎
Lines changed: 3 additions & 8 deletions b/‎cuda_bindings/tests/nvml/test_compute_mode.py‎
Lines changed: 3 additions & 8 deletions
diff --git a/‎cuda_bindings/tests/nvml/test_gpu.py‎
Lines changed: 5 additions & 13 deletions b/‎cuda_bindings/tests/nvml/test_gpu.py‎
Lines changed: 5 additions & 13 deletions
diff --git a/‎cuda_bindings/tests/nvml/test_init.py‎
Lines changed: 18 additions & 0 deletions b/‎cuda_bindings/tests/nvml/test_init.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎cuda_bindings/tests/nvml/test_pynvml.py‎
Lines changed: 19 additions & 14 deletions b/‎cuda_bindings/tests/nvml/test_pynvml.py‎
Lines changed: 19 additions & 14 deletions
diff --git a/‎cuda_core/cuda/core/_device.pyx‎
Lines changed: 23 additions & 0 deletions b/‎cuda_core/cuda/core/_device.pyx‎
Lines changed: 23 additions & 0 deletions
@@ -6,7 +6,9 @@ cuda/_version.py export-subst
 # we do not own any headers checked in, don't touch them
 *.h binary
 *.hpp binary
-# Exception: headers we own (cuda_core C++ implementation)
+# Exception: headers we own
+cuda_bindings/cuda/bindings/_bindings/*.h -binary text diff
+cuda_bindings/cuda/bindings/_lib/*.h -binary text diff
 cuda_core/cuda/core/_cpp/*.h -binary text diff
 cuda_core/cuda/core/_cpp/*.hpp -binary text diff
 # git should not convert line endings in PNG files
 
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -30,6 +30,7 @@ repos:
         additional_dependencies:
           - https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl
         exclude: '.*pixi\.lock'
+        args: ["--fix"]
 
       - id: no-markdown-in-docs-source
         name: Prevent markdown files in docs/source directories
@@ -89,5 +90,6 @@ repos:
         args: [--no-pycodestyle]
         exclude: ^cuda_bindings/
 
+
 default_language_version:
       python: python3
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+# This package contains test helper utilities that may also be useful for other libraries outside of `cuda.bindings`,
+# such as `cuda.core`. These utilities are not part of the public API of `cuda.bindings` and may change without notice.
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+from contextlib import contextmanager
+
+import pytest
+from cuda.bindings import _nvml as nvml
+
+
+@contextmanager
+def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None):
+    device_arch = nvml.device_get_architecture(device)
+
+    if isinstance(expected_device_arch, nvml.DeviceArch):
+        expected_device_arch_int = int(expected_device_arch)
+    elif expected_device_arch == "FERMI":
+        expected_device_arch_int = 1
+    else:
+        expected_device_arch_int = 0
+
+    if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
+        # In this case, we don't /know/ if it will fail, but we are ok if it
+        # does or does not.
+
+        # TODO: There are APIs that are documented as supported only if the
+        # device has an InfoROM, but I couldn't find a way to detect that.  For
+        # now, they are just handled as "possibly failing".
+
+        try:
+            yield
+        except nvml.NotSupportedError:
+            # The API call raised NotSupportedError, so we skip the test, but
+            # don't fail it
+            pytest.skip(
+                f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
+                f"on device '{nvml.device_get_name(device)}'"
+            )
+        # If the API call worked, just continue
+    elif int(device_arch) < expected_device_arch_int:
+        # In this case, we /know/ if will fail, and we want to assert that it does.
+        with pytest.raises(nvml.NotSupportedError):
+            yield
+        # The above call was unsupported, so the rest of the test is skipped
+        pytest.skip(f"Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}")
+    else:
+        # In this case, we /know/ it should work, and if it fails, the test should fail.
+        yield
@@ -5,6 +5,7 @@
 
 import pytest
 from cuda.bindings import _nvml as nvml
+from cuda.bindings._test_helpers.arch_check import unsupported_before  # noqa: F401
 
 
 class NVMLInitializer:
 
@@ -7,6 +7,8 @@
 import pytest
 from cuda.bindings import _nvml as nvml
 
+from .conftest import unsupported_before
+
 COMPUTE_MODES = [
     nvml.ComputeMode.COMPUTEMODE_DEFAULT,
     nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -16,18 +18,11 @@
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
 def test_compute_mode_supported_nonroot(all_devices):
-    skip_reasons = set()
     for device in all_devices:
-        try:
+        with unsupported_before(device, None):
             original_compute_mode = nvml.device_get_compute_mode(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
-            continue
 
         for cm in COMPUTE_MODES:
             with pytest.raises(nvml.NoPermissionError):
                 nvml.device_set_compute_mode(device, cm)
             assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
-
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))
@@ -5,6 +5,7 @@
 from cuda.bindings import _nvml as nvml
 
 from . import util
+from .conftest import unsupported_before
 
 
 def test_gpu_get_module_id(nvml_init):
@@ -23,23 +24,14 @@ def test_gpu_get_module_id(nvml_init):
 
 
 def test_gpu_get_platform_info(all_devices):
-    skip_reasons = set()
     for device in all_devices:
         if util.is_vgpu(device):
-            skip_reasons.add(f"Not supported on vGPU device {device}")
-            continue
+            pytest.skip(f"Not supported on vGPU device {device}")
 
-        # TODO
-        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
-        #     test_utils.skip_test("Not supported on chip before Blackwell")
+        # Documentation says Blackwell or newer only, but this does seem to pass
+        # on some newer GPUs.
 
-        try:
+        with unsupported_before(device, None):
             platform_info = nvml.device_get_platform_info(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
-            continue
 
         assert isinstance(platform_info, nvml.PlatformInfo_v2)
-
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import sys
+import warnings
 
 import pytest
 from cuda.bindings import _nvml as nvml
@@ -16,6 +17,23 @@ def assert_nvml_is_uninitialized():
         nvml.device_get_count_v2()
 
 
+def test_devices_are_the_same_architecture(all_devices):
+    # The tests in this directory that use `unsupported_before` will generally
+    # skip the entire test after the first device that isn't supported is found.
+    # This means that if subsequent devices are of a different architecture,
+    # they won't be tested properly.  This tests for the (hopefully rare) case
+    # where a system has devices of different architectures and produces a warning.
+
+    all_arches = set(nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices)
+
+    if len(all_arches) > 1:
+        warnings.warn(  # noqa: B028
+            f"System has devices of multiple architectures ({', '.join(x.name for x in all_arches)}). "
+            f" Some tests may be skipped unexpectedly",
+            UserWarning,
+        )
+
+
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
 def test_init_ref_count():
     """
 
@@ -10,6 +10,7 @@
 from cuda.bindings import _nvml as nvml
 
 from . import util
+from .conftest import unsupported_before
 
 XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."
 
@@ -66,7 +67,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
 def test_device_get_memory_affinity(handles, scope):
     size = 1024
     for handle in handles:
-        node_set = nvml.device_get_memory_affinity(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            node_set = nvml.device_get_memory_affinity(handle, size, scope)
         assert node_set is not None
         assert len(node_set) == size
 
@@ -76,7 +78,8 @@ def test_device_get_memory_affinity(handles, scope):
 def test_device_get_cpu_affinity_within_scope(handles, scope):
     size = 1024
     for handle in handles:
-        cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
         assert cpu_set is not None
         assert len(cpu_set) == size
 
@@ -136,22 +139,22 @@ def test_device_get_p2p_status(handles, index):
 
 def test_device_get_power_usage(ngpus, handles):
     for i in range(ngpus):
-        try:
+        # Note: documentation says this is supported on Fermi or newer,
+        # but in practice it fails on some later architectures.
+        with unsupported_before(handles[i], None):
             power_mwatts = nvml.device_get_power_usage(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_power_usage not supported")
         assert power_mwatts >= 0.0
 
 
 def test_device_get_total_energy_consumption(ngpus, handles):
     for i in range(ngpus):
-        try:
+        with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
             energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_total_energy_consumption not supported")
+
         for j in range(10):  # idle for 150 ms
             time.sleep(0.015)  # and check for increase every 15 ms
-            energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
+            with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
+                energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
             assert energy_mjoules2 >= energy_mjoules1
             if energy_mjoules2 > energy_mjoules1:
                 break
@@ -182,7 +185,8 @@ def test_device_get_memory_info(ngpus, handles):
 
 def test_device_get_utilization_rates(ngpus, handles):
     for i in range(ngpus):
-        urate = nvml.device_get_utilization_rates(handles[i])
+        with unsupported_before(handles[i], "FERMI"):
+            urate = nvml.device_get_utilization_rates(handles[i])
         assert urate.gpu >= 0
         assert urate.memory >= 0
 
@@ -239,7 +243,8 @@ def test_device_get_utilization_rates(ngpus, handles):
 
 def test_device_get_pcie_throughput(ngpus, handles):
     for i in range(ngpus):
-        tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
+        with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
+            tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
         assert tx_bytes_tp >= 0
         rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
         assert rx_bytes_tp >= 0
@@ -271,10 +276,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
 def test_device_get_nvlink_capability(ngpus, handles, cap_type):
     for i in range(ngpus):
         for j in range(nvml.NVLINK_MAX_LINKS):
-            try:
+            # By the documentation, this should be supported on PASCAL or newer,
+            # but this also seems to fail on newer.
+            with unsupported_before(handles[i], None):
                 cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
-            except nvml.NotSupportedError:
-                pytest.skip("NVLink capability not supported")
             assert cap >= 0
 
 
 
@@ -1034,6 +1034,29 @@ class Device:
         total = system.get_num_devices()
         return tuple(cls(device_id) for device_id in range(total))
 
+    def to_system_device(self) -> 'cuda.core.system.Device':
+        """
+        Get the corresponding :class:`cuda.core.system.Device` (which is used
+        for NVIDIA Machine Library (NVML) access) for this
+        :class:`cuda.core.Device` (which is used for CUDA access).
+
+        The devices are mapped to one another by their UUID.
+
+        Returns
+        -------
+        cuda.core.system.Device
+            The corresponding system-level device instance used for NVML access.
+
+        Raises
+        ------
+        RuntimeError
+            If ``CUDA_BINDINGS_NVML_IS_COMPATIBLE`` is false, i.e. the installed
+            ``cuda_bindings`` is older than 13.1.2 / 12.9.6.
+        """
+        # Imported inside the method rather than at module level.
+        # NOTE(review): presumably so that cuda.core.Device does not depend on
+        # the NVML-backed system module at import time -- confirm.
+        from cuda.core.system._system import CUDA_BINDINGS_NVML_IS_COMPATIBLE
+
+        if not CUDA_BINDINGS_NVML_IS_COMPATIBLE:
+            raise RuntimeError(
+                "cuda.core.system.Device requires cuda_bindings 13.1.2+ or 12.9.6+"
+            )
+
+        # Only import the system Device once compatibility has been verified.
+        from cuda.core.system import Device as SystemDevice
+        # Look up the NVML device by the UUID of this CUDA device.
+        return SystemDevice(uuid=self.uuid)
+
     @property
     def device_id(self) -> int:
         """Return device ordinal."""