Merge branch 'main' into issue2027

mdboom · web-flow · commit baa3e6f3af58 · 2026-05-06T10:21:13.000-04:00
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
@@ -11,7 +11,6 @@
 import glob
 import os
 import re
-import subprocess
 import sys
 import tempfile
 import zipfile
@@ -185,28 +184,6 @@ def get_sources(mod_name):
         # related to free-threading builds.
         extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"]
 
-    # On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC
-    # linker can resolve the AOTI symbols (they live in torch_cpu.dll at
-    # runtime).  We generate the .lib from a .def file at build time.
-    # Note: aoti_torch_get_current_cuda_stream lives in torch_cuda.dll and
-    # is resolved lazily at runtime (not via the stub lib) — see
-    # _tensor_bridge.pyx.
-    _aoti_extra_link_args = []
-    if sys.platform == "win32":
-        _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def")
-        _lib_file = os.path.join("build", "aoti_shim.lib")
-        os.makedirs("build", exist_ok=True)
-        subprocess.check_call(  # noqa: S603
-            ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"],  # noqa: S607
-            stdout=subprocess.DEVNULL,
-        )
-        _aoti_extra_link_args = [_lib_file]
-
-    def get_extra_link_args(mod_name):
-        if mod_name == "_tensor_bridge" and _aoti_extra_link_args:
-            return extra_link_args + _aoti_extra_link_args
-        return extra_link_args
-
     ext_modules = tuple(
         Extension(
             f"cuda.core.{mod.replace(os.path.sep, '.')}",
@@ -218,7 +195,7 @@ def get_extra_link_args(mod_name):
             + all_include_dirs,
             language="c++",
             extra_compile_args=extra_compile_args,
-            extra_link_args=get_extra_link_args(mod),
+            extra_link_args=extra_link_args,
         )
         for mod in module_names()
     )
diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx
@@ -106,11 +106,18 @@ cdef class SMResourceOptions:
         Preferred co-scheduled SM count; the driver tries to satisfy
         this but may fall back to ``coscheduled_sm_count``.
         (Default to ``None``)
+    backfill : bool or Sequence[bool], optional
+        If ``True``, allow the driver to relax the co-scheduling
+        constraint when assigning SMs. This enables requesting
+        arbitrary aligned SM counts that the driver would otherwise
+        reject due to hardware topology constraints.
+        (Default to ``False``)
     """
 
     count: int | SequenceABC | None = None
     coscheduled_sm_count: int | SequenceABC | None = None
     preferred_coscheduled_sm_count: int | SequenceABC | None = None
+    backfill: bool | SequenceABC = False
 
 
 @dataclass
@@ -172,6 +179,12 @@ cdef inline int _resolve_group_count(SMResourceOptions options) except?-1:
         n_groups,
         count_is_scalar,
     )
+    _validate_split_field_length(
+        options.backfill,
+        "backfill",
+        n_groups,
+        count_is_scalar,
+    )
     return n_groups
 
 
@@ -243,6 +256,7 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
         cdef list counts = _broadcast_field(options.count, n_groups)
         cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups)
         cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups)
+        cdef list backfills = _broadcast_field(options.backfill, n_groups)
         cdef int i
 
         for i in range(n_groups):
@@ -252,7 +266,10 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
                 params[i].coscheduledSmCount = <unsigned int>(coscheduled[i])
             if preferred[i] is not None:
                 params[i].preferredCoscheduledSmCount = <unsigned int>(preferred[i])
-            params[i].flags = 0
+            params[i].flags = (
+                cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL
+                if backfills[i] else 0
+            )
         return 0
 
 
diff --git a/cuda_core/cuda/core/_include/aoti_shim.def b/cuda_core/cuda/core/_include/aoti_shim.def
@@ -4,7 +4,7 @@
 ; At runtime the symbols resolve from torch_cpu.dll (loaded by 'import torch').
 ;
 ; IMPORTANT: Keep this export list in sync with the AOTI_SHIM_API declarations
-; in aoti_shim.h. build_hooks.py turns this file into the stub import library
+; in aoti_shim.h. setup.py turns this file into the stub import library
 ; that MSVC uses to link _tensor_bridge, so any added/removed/renamed AOTI
 ; symbol must be updated in both files.
 LIBRARY torch_cpu.dll
diff --git a/cuda_core/cuda/core/_include/aoti_shim.h b/cuda_core/cuda/core/_include/aoti_shim.h
@@ -52,10 +52,10 @@ typedef struct AtenTensorOpaque* AtenTensorHandle;
 
 /*
  * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with
- * aoti_shim.def.  On Windows, build_hooks.py turns that .def file into the
- * stub import library that MSVC needs to link _tensor_bridge without making
- * PyTorch a build-time dependency.  If you add, remove, or rename an
- * imported AOTI symbol here, update aoti_shim.def in the same change.
+ * aoti_shim.def.  On Windows, setup.py turns that .def file into a stub
+ * import library during build_ext so MSVC can link _tensor_bridge without
+ * making PyTorch a build-time dependency.  If you add, remove, or rename an
+ * imported AOTI symbol here, update aoti_shim.def in the same change.
  *
  * Exception: aoti_torch_get_current_cuda_stream lives in torch_cuda (not
  * torch_cpu) and is resolved lazily at runtime — see _tensor_bridge.pyx.
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
@@ -245,6 +245,9 @@ execution.
 CUDA system information and NVIDIA Management Library (NVML)
 ------------------------------------------------------------
 
+.. note::
+   ``cuda.core.system`` support requires ``cuda_bindings`` 12.9.6 or later in the 12.x series, or 13.2.0 or later.
+
 Basic functions
 ```````````````
 
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+from pathlib import Path
 
 import build_hooks  # our build backend
 from setuptools import setup
@@ -11,11 +12,63 @@
 
 nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2))
 coverage_mode = bool(int(os.environ.get("CUDA_PYTHON_COVERAGE", "0")))
+_ROOT_DIR = Path(__file__).resolve().parent
+_AOTI_SHIM_DEF_FILE = _ROOT_DIR / "cuda" / "core" / "_include" / "aoti_shim.def"
+_AOTI_SHIM_LIB_FILE = _ROOT_DIR / "build" / "aoti_shim.lib"
+_TENSOR_BRIDGE_EXT_NAME = "cuda.core._tensor_bridge"
+
+
+def _ensure_compiler_initialized(compiler, plat_name):
+    initialize = getattr(compiler, "initialize", None)
+    if callable(initialize) and not getattr(compiler, "initialized", False):
+        if plat_name is None:
+            initialize()
+        else:
+            initialize(plat_name)
+
+
+def _build_aoti_shim_lib(compiler):
+    # Reuse setuptools' initialized MSVC compiler instead of rediscovering
+    # lib.exe separately in the build backend.
+    lib_exe = getattr(compiler, "lib", None)
+    if not lib_exe:
+        raise RuntimeError("MSVC compiler did not expose lib.exe after initialization.")
+
+    _AOTI_SHIM_LIB_FILE.parent.mkdir(exist_ok=True)
+    compiler.spawn(
+        [
+            lib_exe,
+            f"/DEF:{_AOTI_SHIM_DEF_FILE}",
+            f"/OUT:{_AOTI_SHIM_LIB_FILE}",
+            "/MACHINE:X64",
+        ]
+    )
+    return str(_AOTI_SHIM_LIB_FILE)
 
 
 class build_ext(_build_ext):  # noqa: N801
+    def _configure_windows_tensor_bridge(self):
+        if os.name != "nt" or getattr(self.compiler, "compiler_type", None) != "msvc":
+            return
+
+        # _tensor_bridge imports AOTI symbols from torch_cpu.dll, which on
+        # Windows requires a stub import library for the MSVC linker.
+        for ext in self.extensions:
+            if ext.name != _TENSOR_BRIDGE_EXT_NAME:
+                continue
+
+            _ensure_compiler_initialized(self.compiler, self.plat_name)
+            shim_lib = _build_aoti_shim_lib(self.compiler)
+            link_args = list(ext.extra_link_args or [])
+            if shim_lib not in link_args:
+                ext.extra_link_args = [*link_args, shim_lib]
+            return
+
+        raise RuntimeError(f"Failed to find extension {_TENSOR_BRIDGE_EXT_NAME!r} for Windows build.")
+
     def build_extensions(self):
         self.parallel = nthreads
+        self._configure_windows_tensor_bridge()
         super().build_extensions()
 
 
diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py
@@ -82,11 +82,16 @@ def fill_kernel(init_cuda):
     return mod.get_kernel("fill")
 
 
-def _aligned_half(sm):
-    """Compute half the SM count, rounded down to min_partition_size alignment."""
+def _safe_two_group_count(sm):
+    """Return a safe per-group SM count for a 2-group split.
+
+    Uses min_partition_size, which is always a valid split size regardless
+    of hardware topology. Returns None if the device lacks enough SMs for two groups.
+    """
     min_size = sm.min_partition_size
-    half = (sm.sm_count // 2 // min_size) * min_size
-    return half
+    if sm.sm_count < 2 * min_size:
+        return None
+    return min_size
 
 
 @contextlib.contextmanager
@@ -238,30 +243,33 @@ def test_discovery_respects_alignment(self, sm_resource):
             assert groups[0].sm_count % sm_resource.coscheduled_alignment == 0
 
     def test_two_groups(self, sm_resource):
-        """Two-group split with explicit aligned counts."""
-        half = _aligned_half(sm_resource)
-        if half < sm_resource.min_partition_size:
+        """Two-group split with min_partition_size (always topology-safe)."""
+        count = _safe_two_group_count(sm_resource)
+        if count is None:
             pytest.skip("Not enough SMs for a 2-group split")
 
-        groups, rem = sm_resource.split(SMResourceOptions(count=(half, half)))
+        groups, rem = sm_resource.split(SMResourceOptions(count=(count, count)))
 
         assert len(groups) == 2
-        assert groups[0].sm_count > 0
-        assert groups[1].sm_count > 0
+        assert groups[0].sm_count >= count
+        assert groups[1].sm_count >= count
         total = groups[0].sm_count + groups[1].sm_count + rem.sm_count
         assert total <= sm_resource.sm_count
 
-    def test_two_groups_each_meets_request(self, sm_resource):
-        min_size = sm_resource.min_partition_size
-        half = _aligned_half(sm_resource)
-        if half < min_size:
-            pytest.skip("Not enough SMs for a 2-group split")
+    def test_two_groups_backfill(self, sm_resource):
+        """Two-group split with backfill allows larger partitions."""
+        align = sm_resource.coscheduled_alignment
+        if align == 0:
+            align = sm_resource.min_partition_size
+        half = (sm_resource.sm_count // 2 // align) * align
+        if half < sm_resource.min_partition_size:
+            pytest.skip("Not enough SMs for a 2-group backfill split")
 
-        groups, _ = sm_resource.split(SMResourceOptions(count=(min_size, min_size)))
+        groups, rem = sm_resource.split(SMResourceOptions(count=(half, half), backfill=True))
 
         assert len(groups) == 2
-        assert groups[0].sm_count >= min_size
-        assert groups[1].sm_count >= min_size
+        assert groups[0].sm_count >= half
+        assert groups[1].sm_count >= half
 
     def test_dry_run_matches_real(self, sm_resource):
         """Dry-run reports the same SM counts as a real split."""
@@ -352,11 +360,11 @@ def test_green_ctx_sm_resources(self, green_ctx, sm_resource):
 
     def test_green_ctx_resources_reflect_partition(self, init_cuda, sm_resource):
         """Two green contexts should have disjoint SM partitions."""
-        half = _aligned_half(sm_resource)
-        if half < sm_resource.min_partition_size:
+        count = _safe_two_group_count(sm_resource)
+        if count is None:
             pytest.skip("Not enough SMs for a 2-group split")
 
-        groups, _ = sm_resource.split(SMResourceOptions(count=(half, half)))
+        groups, _ = sm_resource.split(SMResourceOptions(count=(count, count)))
 
         ctx_a = ctx_b = None
         try:
@@ -425,11 +433,11 @@ def test_launch_and_verify(self, init_cuda, green_ctx, fill_kernel):
     def test_two_green_contexts_independent(self, init_cuda, sm_resource, fill_kernel):
         """Two SM groups -> two green contexts -> two independent kernels."""
         dev = init_cuda
-        half = _aligned_half(sm_resource)
-        if half < sm_resource.min_partition_size:
+        count = _safe_two_group_count(sm_resource)
+        if count is None:
             pytest.skip("Not enough SMs for a 2-group split")
 
-        groups, _ = sm_resource.split(SMResourceOptions(count=(half, half)))
+        groups, _ = sm_resource.split(SMResourceOptions(count=(count, count)))
         assert len(groups) == 2
 
         ctx_a = ctx_b = None

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`; At runtime the symbols resolve from torch_cpu.dll (loaded by 'import torch').`
`5`	`5`	`;`
`6`	`6`	`; IMPORTANT: Keep this export list in sync with the AOTI_SHIM_API declarations`
`7`		`-; in aoti_shim.h. build_hooks.py turns this file into the stub import library`
	`7`	`+; in aoti_shim.h. setup.py turns this file into the stub import library`
`8`	`8`	`; that MSVC uses to link _tensor_bridge, so any added/removed/renamed AOTI`
`9`	`9`	`; symbol must be updated in both files.`
`10`	`10`	`LIBRARY torch_cpu.dll`