From cf5da9c31d188c78bc16ba4d32e01f52e2fa3f27 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 14 Apr 2026 10:20:44 -0700 Subject: [PATCH 1/2] Fix Windows already-loaded detection for newer CUPTI builds Fall back to enumerating loaded modules when basename-based GetModuleHandleW lookups miss an already loaded DLL so pathfinder can recognize newer Windows CUPTI loads consistently and keep the regression covered. Made-with: Cursor --- .../_dynamic_libs/load_dl_windows.py | 58 +++++++++++++++++++ cuda_pathfinder/tests/test_load_dl_windows.py | 56 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 cuda_pathfinder/tests/test_load_dl_windows.py diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index a296813aa29..b16ef8e6d7a 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -7,6 +7,7 @@ import ctypes.wintypes import os import struct +from collections.abc import Iterator from typing import TYPE_CHECKING from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL @@ -22,11 +23,16 @@ # Set up kernel32 functions with proper types kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined] +psapi = ctypes.windll.psapi # type: ignore[attr-defined] # GetModuleHandleW kernel32.GetModuleHandleW.argtypes = [ctypes.wintypes.LPCWSTR] kernel32.GetModuleHandleW.restype = ctypes.wintypes.HMODULE +# GetCurrentProcess +kernel32.GetCurrentProcess.argtypes = [] +kernel32.GetCurrentProcess.restype = ctypes.wintypes.HANDLE + # LoadLibraryExW kernel32.LoadLibraryExW.argtypes = [ ctypes.wintypes.LPCWSTR, # lpLibFileName @@ -47,6 +53,15 @@ kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR] kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE +# EnumProcessModules +psapi.EnumProcessModules.argtypes = [ + ctypes.wintypes.HANDLE, + ctypes.POINTER(ctypes.wintypes.HMODULE), + ctypes.wintypes.DWORD, + ctypes.POINTER(ctypes.wintypes.DWORD), +] +psapi.EnumProcessModules.restype = ctypes.wintypes.BOOL + def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int: """Convert ctypes HMODULE to unsigned int.""" @@ -101,6 +116,41 @@ def abs_path_for_dynamic_library(libname: str, handle: ctypes.wintypes.HMODULE) return buffer.value +def _iter_loaded_module_handles() -> Iterator[ctypes.wintypes.HMODULE]: + process_handle = kernel32.GetCurrentProcess() + capacity = 64 + module_size = ctypes.sizeof(ctypes.wintypes.HMODULE) + while True: + module_handles = (ctypes.wintypes.HMODULE * capacity)() + needed = ctypes.wintypes.DWORD() + ok = psapi.EnumProcessModules( + process_handle, + module_handles, + ctypes.sizeof(module_handles), + ctypes.byref(needed), + ) + if not ok: + error_code = ctypes.GetLastError() # type: ignore[attr-defined] + raise RuntimeError(f"EnumProcessModules failed (error code: {error_code})") + count = needed.value // module_size + if count <= capacity: + for raw_handle in module_handles[:count]: + if raw_handle is None: + continue + yield ctypes.wintypes.HMODULE(int(raw_handle)) + return + capacity = count + + +def _find_loaded_module(dll_names: tuple[str, ...]) -> tuple[ctypes.wintypes.HMODULE, str] | None: + wanted = {dll_name.casefold() for dll_name in dll_names} + for handle in _iter_loaded_module_handles(): + abs_path = abs_path_for_dynamic_library("loaded module", handle) + if os.path.basename(abs_path).casefold() in wanted: + return handle, abs_path + return None + + def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: bool) -> LoadedDL | None: for dll_name in desc.windows_dlls: handle = kernel32.GetModuleHandleW(dll_name) @@ -112,6 +162,14 @@ def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: b # activate it even if the library was already loaded from elsewhere. add_dll_directory(abs_path) return LoadedDL(abs_path, True, ctypes_handle_to_unsigned_int(handle), "was-already-loaded-from-elsewhere") + # Observed on newer Windows CUPTI builds: GetModuleHandleW(basename) + # can miss an already loaded DLL, so fall back to enumerating loaded modules. + loaded = _find_loaded_module(desc.windows_dlls) + if loaded is not None: + handle, abs_path = loaded + if have_abs_path and desc.requires_add_dll_directory: + add_dll_directory(abs_path) + return LoadedDL(abs_path, True, ctypes_handle_to_unsigned_int(handle), "was-already-loaded-from-elsewhere") return None diff --git a/cuda_pathfinder/tests/test_load_dl_windows.py b/cuda_pathfinder/tests/test_load_dl_windows.py new file mode 100644 index 00000000000..e51fe1f1cdc --- /dev/null +++ b/cuda_pathfinder/tests/test_load_dl_windows.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import sys + +import pytest + +if sys.platform != "win32": + pytest.skip("Windows-only tests", allow_module_level=True) + +from cuda.pathfinder._dynamic_libs import load_dl_windows +from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS + + +def test_check_if_already_loaded_falls_back_to_enumerated_modules(tmp_path, mocker): + desc = LIB_DESCRIPTORS["cupti"] + expected_path = tmp_path / desc.windows_dlls[0] + handles = (0x111, 0x222) + + mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0) + mocker.patch.object(load_dl_windows, "_iter_loaded_module_handles", return_value=iter(handles)) + mocker.patch.object( + load_dl_windows, + "abs_path_for_dynamic_library", + side_effect=( + r"C:\Windows\System32\kernel32.dll", + str(expected_path), + ), + ) + add_dll_directory = mocker.patch.object(load_dl_windows, "add_dll_directory") + + result = load_dl_windows.check_if_already_loaded_from_elsewhere(desc, have_abs_path=False) + + assert result is not None + assert result.abs_path == str(expected_path) + assert result.was_already_loaded_from_elsewhere is True + assert result.found_via == "was-already-loaded-from-elsewhere" + assert result._handle_uint == handles[1] + add_dll_directory.assert_not_called() + + +def test_check_if_already_loaded_fallback_preserves_add_dll_directory_side_effect(tmp_path, mocker): + desc = LIB_DESCRIPTORS["nvrtc"] + expected_path = tmp_path / desc.windows_dlls[0] + + mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0) + mocker.patch.object(load_dl_windows, "_iter_loaded_module_handles", return_value=iter((0x333,))) + mocker.patch.object(load_dl_windows, "abs_path_for_dynamic_library", return_value=str(expected_path)) + add_dll_directory = mocker.patch.object(load_dl_windows, "add_dll_directory") + + result = load_dl_windows.check_if_already_loaded_from_elsewhere(desc, have_abs_path=True) + + assert result is not None + assert result.abs_path == str(expected_path) + assert result.was_already_loaded_from_elsewhere is True + add_dll_directory.assert_called_once_with(str(expected_path)) From 281e39b8d4d3e2ad03107bc4b9a6a458127acce5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 14 Apr 2026 11:07:55 -0700 Subject: [PATCH 2/2] Add diagnostics for Windows CUPTI reload detection Capture the first load result, basename probe results, and relevant enumerated modules so we can determine why cupti reload detection still fails on real Windows 13.2.1 systems. Made-with: Cursor --- .../_dynamic_libs/dynamic_lib_subprocess.py | 27 ++++++++++ .../_dynamic_libs/load_dl_windows.py | 54 +++++++++++++++++-- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py index ba5d05242f2..8e7721d8f88 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py @@ -27,6 +27,8 @@ # Any production-code impact is negligible since the extra logic only runs # in the subprocess entrypoint and only in test mode. +_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS" + def _probe_canary_abs_path(libname: str) -> str | None: desc = LIB_DESCRIPTORS.get(libname) @@ -48,6 +50,26 @@ def _validate_abs_path(abs_path: str) -> None: assert os.path.isfile(abs_path), f"not a file: {abs_path=!r}" +def _cupti_diagnostics_enabled(libname: str) -> bool: + raw = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR) + if libname != "cupti" or raw is None: + return False + return raw.strip().lower() not in ("", "0", "false", "no") + + +def _emit_cupti_diagnostic(message: str) -> None: + print(f"[cuda.pathfinder][cupti-diag] {message}", file=sys.stderr) + + +def _emit_loaded_dl_diagnostic(label: str, loaded_dl: LoadedDL) -> None: + _emit_cupti_diagnostic( + f"{label}: abs_path={loaded_dl.abs_path!r}" + f" found_via={loaded_dl.found_via!r}" + f" was_already_loaded_from_elsewhere={loaded_dl.was_already_loaded_from_elsewhere}" + f" handle=0x{loaded_dl._handle_uint:x}" + ) + + def _load_nvidia_dynamic_lib_for_test(libname: str) -> str: """Test-only loader used by the subprocess entrypoint.""" # Keep imports inside the subprocess body so startup stays focused on the @@ -60,7 +82,10 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str: ) from cuda.pathfinder._utils.platform_aware import IS_WINDOWS + diagnostics_enabled = _cupti_diagnostics_enabled(libname) loaded_dl_fresh = load_nvidia_dynamic_lib(libname) + if diagnostics_enabled: + _emit_loaded_dl_diagnostic("fresh load", loaded_dl_fresh) if loaded_dl_fresh.was_already_loaded_from_elsewhere: raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere") @@ -75,6 +100,8 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str: raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh") loaded_dl_no_cache = _load_lib_no_cache(libname) + if diagnostics_enabled: + _emit_loaded_dl_diagnostic("second uncached load", loaded_dl_no_cache) supported_libs = SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES if not loaded_dl_no_cache.was_already_loaded_from_elsewhere and libname in supported_libs: raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere") diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index b16ef8e6d7a..44e0da32f13 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -7,6 +7,7 @@ import ctypes.wintypes import os import struct +import sys from collections.abc import Iterator from typing import TYPE_CHECKING @@ -62,6 +63,19 @@ ] psapi.EnumProcessModules.restype = ctypes.wintypes.BOOL +_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS" + + +def _cupti_diagnostics_enabled(desc_name: str) -> bool: + raw = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR) + if desc_name != "cupti" or raw is None: + return False + return raw.strip().lower() not in ("", "0", "false", "no") + + +def _emit_cupti_diagnostic(message: str) -> None: + sys.stderr.write(f"[cuda.pathfinder][cupti-diag] {message}\n") + def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int: """Convert ctypes HMODULE to unsigned int.""" @@ -142,20 +156,52 @@ def _iter_loaded_module_handles() -> Iterator[ctypes.wintypes.HMODULE]: capacity = count -def _find_loaded_module(dll_names: tuple[str, ...]) -> tuple[ctypes.wintypes.HMODULE, str] | None: +def _find_loaded_module( + dll_names: tuple[str, ...], + *, + diagnostics_enabled: bool = False, +) -> tuple[ctypes.wintypes.HMODULE, str] | None: wanted = {dll_name.casefold() for dll_name in dll_names} + relevant_modules: list[str] = [] for handle in _iter_loaded_module_handles(): abs_path = abs_path_for_dynamic_library("loaded module", handle) - if os.path.basename(abs_path).casefold() in wanted: + basename = os.path.basename(abs_path) + basename_casefold = basename.casefold() + if diagnostics_enabled and ("cupti" in basename_casefold or "nvperf" in basename_casefold): + relevant_modules.append(f"0x{ctypes_handle_to_unsigned_int(handle):x}:{abs_path}") + if basename_casefold in wanted: + if diagnostics_enabled: + _emit_cupti_diagnostic( + "enumerated relevant modules: " + (" | ".join(relevant_modules) if relevant_modules else "") + ) + _emit_cupti_diagnostic( + f"enumeration match: basename={basename!r} abs_path={abs_path!r}" + f" handle=0x{ctypes_handle_to_unsigned_int(handle):x}" + ) return handle, abs_path + if diagnostics_enabled: + _emit_cupti_diagnostic( + "enumerated relevant modules: " + (" | ".join(relevant_modules) if relevant_modules else "") + ) return None def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: bool) -> LoadedDL | None: + diagnostics_enabled = _cupti_diagnostics_enabled(desc.name) + basename_probe_results: list[str] = [] for dll_name in desc.windows_dlls: handle = kernel32.GetModuleHandleW(dll_name) + if diagnostics_enabled: + handle_text = "0x0" if not handle else f"0x{ctypes_handle_to_unsigned_int(handle):x}" + basename_probe_results.append(f"{dll_name}={handle_text}") if handle: abs_path = abs_path_for_dynamic_library(desc.name, handle) + if diagnostics_enabled: + _emit_cupti_diagnostic("basename GetModuleHandleW results: " + ", ".join(basename_probe_results)) + _emit_cupti_diagnostic( + f"basename match: dll_name={dll_name!r} abs_path={abs_path!r}" + f" handle=0x{ctypes_handle_to_unsigned_int(handle):x}" + ) if have_abs_path and desc.requires_add_dll_directory: # This is a side-effect if the pathfinder loads the library via # load_with_abs_path(). To make the side-effect more deterministic, @@ -164,7 +210,9 @@ def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: b return LoadedDL(abs_path, True, ctypes_handle_to_unsigned_int(handle), "was-already-loaded-from-elsewhere") # Observed on newer Windows CUPTI builds: GetModuleHandleW(basename) # can miss an already loaded DLL, so fall back to enumerating loaded modules. - loaded = _find_loaded_module(desc.windows_dlls) + if diagnostics_enabled: + _emit_cupti_diagnostic("basename GetModuleHandleW results: " + ", ".join(basename_probe_results)) + loaded = _find_loaded_module(desc.windows_dlls, diagnostics_enabled=diagnostics_enabled) if loaded is not None: handle, abs_path = loaded if have_abs_path and desc.requires_add_dll_directory: