diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py
index ba5d05242f2..8e7721d8f88 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/dynamic_lib_subprocess.py
@@ -27,6 +27,8 @@
 # Any production-code impact is negligible since the extra logic only runs
 # in the subprocess entrypoint and only in test mode.
 
+_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS"
+
 
 def _probe_canary_abs_path(libname: str) -> str | None:
     desc = LIB_DESCRIPTORS.get(libname)
@@ -48,6 +50,33 @@ def _validate_abs_path(abs_path: str) -> None:
     assert os.path.isfile(abs_path), f"not a file: {abs_path=!r}"
 
 
+def _cupti_diagnostics_enabled(libname: str) -> bool:
+    """Tell whether CUPTI already-loaded diagnostics should be emitted for ``libname``.
+
+    True only for ``"cupti"`` when the environment variable is set and its stripped,
+    lower-cased value is not one of ``""``, ``"0"``, ``"false"``, ``"no"``.
+    """
+    raw = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR)
+    if libname != "cupti" or raw is None:
+        return False
+    return raw.strip().lower() not in ("", "0", "false", "no")
+
+
+def _emit_cupti_diagnostic(message: str) -> None:
+    """Print one ``[cuda.pathfinder][cupti-diag]``-tagged line to stderr."""
+    print(f"[cuda.pathfinder][cupti-diag] {message}", file=sys.stderr)
+
+
+def _emit_loaded_dl_diagnostic(label: str, loaded_dl: LoadedDL) -> None:
+    """Emit the path, discovery method, already-loaded flag and handle of ``loaded_dl``."""
+    _emit_cupti_diagnostic(
+        f"{label}: abs_path={loaded_dl.abs_path!r}"
+        f" found_via={loaded_dl.found_via!r}"
+        f" was_already_loaded_from_elsewhere={loaded_dl.was_already_loaded_from_elsewhere}"
+        f" handle=0x{loaded_dl._handle_uint:x}"
+    )
+
+
 def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
     """Test-only loader used by the subprocess entrypoint."""
     # Keep imports inside the subprocess body so startup stays focused on the
@@ -60,7 +89,10 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
     )
     from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
 
+    diagnostics_enabled = _cupti_diagnostics_enabled(libname)
     loaded_dl_fresh = load_nvidia_dynamic_lib(libname)
+    if diagnostics_enabled:
+        _emit_loaded_dl_diagnostic("fresh load", loaded_dl_fresh)
     if loaded_dl_fresh.was_already_loaded_from_elsewhere:
         raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
 
@@ -75,6 +107,8 @@ def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
         raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")
 
     loaded_dl_no_cache = _load_lib_no_cache(libname)
+    if diagnostics_enabled:
+        _emit_loaded_dl_diagnostic("second uncached load", loaded_dl_no_cache)
     supported_libs = SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES
     if not loaded_dl_no_cache.was_already_loaded_from_elsewhere and libname in supported_libs:
         raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere")
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py
index a296813aa29..44e0da32f13 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py
@@ -7,6 +7,8 @@
 import ctypes.wintypes
 import os
 import struct
+import sys
+from collections.abc import Iterator
 from typing import TYPE_CHECKING
 
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
@@ -22,11 +24,16 @@
 
 # Set up kernel32 functions with proper types
 kernel32 = ctypes.windll.kernel32  # type: ignore[attr-defined]
+psapi = ctypes.windll.psapi  # type: ignore[attr-defined]
 
 # GetModuleHandleW
 kernel32.GetModuleHandleW.argtypes = [ctypes.wintypes.LPCWSTR]
 kernel32.GetModuleHandleW.restype = ctypes.wintypes.HMODULE
 
+# GetCurrentProcess
+kernel32.GetCurrentProcess.argtypes = []
+kernel32.GetCurrentProcess.restype = ctypes.wintypes.HANDLE
+
 # LoadLibraryExW
 kernel32.LoadLibraryExW.argtypes = [
     ctypes.wintypes.LPCWSTR,  # lpLibFileName
@@ -47,6 +54,34 @@
 kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR]
 kernel32.AddDllDirectory.restype = ctypes.c_void_p  # DLL_DIRECTORY_COOKIE
 
+# EnumProcessModules
+psapi.EnumProcessModules.argtypes = [
+    ctypes.wintypes.HANDLE,
+    ctypes.POINTER(ctypes.wintypes.HMODULE),
+    ctypes.wintypes.DWORD,
+    ctypes.POINTER(ctypes.wintypes.DWORD),
+]
+psapi.EnumProcessModules.restype = ctypes.wintypes.BOOL
+
+_CUPTI_DIAGNOSTICS_ENVVAR = "CUDA_PATHFINDER_WINDOWS_CUPTI_ALREADY_LOADED_DIAGNOSTICS"
+
+
+def _cupti_diagnostics_enabled(desc_name: str) -> bool:
+    """Tell whether CUPTI already-loaded diagnostics should be emitted for ``desc_name``.
+
+    True only for ``"cupti"`` when the environment variable is set and its stripped,
+    lower-cased value is not one of ``""``, ``"0"``, ``"false"``, ``"no"``.
+    """
+    raw = os.environ.get(_CUPTI_DIAGNOSTICS_ENVVAR)
+    if desc_name != "cupti" or raw is None:
+        return False
+    return raw.strip().lower() not in ("", "0", "false", "no")
+
+
+def _emit_cupti_diagnostic(message: str) -> None:
+    """Write one ``[cuda.pathfinder][cupti-diag]``-tagged line to stderr."""
+    sys.stderr.write(f"[cuda.pathfinder][cupti-diag] {message}\n")
+
 
 def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int:
     """Convert ctypes HMODULE to unsigned int."""
@@ -101,17 +136,136 @@ def abs_path_for_dynamic_library(libname: str, handle: ctypes.wintypes.HMODULE)
     return buffer.value
 
 
+def _iter_loaded_module_handles() -> Iterator[ctypes.wintypes.HMODULE]:
+    """Yield an HMODULE for every module loaded in the current process.
+
+    The handle buffer starts at 64 entries; if the module list does not fit,
+    the call is repeated with a buffer of the size EnumProcessModules asked for.
+
+    Raises:
+        RuntimeError: If EnumProcessModules reports failure. Because this is a
+            generator, the error surfaces on first iteration, not at call time.
+    """
+    process_handle = kernel32.GetCurrentProcess()
+    capacity = 64
+    module_size = ctypes.sizeof(ctypes.wintypes.HMODULE)
+    while True:
+        module_handles = (ctypes.wintypes.HMODULE * capacity)()
+        needed = ctypes.wintypes.DWORD()
+        ok = psapi.EnumProcessModules(
+            process_handle,
+            module_handles,
+            ctypes.sizeof(module_handles),
+            ctypes.byref(needed),
+        )
+        if not ok:
+            error_code = ctypes.GetLastError()  # type: ignore[attr-defined]
+            raise RuntimeError(f"EnumProcessModules failed (error code: {error_code})")
+        # EnumProcessModules reports the required size in bytes, not in handles.
+        count = needed.value // module_size
+        if count <= capacity:
+            for raw_handle in module_handles[:count]:
+                # ctypes hands back a NULL handle as None; there is nothing to report for it.
+                if raw_handle is None:
+                    continue
+                yield ctypes.wintypes.HMODULE(int(raw_handle))
+            return
+        # The buffer was too small: retry with room for everything reported.
+        capacity = count
+
+
+def _find_loaded_module(
+    dll_names: tuple[str, ...],
+    *,
+    diagnostics_enabled: bool = False,
+) -> tuple[ctypes.wintypes.HMODULE, str] | None:
+    """Search the modules loaded in this process for one of ``dll_names``.
+
+    Only the file name of each module is compared, case-insensitively.
+
+    Args:
+        dll_names: DLL file names to look for.
+        diagnostics_enabled: When true, emit the enumerated modules whose file
+            name contains "cupti" or "nvperf", and the match if there is one.
+
+    Returns:
+        ``(handle, abs_path)`` of the first matching module, else ``None``.
+
+    Raises:
+        RuntimeError: If enumerating the loaded modules fails.
+    """
+    wanted = {dll_name.casefold() for dll_name in dll_names}
+    relevant_modules: list[str] = []
+    found: tuple[ctypes.wintypes.HMODULE, str] | None = None
+    for handle in _iter_loaded_module_handles():
+        abs_path = abs_path_for_dynamic_library("loaded module", handle)
+        basename_casefold = os.path.basename(abs_path).casefold()
+        if diagnostics_enabled and ("cupti" in basename_casefold or "nvperf" in basename_casefold):
+            relevant_modules.append(f"0x{ctypes_handle_to_unsigned_int(handle):x}:{abs_path}")
+        if basename_casefold in wanted:
+            found = (handle, abs_path)
+            break
+    if diagnostics_enabled:
+        # Emitted once for both outcomes; an empty list is spelled out explicitly.
+        _emit_cupti_diagnostic("enumerated relevant modules: " + (" | ".join(relevant_modules) or "(none)"))
+        if found is not None:
+            handle, abs_path = found
+            _emit_cupti_diagnostic(
+                f"enumeration match: basename={os.path.basename(abs_path)!r} abs_path={abs_path!r}"
+                f" handle=0x{ctypes_handle_to_unsigned_int(handle):x}"
+            )
+    return found
+
+
 def check_if_already_loaded_from_elsewhere(desc: LibDescriptor, have_abs_path: bool) -> LoadedDL | None:
+    """Return a ``LoadedDL`` if one of ``desc.windows_dlls`` is already loaded, else ``None``.
+
+    ``GetModuleHandleW`` is tried for each DLL name first. If that finds nothing, the
+    modules loaded in the process are enumerated and matched by file name; a failure of
+    that enumeration is treated as "not already loaded".
+
+    When a module is found, ``have_abs_path`` is true and ``desc.requires_add_dll_directory``
+    is set, ``add_dll_directory`` is called with the module's path before returning.
+    """
+    diagnostics_enabled = _cupti_diagnostics_enabled(desc.name)
+    basename_probe_results: list[str] = []
     for dll_name in desc.windows_dlls:
         handle = kernel32.GetModuleHandleW(dll_name)
+        if diagnostics_enabled:
+            handle_text = "0x0" if not handle else f"0x{ctypes_handle_to_unsigned_int(handle):x}"
+            basename_probe_results.append(f"{dll_name}={handle_text}")
         if handle:
             abs_path = abs_path_for_dynamic_library(desc.name, handle)
+            if diagnostics_enabled:
+                _emit_cupti_diagnostic("basename GetModuleHandleW results: " + ", ".join(basename_probe_results))
+                _emit_cupti_diagnostic(
+                    f"basename match: dll_name={dll_name!r} abs_path={abs_path!r}"
+                    f" handle=0x{ctypes_handle_to_unsigned_int(handle):x}"
+                )
             if have_abs_path and desc.requires_add_dll_directory:
                 # This is a side-effect if the pathfinder loads the library via
                 # load_with_abs_path(). To make the side-effect more deterministic,
                 # activate it even if the library was already loaded from elsewhere.
                 add_dll_directory(abs_path)
             return LoadedDL(abs_path, True, ctypes_handle_to_unsigned_int(handle), "was-already-loaded-from-elsewhere")
+    # Observed on newer Windows CUPTI builds: GetModuleHandleW(basename)
+    # can miss an already loaded DLL, so fall back to enumerating loaded modules.
+    if diagnostics_enabled:
+        _emit_cupti_diagnostic("basename GetModuleHandleW results: " + ", ".join(basename_probe_results))
+    try:
+        loaded = _find_loaded_module(desc.windows_dlls, diagnostics_enabled=diagnostics_enabled)
+    except RuntimeError as exc:
+        # The enumeration is only a best-effort fallback (per the Win32 docs it can
+        # fail while modules are being loaded or unloaded), so a failure means
+        # "not found" here instead of aborting the caller's load attempt.
+        if diagnostics_enabled:
+            _emit_cupti_diagnostic(f"module enumeration failed: {exc}")
+        return None
+    if loaded is not None:
+        handle, abs_path = loaded
+        if have_abs_path and desc.requires_add_dll_directory:
+            add_dll_directory(abs_path)
+        return LoadedDL(abs_path, True, ctypes_handle_to_unsigned_int(handle), "was-already-loaded-from-elsewhere")
     return None
 
 
diff --git a/cuda_pathfinder/tests/test_load_dl_windows.py b/cuda_pathfinder/tests/test_load_dl_windows.py
new file mode 100644
index 00000000000..e51fe1f1cdc
--- /dev/null
+++ b/cuda_pathfinder/tests/test_load_dl_windows.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+import pytest
+
+if sys.platform != "win32":
+    pytest.skip("Windows-only tests", allow_module_level=True)
+
+from cuda.pathfinder._dynamic_libs import load_dl_windows
+from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS
+
+
+def test_check_if_already_loaded_falls_back_to_enumerated_modules(tmp_path, mocker):
+    desc = LIB_DESCRIPTORS["cupti"]
+    expected_path = tmp_path / desc.windows_dlls[0]
+    handles = (0x111, 0x222)
+    # GetModuleHandleW reports "not loaded"; the enumeration yields kernel32.dll first, then the CUPTI DLL.
+    mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0)
+    mocker.patch.object(load_dl_windows, "_iter_loaded_module_handles", return_value=iter(handles))
+    mocker.patch.object(
+        load_dl_windows,
+        "abs_path_for_dynamic_library",
+        side_effect=(
+            r"C:\Windows\System32\kernel32.dll",
+            str(expected_path),
+        ),
+    )
+    add_dll_directory = mocker.patch.object(load_dl_windows, "add_dll_directory")
+
+    result = load_dl_windows.check_if_already_loaded_from_elsewhere(desc, have_abs_path=False)
+
+    assert result is not None
+    assert result.abs_path == str(expected_path)
+    assert result.was_already_loaded_from_elsewhere is True
+    assert result.found_via == "was-already-loaded-from-elsewhere"
+    assert result._handle_uint == handles[1]
+    add_dll_directory.assert_not_called()  # have_abs_path=False, so the side effect is skipped
+
+
+def test_check_if_already_loaded_fallback_preserves_add_dll_directory_side_effect(tmp_path, mocker):
+    desc = LIB_DESCRIPTORS["nvrtc"]
+    expected_path = tmp_path / desc.windows_dlls[0]
+    # Both lookups are stubbed: GetModuleHandleW finds nothing and the single enumerated module is the nvrtc DLL.
+    mocker.patch.object(load_dl_windows.kernel32, "GetModuleHandleW", return_value=0)
+    mocker.patch.object(load_dl_windows, "_iter_loaded_module_handles", return_value=iter((0x333,)))
+    mocker.patch.object(load_dl_windows, "abs_path_for_dynamic_library", return_value=str(expected_path))
+    add_dll_directory = mocker.patch.object(load_dl_windows, "add_dll_directory")
+
+    result = load_dl_windows.check_if_already_loaded_from_elsewhere(desc, have_abs_path=True)
+    # NOTE(review): presumably nvrtc's descriptor sets requires_add_dll_directory; the last assertion depends on it.
+    assert result is not None
+    assert result.abs_path == str(expected_path)
+    assert result.was_already_loaded_from_elsewhere is True
+    add_dll_directory.assert_called_once_with(str(expected_path))