Skip to content

Commit f550183

Browse files
committed
Merge remote-tracking branch 'origin/main' into graph-step-3
2 parents 3c6c387 + b280b9d commit f550183

12 files changed

Lines changed: 826 additions & 12 deletions

File tree

cuda_core/cuda/core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def _import_versioned_module():
2828
del _import_versioned_module
2929

3030

31-
from cuda.core import system, utils
31+
from cuda.core import checkpoint, system, utils
3232
from cuda.core._device import Device
3333
from cuda.core._event import Event, EventOptions
3434
from cuda.core._graphics import GraphicsResource

cuda_core/cuda/core/checkpoint.py

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import ctypes as _ctypes
6+
from collections.abc import Mapping as _Mapping
7+
from typing import Any as _Any
8+
9+
from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return
10+
from cuda.core._utils.version import binding_version as _binding_version
11+
from cuda.core._utils.version import driver_version as _driver_version
12+
from cuda.core.typing import ProcessStateT as _ProcessStateT
13+
14+
try:
15+
from cuda.bindings import driver as _driver
16+
except ImportError:
17+
from cuda import cuda as _driver
18+
19+
20+
# (CUprocessState enumerator name, public state string) pairs. The enumerators
# are resolved on the driver module at call time by _get_process_state_names().
_PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateT], ...] = (
    ("CU_PROCESS_STATE_RUNNING", "running"),
    ("CU_PROCESS_STATE_LOCKED", "locked"),
    ("CU_PROCESS_STATE_CHECKPOINTED", "checkpointed"),
    ("CU_PROCESS_STATE_FAILED", "failed"),
)

# Attributes the driver bindings module must expose before any checkpoint call
# is attempted; _get_driver() reports the missing ones in this order.
_REQUIRED_BINDING_ATTRS = (
    # Driver entry points.
    "cuCheckpointProcessCheckpoint",
    "cuCheckpointProcessGetRestoreThreadId",
    "cuCheckpointProcessGetState",
    "cuCheckpointProcessLock",
    "cuCheckpointProcessRestore",
    "cuCheckpointProcessUnlock",
    # Argument structs and enums.
    "CUcheckpointGpuPair",
    "CUcheckpointLockArgs",
    "CUprocessState",
    "CUcheckpointRestoreArgs",
)

# Lower bound that _get_driver() compares the installed driver version against.
_REQUIRED_DRIVER_VERSION = (12, 8, 0)

# Flipped to True by _get_driver() once every capability check has passed, so
# the checks run at most once per interpreter after a success.
_driver_capability_checked = False
41+
42+
43+
class Process:
    """
    Handle to a CUDA process driven through the checkpoint driver APIs.

    The handle stores nothing but the process ID; each property and method
    goes to the CUDA driver at call time.

    Parameters
    ----------
    pid : int
        Process ID of the CUDA process. Validated by ``_check_pid``.
    """

    __slots__ = ("_pid",)

    def __init__(self, pid: int):
        self._pid = _check_pid(pid)

    @property
    def pid(self) -> int:
        """
        Process ID this handle was created with.
        """
        return self._pid

    @property
    def state(self) -> _ProcessStateT:
        """
        Current CUDA checkpoint state of the process, as a string.

        Raises
        ------
        RuntimeError
            If the driver reports a state that has no known string name.
        """
        api = _get_driver()
        raw_state = _call_driver(api, api.cuCheckpointProcessGetState, self._pid)
        names_by_state = _get_process_state_names(api)
        try:
            state_name = names_by_state[raw_state]
        except KeyError as e:
            state_value = int(raw_state)
            raise RuntimeError(f"Unknown CUDA checkpoint process state: {state_value}") from e
        else:
            return state_name

    @property
    def restore_thread_id(self) -> int:
        """
        CUDA restore thread ID reported by the driver for this process.
        """
        api = _get_driver()
        return _call_driver(api, api.cuCheckpointProcessGetRestoreThreadId, self._pid)

    def lock(self, timeout_ms: int = 0) -> None:
        """
        Lock the process, blocking further CUDA API calls.

        Parameters
        ----------
        timeout_ms : int, optional
            Timeout in milliseconds. A value of 0 indicates no timeout.
        """
        api = _get_driver()
        lock_args = api.CUcheckpointLockArgs()
        lock_args.timeoutMs = _check_timeout_ms(timeout_ms)
        _call_driver(api, api.cuCheckpointProcessLock, self._pid, lock_args)

    def checkpoint(self) -> None:
        """
        Checkpoint the GPU memory contents of the locked process.
        """
        api = _get_driver()
        _call_driver(api, api.cuCheckpointProcessCheckpoint, self._pid, None)

    def restore(self, gpu_mapping: _Mapping[_Any, _Any] | None = None) -> None:
        """
        Restore the checkpointed process.

        Parameters
        ----------
        gpu_mapping : mapping, optional
            Maps each checkpointed GPU UUID to the GPU UUID to restore onto.
            For migration workflows, provide mappings for every GPU visible
            to the kernel-mode driver. User-space masking such as
            ``CUDA_VISIBLE_DEVICES`` does not reduce this mapping
            requirement.
        """
        api = _get_driver()
        restore_args = _make_restore_args(api, gpu_mapping)
        _call_driver(api, api.cuCheckpointProcessRestore, self._pid, restore_args)

    def unlock(self) -> None:
        """
        Unlock the locked process so it can resume CUDA API calls.
        """
        api = _get_driver()
        _call_driver(api, api.cuCheckpointProcessUnlock, self._pid, None)
131+
132+
133+
def _get_driver():
    """Return the driver bindings module after verifying checkpoint support.

    The first successful call checks, in order, the ``cuda.bindings`` version,
    the presence of every name in ``_REQUIRED_BINDING_ATTRS``, and the
    installed driver version. Success is remembered in the module-level
    ``_driver_capability_checked`` flag so later calls return immediately;
    a failed check is not cached and is repeated on the next call.

    Raises
    ------
    RuntimeError
        If the bindings or the installed driver lack checkpoint support.
    """
    global _driver_capability_checked

    if not _driver_capability_checked:
        bindings_version = _binding_version()
        if not _binding_version_supports_checkpoint(bindings_version):
            version_text = ".".join(str(part) for part in bindings_version[:3])
            raise RuntimeError(
                "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. "
                f"Found cuda.bindings {version_text}."
            )

        absent = [attr for attr in _REQUIRED_BINDING_ATTRS if not hasattr(_driver, attr)]
        if absent:
            absent_text = ", ".join(absent)
            raise RuntimeError(
                f"CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Missing: {absent_text}"
            )

        installed_driver = _driver_version()
        if installed_driver < _REQUIRED_DRIVER_VERSION:
            raise RuntimeError(
                "CUDA checkpointing is not supported by the installed NVIDIA driver. "
                "Upgrade to a driver version with CUDA checkpoint API support."
            )

        _driver_capability_checked = True

    return _driver
160+
161+
162+
def _binding_version_supports_checkpoint(version) -> bool:
163+
major, minor, patch = version[:3]
164+
return (major == 12 and (minor, patch) >= (8, 0)) or (major == 13 and (minor, patch) >= (0, 2)) or major > 13
165+
166+
167+
def _get_process_state_names(driver) -> dict[_Any, _ProcessStateT]:
    """Build the lookup from ``driver.CUprocessState`` members to state strings.

    The table is rebuilt on every call from ``_PROCESS_STATE_NAME_ATTRS``.
    """
    process_state_enum_owner = driver
    names_by_state = {}
    for enumerator_name, public_name in _PROCESS_STATE_NAME_ATTRS:
        # CUprocessState is re-read from the driver for each entry, as the
        # enumerator is resolved by attribute name.
        member = getattr(process_state_enum_owner.CUprocessState, enumerator_name)
        names_by_state[member] = public_name
    return names_by_state
169+
170+
171+
def _call_driver(driver, func, *args):
172+
try:
173+
result = func(*args)
174+
except RuntimeError as e:
175+
if "cuCheckpointProcess" in str(e) and "not found" in str(e):
176+
raise RuntimeError(
177+
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
178+
"Upgrade to a driver version with CUDA checkpoint API support."
179+
) from e
180+
raise
181+
182+
err = result[0]
183+
not_supported_errors = (
184+
getattr(driver.CUresult, "CUDA_ERROR_NOT_FOUND", None),
185+
getattr(driver.CUresult, "CUDA_ERROR_NOT_SUPPORTED", None),
186+
)
187+
if err in not_supported_errors:
188+
raise RuntimeError(
189+
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
190+
"Upgrade to a driver version with CUDA checkpoint API support."
191+
)
192+
193+
return _handle_cuda_return(result)
194+
195+
196+
def _check_pid(pid: int) -> int:
197+
if isinstance(pid, bool) or not isinstance(pid, int):
198+
raise TypeError("pid must be an int")
199+
if pid <= 0:
200+
raise ValueError("pid must be a positive int")
201+
return pid
202+
203+
204+
def _check_timeout_ms(timeout_ms: int) -> int:
205+
if isinstance(timeout_ms, bool) or not isinstance(timeout_ms, int):
206+
raise TypeError("timeout_ms must be an int")
207+
if timeout_ms < 0:
208+
raise ValueError("timeout_ms must be >= 0")
209+
return timeout_ms
210+
211+
212+
def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None):
213+
if gpu_mapping is None:
214+
return None
215+
if not isinstance(gpu_mapping, _Mapping):
216+
raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID")
217+
218+
pairs = []
219+
for old_uuid, new_uuid in gpu_mapping.items():
220+
pair = driver.CUcheckpointGpuPair()
221+
buffers = []
222+
pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers)
223+
pair.newUuid = _as_cuuuid(driver, new_uuid, buffers)
224+
pairs.append(pair)
225+
226+
if not pairs:
227+
return None
228+
229+
args = driver.CUcheckpointRestoreArgs()
230+
args.gpuPairs = pairs
231+
args.gpuPairsCount = len(pairs)
232+
return args
233+
234+
235+
def _as_cuuuid(driver, value, buffers):
236+
"""Convert *value* to a ``CUuuid``.
237+
238+
Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in
239+
the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by
240+
:attr:`Device.uuid`.
241+
"""
242+
if isinstance(value, str):
243+
raw = bytes.fromhex(value.replace("-", ""))
244+
if len(raw) != 16:
245+
raise ValueError(f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}")
246+
buf = _ctypes.create_string_buffer(raw, 16)
247+
buffers.append(buf)
248+
return driver.CUuuid(_ctypes.addressof(buf))
249+
return value
250+
251+
252+
__all__ = [
253+
"Process",
254+
]

cuda_core/cuda/core/graph/_subclasses.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ cdef class HostCallbackNode(GraphNode):
573573
574574
Properties
575575
----------
576-
callback_fn : callable or None
576+
callback : callable or None
577577
The Python callable (None for ctypes function pointer callbacks).
578578
"""
579579

@@ -613,7 +613,7 @@ cdef class HostCallbackNode(GraphNode):
613613
f" cfunc=0x{<uintptr_t>self._fn:x}>")
614614

615615
@property
616-
def callback_fn(self):
616+
def callback(self):
617617
"""The Python callable, or None for ctypes function pointer callbacks."""
618618
return self._callable
619619

cuda_core/cuda/core/typing.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@
44

55
"""Public type aliases and protocols used in cuda.core API signatures."""
66

7+
from typing import Literal as _Literal
8+
79
from cuda.core._memory._buffer import DevicePointerT
810
from cuda.core._stream import IsStreamT
911

12+
ProcessStateT = _Literal["running", "locked", "checkpointed", "failed"]
13+
1014
__all__ = [
1115
"DevicePointerT",
1216
"IsStreamT",
17+
"ProcessStateT",
1318
]

cuda_core/docs/source/api.rst

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,68 @@ CUDA compilation toolchain
174174
LinkerOptions
175175

176176

177+
CUDA process checkpointing
178+
--------------------------
179+
180+
The :mod:`cuda.core.checkpoint` module wraps the CUDA driver process
181+
checkpoint APIs. These APIs are intended for Linux process checkpoint and
182+
restore workflows, and require a CUDA driver with checkpoint API support and
183+
a ``cuda-bindings`` version that exposes those driver entry points.
184+
185+
Checkpointing is typically driven by a coordinator process acting on a target
186+
CUDA process, similar to attaching a debugger or sending a signal. The target
187+
process is identified by process ID. Linux and the CUDA driver enforce process
188+
permissions; checkpointing another user's process may require elevated
189+
permissions such as ``CAP_SYS_PTRACE`` or administrator privileges.
190+
191+
The CUDA checkpoint APIs prepare CUDA-managed GPU state for process-level
192+
checkpoint and restore. They do not capture the CPU process image by
193+
themselves; full process checkpoint workflows still need a CPU-side process
194+
checkpointing tool such as CRIU. A minimal coordinator-side sequence looks like
195+
this:
196+
197+
.. code-block:: python
198+
199+
import os
200+
201+
from cuda.core import checkpoint
202+
203+
target_pid = os.getpid() # or the PID of another CUDA process
204+
process = checkpoint.Process(target_pid)
205+
process.lock(timeout_ms=5000)
206+
process.checkpoint()
207+
208+
# Capture or restore the CPU process image outside cuda.core.
209+
210+
process.restore()
211+
process.unlock()
212+
213+
``Process.state`` returns one of ``"running"``, ``"locked"``,
214+
``"checkpointed"``, or ``"failed"``. Restore may optionally remap GPUs by
215+
passing ``gpu_mapping`` from each checkpointed GPU UUID to the GPU UUID that
216+
should be used during restore. For migration workflows, provide mappings for
217+
every GPU visible to the NVIDIA kernel-mode driver at checkpoint time.
218+
User-space masking such as ``CUDA_VISIBLE_DEVICES`` does not reduce this
219+
mapping requirement, so applications that rely on user-space GPU masking may
220+
not be valid migration targets. The mapping may use ``CUuuid`` objects or the
221+
UUID strings returned by :attr:`Device.uuid`. A successful restore returns the
222+
process to the locked state; call ``Process.unlock`` after restore to allow
223+
CUDA API calls to resume.
224+
225+
The CUDA driver requires restore to run from the process restore thread.
226+
Use ``Process.restore_thread_id`` to discover that thread before calling
227+
``Process.restore`` from a checkpoint coordinator. Restore also requires
228+
persistence mode to be enabled or ``cuInit`` to have been called before
229+
execution.
230+
231+
.. autosummary::
232+
:toctree: generated/
233+
234+
:template: class.rst
235+
236+
checkpoint.Process
237+
238+
177239
CUDA system information and NVIDIA Management Library (NVML)
178240
------------------------------------------------------------
179241

cuda_core/docs/source/api_private.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ CUDA runtime
1717
:toctree: generated/
1818

1919
typing.DevicePointerT
20+
typing.ProcessStateT
2021
_memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
2122
_memory._virtual_memory_resource.VirtualMemoryLocationTypeT
2223
_memory._virtual_memory_resource.VirtualMemoryGranularityT

cuda_core/docs/source/release/1.0.0-notes.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ Highlights
1616
New features
1717
------------
1818

19-
- TBD
19+
- Added the :mod:`cuda.core.checkpoint` module for CUDA process checkpointing,
20+
including string process state queries, lock/checkpoint/restore/unlock
21+
operations, and GPU UUID remapping support for restore.
22+
(`#1343 <https://github.com/NVIDIA/cuda-python/issues/1343>`__)
2023

2124

2225
Breaking changes
@@ -87,6 +90,10 @@ Breaking changes
8790
- New: ``kernel.attributes.num_regs`` and
8891
``kernel.attributes[some_dev].num_regs``
8992

93+
- Renamed :attr:`graph.HostCallbackNode.callback_fn` to
94+
:attr:`graph.HostCallbackNode.callback` to drop the redundant ``_fn`` suffix
95+
(`#1945 <https://github.com/NVIDIA/cuda-python/issues/1945>`__).
96+
9097
- Unified the conditional graph API on :class:`~graph.GraphCondition`
9198
and consistent verbs
9299
(`#1945 <https://github.com/NVIDIA/cuda-python/issues/1945>`__):

0 commit comments

Comments
 (0)