Skip to content

Commit 1337557

Browse files
committed
Merge branch 'main' into nvbugs5815123_xfail_if_mempool_oom
2 parents 79543e4 + 4459b30 commit 1337557

19 files changed

Lines changed: 679 additions & 983 deletions

benchmarks/cuda_bindings/README.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
66
The goal is to benchmark how much overhead the Python layer adds to calling
77
CUDA APIs and which operations fall outside our target of less than 1us of overhead.
88

9-
Each Python benchmark has a C++ counterpart, which is used to compare the
10-
operations. We try to make each implementation perform small operations
11-
and nearly the same work as possible and are run under similar conditions.
9+
Most Python benchmarks have a C++ counterpart that is used as a comparative
10+
baseline. We try to make each implementation perform small operations and
11+
as close to the same work as possible, and we run them under similar conditions.
12+
13+
A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
14+
because they measure costs with no direct C++ equivalent — such as enum
15+
construction and member access on `cuda.bindings` enum classes.
1216

1317
These are **not** throughput benchmarks to measure the overall performance
1418
of kernels and applications.

benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
_, DEVICE = cuda.cuDeviceGet(0)
1414
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
1515

16+
# Outer retain so the benchmarked retain/release pair just bumps the refcount.
17+
_err, _PRIMARY_CTX = cuda.cuDevicePrimaryCtxRetain(DEVICE)
18+
if _err != cuda.CUresult.CUDA_SUCCESS:
19+
raise RuntimeError(f"cuDevicePrimaryCtxRetain failed during setup: {_err}")
20+
1621

1722
def bench_ctx_get_current(loops: int) -> float:
1823
_fn = cuda.cuCtxGetCurrent
@@ -60,3 +65,15 @@ def bench_device_get_attribute(loops: int) -> float:
6065
for _ in range(loops):
6166
_fn(_attr, _dev)
6267
return time.perf_counter() - t0
68+
69+
70+
def bench_device_primary_ctx_retain(loops: int) -> float:
    """Time ``loops`` consecutive primary-context retain/release pairs.

    Each iteration calls ``cuDevicePrimaryCtxRetain`` followed by
    ``cuDevicePrimaryCtxRelease`` on the module-level ``DEVICE``. Return
    values of the driver calls are not inspected.

    Args:
        loops: Number of retain/release pairs to issue.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Local bindings keep attribute/global lookups out of the timed loop.
    retain_ctx = cuda.cuDevicePrimaryCtxRetain
    release_ctx = cuda.cuDevicePrimaryCtxRelease
    device = DEVICE

    started = time.perf_counter()
    for _ in range(loops):
        retain_ctx(device)
        release_ctx(device)
    return time.perf_counter() - started
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from cuda.bindings import driver as cuda
8+
9+
10+
def bench_curesult_construction(loops: int) -> float:
    """Time ``loops`` constructions of ``cuda.CUresult`` from the integer 0.

    Args:
        loops: Number of enum constructions to perform.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Bind the enum class locally so the loop only pays for the call itself.
    result_enum = cuda.CUresult

    started = time.perf_counter()
    for _ in range(loops):
        result_enum(0)
    return time.perf_counter() - started
17+
18+
19+
def bench_curesult_member_access(loops: int) -> float:
    """Time ``loops`` attribute reads of ``CUDA_SUCCESS`` on ``cuda.CUresult``.

    Args:
        loops: Number of member accesses to perform.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    result_enum = cuda.CUresult

    started = time.perf_counter()
    for _ in range(loops):
        # The bare expression is the operation under test; its value is discarded.
        result_enum.CUDA_SUCCESS  # noqa: B018
    return time.perf_counter() - started
26+
27+
28+
def bench_device_attribute_construction(loops: int) -> float:
    """Time ``loops`` constructions of ``cuda.CUdevice_attribute`` from the integer 1.

    Args:
        loops: Number of enum constructions to perform.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Bind the enum class locally so the loop only pays for the call itself.
    attribute_enum = cuda.CUdevice_attribute

    started = time.perf_counter()
    for _ in range(loops):
        attribute_enum(1)
    return time.perf_counter() - started

benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@
1212
PTR = alloc_persistent(1 << 18)
1313
ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
1414

15+
ATTRIBUTES = (
16+
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
17+
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
18+
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
19+
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
20+
)
21+
NUM_ATTRIBUTES = len(ATTRIBUTES)
22+
1523

1624
def bench_pointer_get_attribute(loops: int) -> float:
1725
# Local references to avoid global lookups in the hot loop
@@ -23,3 +31,15 @@ def bench_pointer_get_attribute(loops: int) -> float:
2331
for _ in range(loops):
2432
_fn(_attr, _ptr)
2533
return time.perf_counter() - t0
34+
35+
36+
def bench_pointer_get_attributes(loops: int) -> float:
    """Time ``loops`` calls to ``cuPointerGetAttributes`` on the module-level ``PTR``.

    Every call queries the full ``ATTRIBUTES`` tuple in one request. The
    values returned by the driver call are not inspected.

    Args:
        loops: Number of multi-attribute queries to issue.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Local references to avoid global lookups in the hot loop.
    query = cuda.cuPointerGetAttributes
    attr_count = NUM_ATTRIBUTES
    attr_list = ATTRIBUTES
    pointer = PTR

    started = time.perf_counter()
    for _ in range(loops):
        query(attr_count, attr_list, pointer)
    return time.perf_counter() - started
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import alloc_persistent, ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
ensure_context()
12+
13+
PTR = alloc_persistent(1 << 20)
14+
15+
cuuint32_t = cuda.cuuint32_t
16+
cuuint64_t = cuda.cuuint64_t
17+
18+
# Tiled: rank-2 float32, 128x128, 64x64 tile.
19+
TILED_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT32
20+
TILED_RANK = 2
21+
TILED_GLOBAL_DIM = (cuuint64_t(128), cuuint64_t(128))
22+
TILED_GLOBAL_STRIDES = (cuuint64_t(128 * 4),)
23+
TILED_BOX_DIM = (cuuint32_t(64), cuuint32_t(64))
24+
TILED_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1))
25+
TILED_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
26+
TILED_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
27+
TILED_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
28+
TILED_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
29+
30+
# Im2col: rank-3 float16, 32x64x64.
31+
IM2COL_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT16
32+
IM2COL_RANK = 3
33+
IM2COL_GLOBAL_DIM = (cuuint64_t(32), cuuint64_t(64), cuuint64_t(64))
34+
IM2COL_GLOBAL_STRIDES = (cuuint64_t(32 * 2), cuuint64_t(32 * 64 * 2))
35+
IM2COL_PIXEL_BOX_LOWER = (0,)
36+
IM2COL_PIXEL_BOX_UPPER = (0,)
37+
IM2COL_CHANNELS = 32
38+
IM2COL_PIXELS = 32
39+
IM2COL_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1), cuuint32_t(1))
40+
IM2COL_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
41+
IM2COL_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
42+
IM2COL_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
43+
IM2COL_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
44+
45+
_SUCCESS = cuda.CUresult.CUDA_SUCCESS
46+
47+
# Resolve bindings once at module load. A missing attribute (old binding that
48+
# predates a TMA API) is the only legitimate reason for a probe to skip —
49+
# everything else (signature mismatches, unexpected TypeError, etc.) should
50+
# surface loudly instead of being reclassified as "unsupported".
51+
_ENCODE_TILED = getattr(cuda, "cuTensorMapEncodeTiled", None)
52+
_ENCODE_IM2COL = getattr(cuda, "cuTensorMapEncodeIm2col", None)
53+
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
54+
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)
55+
56+
57+
def _probe_tiled() -> bool:
    """Check whether a tiled tensor-map encode works with the ``TILED_*`` parameters.

    Returns:
        ``False`` if the binding has no ``cuTensorMapEncodeTiled`` or the
        trial call reports a status other than ``CUDA_SUCCESS``; ``True``
        otherwise.
    """
    encode = _ENCODE_TILED
    if encode is None:
        # Binding predates this API: unsupported rather than an error.
        return False

    probe_args = (
        TILED_DTYPE,
        TILED_RANK,
        PTR,
        TILED_GLOBAL_DIM,
        TILED_GLOBAL_STRIDES,
        TILED_BOX_DIM,
        TILED_ELEMENT_STRIDES,
        TILED_INTERLEAVE,
        TILED_SWIZZLE,
        TILED_L2,
        TILED_OOB,
    )
    status, _tensor_map = encode(*probe_args)
    return status == _SUCCESS
74+
75+
76+
def _probe_im2col() -> bool:
    """Check whether an im2col tensor-map encode works with the ``IM2COL_*`` parameters.

    Returns:
        ``False`` if the binding has no ``cuTensorMapEncodeIm2col`` or the
        trial call reports a status other than ``CUDA_SUCCESS``; ``True``
        otherwise.
    """
    encode = _ENCODE_IM2COL
    if encode is None:
        # Binding predates this API: unsupported rather than an error.
        return False

    probe_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        IM2COL_PIXEL_BOX_LOWER,
        IM2COL_PIXEL_BOX_UPPER,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        IM2COL_SWIZZLE,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _tensor_map = encode(*probe_args)
    return status == _SUCCESS
96+
97+
98+
def _probe_im2col_wide() -> bool:
    """Check whether an im2col-wide tensor-map encode works on this setup.

    The trial call reuses the ``IM2COL_*`` parameters, passes 0 for both
    positional arguments that follow the global strides, and uses the
    ``CU_TENSOR_MAP_IM2COL_WIDE_MODE_W`` mode with ``CU_TENSOR_MAP_SWIZZLE_128B``.

    Returns:
        ``False`` if the binding lacks either ``cuTensorMapEncodeIm2colWide``
        or ``CUtensorMapIm2ColWideMode``, or if the trial call reports a
        status other than ``CUDA_SUCCESS``; ``True`` otherwise.
    """
    binding_available = _ENCODE_IM2COL_WIDE is not None and _IM2COL_WIDE_MODE_CLS is not None
    if not binding_available:
        # Binding predates this API: unsupported rather than an error.
        return False

    wide_mode = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    status, _tensor_map = _ENCODE_IM2COL_WIDE(
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        0,
        0,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        wide_mode,
        cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B,
        IM2COL_L2,
        IM2COL_OOB,
    )
    return status == _SUCCESS
120+
121+
122+
# Run each probe once at import time; the results decide which benchmarks
# are listed as skipped below.
_TILED_OK = _probe_tiled()
_IM2COL_OK = _probe_im2col()
_IM2COL_WIDE_OK = _probe_im2col_wide()

# The wide-mode constants are only defined when the wide probe succeeded.
if _IM2COL_WIDE_OK:
    _IM2COL_WIDE_MODE_W = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    _IM2COL_WIDE_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B

# Names of benchmark functions in this module whose probe did not succeed.
SKIPPED_BENCHMARKS: set[str] = {
    bench_name
    for bench_name, probe_ok in (
        ("bench_tensor_map_encode_tiled", _TILED_OK),
        ("bench_tensor_map_encode_im2col", _IM2COL_OK),
        ("bench_tensor_map_encode_im2col_wide", _IM2COL_WIDE_OK),
    )
    if not probe_ok
}
137+
138+
139+
def bench_tensor_map_encode_tiled(loops: int) -> float:
    """Time ``loops`` tiled tensor-map encodes using the ``TILED_*`` parameters.

    The callable is the ``_ENCODE_TILED`` binding resolved at module load,
    i.e. the same object the tiled probe exercised, which matches how the
    im2col-wide benchmark obtains its callable. This function is listed in
    ``SKIPPED_BENCHMARKS`` when that probe did not succeed. Return values of
    the encode call are not inspected.

    Args:
        loops: Number of encode calls to issue.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Local references keep global lookups out of the timed loop.
    _fn = _ENCODE_TILED
    _dt = TILED_DTYPE
    _rank = TILED_RANK
    _addr = PTR
    _gdim = TILED_GLOBAL_DIM
    _gstr = TILED_GLOBAL_STRIDES
    _bdim = TILED_BOX_DIM
    _estr = TILED_ELEMENT_STRIDES
    _inter = TILED_INTERLEAVE
    _swz = TILED_SWIZZLE
    _l2 = TILED_L2
    _oob = TILED_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _bdim, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0
157+
158+
159+
def bench_tensor_map_encode_im2col(loops: int) -> float:
    """Time ``loops`` im2col tensor-map encodes using the ``IM2COL_*`` parameters.

    The callable is the ``_ENCODE_IM2COL`` binding resolved at module load,
    i.e. the same object the im2col probe exercised, which matches how the
    im2col-wide benchmark obtains its callable. This function is listed in
    ``SKIPPED_BENCHMARKS`` when that probe did not succeed. Return values of
    the encode call are not inspected.

    Args:
        loops: Number of encode calls to issue.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Local references keep global lookups out of the timed loop.
    _fn = _ENCODE_IM2COL
    _dt = IM2COL_DTYPE
    _rank = IM2COL_RANK
    _addr = PTR
    _gdim = IM2COL_GLOBAL_DIM
    _gstr = IM2COL_GLOBAL_STRIDES
    _lower = IM2COL_PIXEL_BOX_LOWER
    _upper = IM2COL_PIXEL_BOX_UPPER
    _ch = IM2COL_CHANNELS
    _px = IM2COL_PIXELS
    _estr = IM2COL_ELEMENT_STRIDES
    _inter = IM2COL_INTERLEAVE
    _swz = IM2COL_SWIZZLE
    _l2 = IM2COL_L2
    _oob = IM2COL_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _lower, _upper, _ch, _px, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0
180+
181+
182+
def bench_tensor_map_encode_im2col_wide(loops: int) -> float:
    """Time ``loops`` im2col-wide tensor-map encodes.

    Uses the ``_ENCODE_IM2COL_WIDE`` binding resolved at module load together
    with ``_IM2COL_WIDE_MODE_W`` and ``_IM2COL_WIDE_SWIZZLE``, which the module
    defines only when the wide probe succeeded; this function is listed in
    ``SKIPPED_BENCHMARKS`` otherwise. Return values of the encode call are
    not inspected.

    Args:
        loops: Number of encode calls to issue.

    Returns:
        Elapsed seconds as measured by ``time.perf_counter``.
    """
    # Local references keep global lookups out of the timed loop.
    encode = _ENCODE_IM2COL_WIDE
    dtype = IM2COL_DTYPE
    rank = IM2COL_RANK
    address = PTR
    global_dim = IM2COL_GLOBAL_DIM
    global_strides = IM2COL_GLOBAL_STRIDES
    lower_w = 0
    upper_w = 0
    channels = IM2COL_CHANNELS
    pixels = IM2COL_PIXELS
    element_strides = IM2COL_ELEMENT_STRIDES
    interleave = IM2COL_INTERLEAVE
    wide_mode = _IM2COL_WIDE_MODE_W
    swizzle = _IM2COL_WIDE_SWIZZLE
    l2_promotion = IM2COL_L2
    oob_fill = IM2COL_OOB

    started = time.perf_counter()
    for _ in range(loops):
        encode(
            dtype,
            rank,
            address,
            global_dim,
            global_strides,
            lower_w,
            upper_w,
            channels,
            pixels,
            element_strides,
            interleave,
            wide_mode,
            swizzle,
            l2_promotion,
            oob_fill,
        )
    return time.perf_counter() - started

benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ add_driver_benchmark(bench_ctx_device)
8383
add_driver_benchmark(bench_stream)
8484
add_driver_benchmark(bench_event)
8585
add_driver_benchmark(bench_memory)
86+
add_driver_benchmark(bench_tensormap)
8687

8788
# NVRTC benchmarks (require nvrtc for kernel compilation)
8889
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)

benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,25 @@ int main(int argc, char** argv) {
7777
});
7878
}
7979

80+
// --- device_primary_ctx_retain ---
81+
// Outer retain so the benchmarked retain/release pair just bumps the refcount.
82+
CUcontext primary_outer = nullptr;
83+
check_cu(
84+
cuDevicePrimaryCtxRetain(&primary_outer, device),
85+
"cuDevicePrimaryCtxRetain (setup) failed"
86+
);
87+
{
88+
CUcontext primary = nullptr;
89+
suite.run("ctx_device.device_primary_ctx_retain", [&]() {
90+
check_cu(cuDevicePrimaryCtxRetain(&primary, device), "cuDevicePrimaryCtxRetain failed");
91+
check_cu(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease failed");
92+
});
93+
}
94+
check_cu(
95+
cuDevicePrimaryCtxRelease(device),
96+
"cuDevicePrimaryCtxRelease (teardown) failed"
97+
);
98+
8099
// Cleanup
81100
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");
82101

benchmarks/cuda_bindings/benchmarks/cpp/bench_pointer_attributes.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,29 @@ int main(int argc, char** argv) {
4949
});
5050
}
5151

52+
// --- pointer_get_attributes ---
53+
{
54+
unsigned int memory_type = 0;
55+
CUdeviceptr dev_ptr_out = 0;
56+
void* host_ptr_out = nullptr;
57+
unsigned long long buffer_id = 0;
58+
59+
CUpointer_attribute attrs[4] = {
60+
CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
61+
CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
62+
CU_POINTER_ATTRIBUTE_HOST_POINTER,
63+
CU_POINTER_ATTRIBUTE_BUFFER_ID,
64+
};
65+
void* data[4] = {&memory_type, &dev_ptr_out, &host_ptr_out, &buffer_id};
66+
67+
suite.run("pointer_attributes.pointer_get_attributes", [&]() {
68+
check_cu(
69+
cuPointerGetAttributes(4, attrs, data, ptr),
70+
"cuPointerGetAttributes failed"
71+
);
72+
});
73+
}
74+
5275
// Cleanup
5376
check_cu(cuMemFree(ptr), "cuMemFree failed");
5477
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

0 commit comments

Comments
 (0)