Skip to content

Commit 8800c46

Browse files
committed
refactor(examples): use KernelHelper context management
Replace manual KernelHelper close calls with context-managed usage so modules are unloaded consistently even on early exits and cleanup stays centralized. Made-with: Cursor
1 parent e504f3b commit 8800c46

11 files changed

Lines changed: 415 additions & 431 deletions

cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -71,38 +71,37 @@ def main():
7171
hinput[i] = i
7272

7373
devID = findCudaDevice()
74-
kernelHelper = common.KernelHelper(clock_nvrtc, devID)
75-
kernel_addr = kernelHelper.getFunction(b"timedReduction")
76-
77-
dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
78-
doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
79-
dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
80-
checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81-
82-
args = ((dinput, doutput, dtimer), (None, None, None))
83-
shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
84-
85-
grid_dims = (NUM_BLOCKS, 1, 1)
86-
block_dims = (NUM_THREADS, 1, 1)
87-
88-
checkCudaErrors(
89-
cuda.cuLaunchKernel(
90-
kernel_addr,
91-
*grid_dims, # grid dim
92-
*block_dims, # block dim
93-
shared_memory_nbytes,
94-
0, # shared mem, stream
95-
args,
96-
0,
97-
)
98-
) # arguments
99-
100-
checkCudaErrors(cuda.cuCtxSynchronize())
101-
checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102-
checkCudaErrors(cuda.cuMemFree(dinput))
103-
checkCudaErrors(cuda.cuMemFree(doutput))
104-
checkCudaErrors(cuda.cuMemFree(dtimer))
105-
kernelHelper.close()
74+
with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
75+
kernel_addr = kernelHelper.getFunction(b"timedReduction")
76+
77+
dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
78+
doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
79+
dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
80+
checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81+
82+
args = ((dinput, doutput, dtimer), (None, None, None))
83+
shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
84+
85+
grid_dims = (NUM_BLOCKS, 1, 1)
86+
block_dims = (NUM_THREADS, 1, 1)
87+
88+
checkCudaErrors(
89+
cuda.cuLaunchKernel(
90+
kernel_addr,
91+
*grid_dims, # grid dim
92+
*block_dims, # block dim
93+
shared_memory_nbytes,
94+
0, # shared mem, stream
95+
args,
96+
0,
97+
)
98+
) # arguments
99+
100+
checkCudaErrors(cuda.cuCtxSynchronize())
101+
checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102+
checkCudaErrors(cuda.cuMemFree(dinput))
103+
checkCudaErrors(cuda.cuMemFree(doutput))
104+
checkCudaErrors(cuda.cuMemFree(dtimer))
106105

107106
avgElapsedClocks = 0.0
108107

cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py

Lines changed: 49 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -154,58 +154,57 @@ def main():
154154
f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
155155
)
156156

157-
kernelHelper = common.KernelHelper(simpleCubemapTexture, devID)
158-
_transformKernel = kernelHelper.getFunction(b"transformKernel")
159-
kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
160-
checkCudaErrors(
161-
cuda.cuLaunchKernel(
162-
_transformKernel,
163-
dimGrid.x,
164-
dimGrid.y,
165-
dimGrid.z, # grid dim
166-
dimBlock.x,
167-
dimBlock.y,
168-
dimBlock.z, # block dim
169-
0,
170-
0, # shared mem and stream
171-
kernelArgs,
172-
0,
173-
)
174-
) # arguments
175-
176-
checkCudaErrors(cudart.cudaDeviceSynchronize())
177-
178-
start = time.time()
179-
180-
# Execute the kernel
181-
checkCudaErrors(
182-
cuda.cuLaunchKernel(
183-
_transformKernel,
184-
dimGrid.x,
185-
dimGrid.y,
186-
dimGrid.z, # grid dim
187-
dimBlock.x,
188-
dimBlock.y,
189-
dimBlock.z, # block dim
190-
0,
191-
0, # shared mem and stream
192-
kernelArgs,
193-
0,
194-
)
195-
) # arguments
196-
197-
checkCudaErrors(cudart.cudaDeviceSynchronize())
198-
stop = time.time()
199-
print(f"Processing time: {stop - start:.3f} msec")
200-
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
201-
202-
# Allocate mem for the result on host side
203-
h_odata = np.empty_like(h_data)
204-
# Copy result from device to host
205-
checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
157+
with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
158+
_transformKernel = kernelHelper.getFunction(b"transformKernel")
159+
kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
160+
checkCudaErrors(
161+
cuda.cuLaunchKernel(
162+
_transformKernel,
163+
dimGrid.x,
164+
dimGrid.y,
165+
dimGrid.z, # grid dim
166+
dimBlock.x,
167+
dimBlock.y,
168+
dimBlock.z, # block dim
169+
0,
170+
0, # shared mem and stream
171+
kernelArgs,
172+
0,
173+
)
174+
) # arguments
175+
176+
checkCudaErrors(cudart.cudaDeviceSynchronize())
177+
178+
start = time.time()
179+
180+
# Execute the kernel
181+
checkCudaErrors(
182+
cuda.cuLaunchKernel(
183+
_transformKernel,
184+
dimGrid.x,
185+
dimGrid.y,
186+
dimGrid.z, # grid dim
187+
dimBlock.x,
188+
dimBlock.y,
189+
dimBlock.z, # block dim
190+
0,
191+
0, # shared mem and stream
192+
kernelArgs,
193+
0,
194+
)
195+
) # arguments
196+
197+
checkCudaErrors(cudart.cudaDeviceSynchronize())
198+
stop = time.time()
199+
print(f"Processing time: {stop - start:.3f} msec")
200+
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
201+
202+
# Allocate mem for the result on host side
203+
h_odata = np.empty_like(h_data)
204+
# Copy result from device to host
205+
checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
206206

207207
checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
208-
kernelHelper.close()
209208
checkCudaErrors(cudart.cudaFree(d_data))
210209
checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
211210

cuda_bindings/examples/0_Introduction/simpleP2P_test.py

Lines changed: 34 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -153,53 +153,49 @@ def main():
153153
print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...")
154154
checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
155155

156-
kernelHelper = [None] * 2
157-
_simpleKernel = [None] * 2
158-
kernelArgs = [None] * 2
159-
160-
kernelHelper[1] = common.KernelHelper(simplep2p, gpuid[1])
161-
_simpleKernel[1] = kernelHelper[1].getFunction(b"SimpleKernel")
162-
kernelArgs[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
163-
checkCudaErrors(
164-
cuda.cuLaunchKernel(
165-
_simpleKernel[1],
166-
blocks.x,
167-
blocks.y,
168-
blocks.z,
169-
threads.x,
170-
threads.y,
171-
threads.z,
172-
0,
173-
0,
174-
kernelArgs[1],
175-
0,
156+
with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper:
157+
simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel")
158+
kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
159+
checkCudaErrors(
160+
cuda.cuLaunchKernel(
161+
simple_kernel_1,
162+
blocks.x,
163+
blocks.y,
164+
blocks.z,
165+
threads.x,
166+
threads.y,
167+
threads.z,
168+
0,
169+
0,
170+
kernel_args_1,
171+
0,
172+
)
176173
)
177-
)
178174

179175
checkCudaErrors(cudart.cudaDeviceSynchronize())
180176

181177
# Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
182178
# output to the GPU 0 buffer
183179
print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...")
184180
checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
185-
kernelHelper[0] = common.KernelHelper(simplep2p, gpuid[0])
186-
_simpleKernel[0] = kernelHelper[0].getFunction(b"SimpleKernel")
187-
kernelArgs[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
188-
checkCudaErrors(
189-
cuda.cuLaunchKernel(
190-
_simpleKernel[0],
191-
blocks.x,
192-
blocks.y,
193-
blocks.z,
194-
threads.x,
195-
threads.y,
196-
threads.z,
197-
0,
198-
0,
199-
kernelArgs[0],
200-
0,
181+
with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper:
182+
simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel")
183+
kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
184+
checkCudaErrors(
185+
cuda.cuLaunchKernel(
186+
simple_kernel_0,
187+
blocks.x,
188+
blocks.y,
189+
blocks.z,
190+
threads.x,
191+
threads.y,
192+
threads.z,
193+
0,
194+
0,
195+
kernel_args_0,
196+
0,
197+
)
201198
)
202-
)
203199

204200
checkCudaErrors(cudart.cudaDeviceSynchronize())
205201

@@ -227,9 +223,6 @@ def main():
227223

228224
# Cleanup and shutdown
229225
print("Shutting down...")
230-
for helper in kernelHelper:
231-
if helper is not None:
232-
helper.close()
233226
checkCudaErrors(cudart.cudaEventDestroy(start_event))
234227
checkCudaErrors(cudart.cudaEventDestroy(stop_event))
235228
checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))

cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -126,27 +126,27 @@ def main():
126126
grid.x = math.ceil(nelem / float(block.x))
127127
grid.y = 1
128128
grid.z = 1
129-
kernelHelper = common.KernelHelper(simpleZeroCopy, idev)
130-
_vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
131-
kernelArgs = (
132-
(d_a, d_b, d_c, nelem),
133-
(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
134-
)
135-
checkCudaErrors(
136-
cuda.cuLaunchKernel(
137-
_vectorAddGPU,
138-
grid.x,
139-
grid.y,
140-
grid.z,
141-
block.x,
142-
block.y,
143-
block.z,
144-
0,
145-
cuda.CU_STREAM_LEGACY,
146-
kernelArgs,
147-
0,
129+
with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper:
130+
_vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
131+
kernelArgs = (
132+
(d_a, d_b, d_c, nelem),
133+
(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
134+
)
135+
checkCudaErrors(
136+
cuda.cuLaunchKernel(
137+
_vectorAddGPU,
138+
grid.x,
139+
grid.y,
140+
grid.z,
141+
block.x,
142+
block.y,
143+
block.z,
144+
0,
145+
cuda.CU_STREAM_LEGACY,
146+
kernelArgs,
147+
0,
148+
)
148149
)
149-
)
150150
checkCudaErrors(cudart.cudaDeviceSynchronize())
151151

152152
print("> Checking the results from vectorAddGPU() ...")
@@ -163,8 +163,6 @@ def main():
163163
errorNorm = math.sqrt(errorNorm)
164164
refNorm = math.sqrt(refNorm)
165165

166-
kernelHelper.close()
167-
168166
# Memory clean up
169167

170168
print("Releasing CPU memory...")

cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -209,32 +209,31 @@ def main():
209209
# To make the AND and XOR tests generate something other than 0...
210210
atom_arr_h[7] = atom_arr_h[9] = 0xFF
211211

212-
kernelHelper = common.KernelHelper(systemWideAtomics, dev_id)
213-
_atomicKernel = kernelHelper.getFunction(b"atomicKernel")
214-
kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
215-
checkCudaErrors(
216-
cuda.cuLaunchKernel(
217-
_atomicKernel,
218-
numBlocks,
219-
1,
220-
1, # grid dim
221-
numThreads,
222-
1,
223-
1, # block dim
224-
0,
225-
cuda.CU_STREAM_LEGACY, # shared mem and stream
226-
kernelArgs,
227-
0,
228-
)
229-
) # arguments
212+
with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper:
213+
_atomicKernel = kernelHelper.getFunction(b"atomicKernel")
214+
kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
215+
checkCudaErrors(
216+
cuda.cuLaunchKernel(
217+
_atomicKernel,
218+
numBlocks,
219+
1,
220+
1, # grid dim
221+
numThreads,
222+
1,
223+
1, # block dim
224+
0,
225+
cuda.CU_STREAM_LEGACY, # shared mem and stream
226+
kernelArgs,
227+
0,
228+
)
229+
) # arguments
230230
# NOTE: Python doesn't have an equivalent system atomic operations
231231
# atomicKernel_CPU(atom_arr_h, numBlocks * numThreads)
232232

233233
checkCudaErrors(cudart.cudaDeviceSynchronize())
234234

235235
# Compute & verify reference solution
236236
testResult = verify(atom_arr_h, numThreads * numBlocks)
237-
kernelHelper.close()
238237

239238
if device_prop.pageableMemoryAccess:
240239
pass

0 commit comments

Comments
 (0)