NVIDIA
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
Lines changed: 31 additions & 32 deletions
diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
Lines changed: 49 additions & 50 deletions
diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
Lines changed: 34 additions & 41 deletions
diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
Lines changed: 20 additions & 22 deletions
diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
Lines changed: 18 additions & 19 deletions
@@ -71,38 +71,37 @@ def main():
         hinput[i] = i
 
     devID = findCudaDevice()
-    kernelHelper = common.KernelHelper(clock_nvrtc, devID)
-    kernel_addr = kernelHelper.getFunction(b"timedReduction")
-
-    dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
-    doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
-    dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
-    checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
-
-    args = ((dinput, doutput, dtimer), (None, None, None))
-    shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
-
-    grid_dims = (NUM_BLOCKS, 1, 1)
-    block_dims = (NUM_THREADS, 1, 1)
-
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            kernel_addr,
-            *grid_dims,  # grid dim
-            *block_dims,  # block dim
-            shared_memory_nbytes,
-            0,  # shared mem, stream
-            args,
-            0,
-        )
-    )  # arguments
-
-    checkCudaErrors(cuda.cuCtxSynchronize())
-    checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
-    checkCudaErrors(cuda.cuMemFree(dinput))
-    checkCudaErrors(cuda.cuMemFree(doutput))
-    checkCudaErrors(cuda.cuMemFree(dtimer))
-    kernelHelper.close()
+    with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
+        kernel_addr = kernelHelper.getFunction(b"timedReduction")
+
+        dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
+        doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
+        dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
+        checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
+
+        args = ((dinput, doutput, dtimer), (None, None, None))
+        shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
+
+        grid_dims = (NUM_BLOCKS, 1, 1)
+        block_dims = (NUM_THREADS, 1, 1)
+
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel_addr,
+                *grid_dims,  # grid dim
+                *block_dims,  # block dim
+                shared_memory_nbytes,
+                0,  # shared mem, stream
+                args,
+                0,
+            )
+        )  # arguments
+
+        checkCudaErrors(cuda.cuCtxSynchronize())
+        checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
+        checkCudaErrors(cuda.cuMemFree(dinput))
+        checkCudaErrors(cuda.cuMemFree(doutput))
+        checkCudaErrors(cuda.cuMemFree(dtimer))
 
     avgElapsedClocks = 0.0
 
 
@@ -154,58 +154,57 @@ def main():
         f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
     )
 
-    kernelHelper = common.KernelHelper(simpleCubemapTexture, devID)
-    _transformKernel = kernelHelper.getFunction(b"transformKernel")
-    kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _transformKernel,
-            dimGrid.x,
-            dimGrid.y,
-            dimGrid.z,  # grid dim
-            dimBlock.x,
-            dimBlock.y,
-            dimBlock.z,  # block dim
-            0,
-            0,  # shared mem and stream
-            kernelArgs,
-            0,
-        )
-    )  # arguments
-
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
-
-    start = time.time()
-
-    # Execute the kernel
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _transformKernel,
-            dimGrid.x,
-            dimGrid.y,
-            dimGrid.z,  # grid dim
-            dimBlock.x,
-            dimBlock.y,
-            dimBlock.z,  # block dim
-            0,
-            0,  # shared mem and stream
-            kernelArgs,
-            0,
-        )
-    )  # arguments
-
-    checkCudaErrors(cudart.cudaDeviceSynchronize())
-    stop = time.time()
-    print(f"Processing time: {stop - start:.3f} msec")
-    print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
-
-    # Allocate mem for the result on host side
-    h_odata = np.empty_like(h_data)
-    # Copy result from device to host
-    checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
+    with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
+        _transformKernel = kernelHelper.getFunction(b"transformKernel")
+        kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _transformKernel,
+                dimGrid.x,
+                dimGrid.y,
+                dimGrid.z,  # grid dim
+                dimBlock.x,
+                dimBlock.y,
+                dimBlock.z,  # block dim
+                0,
+                0,  # shared mem and stream
+                kernelArgs,
+                0,
+            )
+        )  # arguments
+
+        checkCudaErrors(cudart.cudaDeviceSynchronize())
+
+        start = time.time()
+
+        # Execute the kernel
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _transformKernel,
+                dimGrid.x,
+                dimGrid.y,
+                dimGrid.z,  # grid dim
+                dimBlock.x,
+                dimBlock.y,
+                dimBlock.z,  # block dim
+                0,
+                0,  # shared mem and stream
+                kernelArgs,
+                0,
+            )
+        )  # arguments
+
+        checkCudaErrors(cudart.cudaDeviceSynchronize())
+        stop = time.time()
+        print(f"Processing time: {stop - start:.3f} msec")
+        print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
+
+        # Allocate mem for the result on host side
+        h_odata = np.empty_like(h_data)
+        # Copy result from device to host
+        checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
 
     checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
-    kernelHelper.close()
     checkCudaErrors(cudart.cudaFree(d_data))
     checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
 
 
@@ -153,53 +153,49 @@ def main():
     print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
 
-    kernelHelper = [None] * 2
-    _simpleKernel = [None] * 2
-    kernelArgs = [None] * 2
-
-    kernelHelper[1] = common.KernelHelper(simplep2p, gpuid[1])
-    _simpleKernel[1] = kernelHelper[1].getFunction(b"SimpleKernel")
-    kernelArgs[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _simpleKernel[1],
-            blocks.x,
-            blocks.y,
-            blocks.z,
-            threads.x,
-            threads.y,
-            threads.z,
-            0,
-            0,
-            kernelArgs[1],
-            0,
+    with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper:
+        simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel")
+        kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                simple_kernel_1,
+                blocks.x,
+                blocks.y,
+                blocks.z,
+                threads.x,
+                threads.y,
+                threads.z,
+                0,
+                0,
+                kernel_args_1,
+                0,
+            )
         )
-    )
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
     # output to the GPU 0 buffer
     print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
-    kernelHelper[0] = common.KernelHelper(simplep2p, gpuid[0])
-    _simpleKernel[0] = kernelHelper[0].getFunction(b"SimpleKernel")
-    kernelArgs[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _simpleKernel[0],
-            blocks.x,
-            blocks.y,
-            blocks.z,
-            threads.x,
-            threads.y,
-            threads.z,
-            0,
-            0,
-            kernelArgs[0],
-            0,
+    with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper:
+        simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel")
+        kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                simple_kernel_0,
+                blocks.x,
+                blocks.y,
+                blocks.z,
+                threads.x,
+                threads.y,
+                threads.z,
+                0,
+                0,
+                kernel_args_0,
+                0,
+            )
         )
-    )
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
@@ -227,9 +223,6 @@ def main():
 
     # Cleanup and shutdown
     print("Shutting down...")
-    for helper in kernelHelper:
-        if helper is not None:
-            helper.close()
     checkCudaErrors(cudart.cudaEventDestroy(start_event))
     checkCudaErrors(cudart.cudaEventDestroy(stop_event))
     checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
 
@@ -126,27 +126,27 @@ def main():
     grid.x = math.ceil(nelem / float(block.x))
     grid.y = 1
     grid.z = 1
-    kernelHelper = common.KernelHelper(simpleZeroCopy, idev)
-    _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
-    kernelArgs = (
-        (d_a, d_b, d_c, nelem),
-        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
-    )
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _vectorAddGPU,
-            grid.x,
-            grid.y,
-            grid.z,
-            block.x,
-            block.y,
-            block.z,
-            0,
-            cuda.CU_STREAM_LEGACY,
-            kernelArgs,
-            0,
+    with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper:
+        _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
+        kernelArgs = (
+            (d_a, d_b, d_c, nelem),
+            (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+        )
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _vectorAddGPU,
+                grid.x,
+                grid.y,
+                grid.z,
+                block.x,
+                block.y,
+                block.z,
+                0,
+                cuda.CU_STREAM_LEGACY,
+                kernelArgs,
+                0,
+            )
         )
-    )
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     print("> Checking the results from vectorAddGPU() ...")
@@ -163,8 +163,6 @@ def main():
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    kernelHelper.close()
-
     # Memory clean up
 
     print("Releasing CPU memory...")
 
@@ -209,32 +209,31 @@ def main():
     # To make the AND and XOR tests generate something other than 0...
     atom_arr_h[7] = atom_arr_h[9] = 0xFF
 
-    kernelHelper = common.KernelHelper(systemWideAtomics, dev_id)
-    _atomicKernel = kernelHelper.getFunction(b"atomicKernel")
-    kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
-    checkCudaErrors(
-        cuda.cuLaunchKernel(
-            _atomicKernel,
-            numBlocks,
-            1,
-            1,  # grid dim
-            numThreads,
-            1,
-            1,  # block dim
-            0,
-            cuda.CU_STREAM_LEGACY,  # shared mem and stream
-            kernelArgs,
-            0,
-        )
-    )  # arguments
+    with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper:
+        _atomicKernel = kernelHelper.getFunction(b"atomicKernel")
+        kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _atomicKernel,
+                numBlocks,
+                1,
+                1,  # grid dim
+                numThreads,
+                1,
+                1,  # block dim
+                0,
+                cuda.CU_STREAM_LEGACY,  # shared mem and stream
+                kernelArgs,
+                0,
+            )
+        )  # arguments
     # NOTE: Python doesn't have an equivalent system atomic operations
     # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads)
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     # Compute & verify reference solution
     testResult = verify(atom_arr_h, numThreads * numBlocks)
-    kernelHelper.close()
 
     if device_prop.pageableMemoryAccess:
         pass