@@ -125,6 +125,64 @@ def test_launch_config_native_conversion(init_cuda):
125125 pytest .skip ("Driver or GPU not new enough for thread block clusters" )
126126
127127
128+ def test_to_native_launch_config_no_cluster ():
129+ """Covers the no-cluster path of _to_native_launch_config; no Hopper+ required."""
130+ from cuda .core ._launch_config import _to_native_launch_config
131+
132+ config = LaunchConfig (grid = (4 , 5 , 6 ), block = (7 , 8 , 9 ), shmem_size = 128 )
133+ native = _to_native_launch_config (config )
134+ assert native .gridDimX == 4 , f"Expected gridDimX=4, got { native .gridDimX } "
135+ assert native .gridDimY == 5 , f"Expected gridDimY=5, got { native .gridDimY } "
136+ assert native .gridDimZ == 6 , f"Expected gridDimZ=6, got { native .gridDimZ } "
137+ assert native .blockDimX == 7 , f"Expected blockDimX=7, got { native .blockDimX } "
138+ assert native .blockDimY == 8 , f"Expected blockDimY=8, got { native .blockDimY } "
139+ assert native .blockDimZ == 9 , f"Expected blockDimZ=9, got { native .blockDimZ } "
140+ assert native .sharedMemBytes == 128 , f"Expected sharedMemBytes=128, got { native .sharedMemBytes } "
141+ assert native .numAttrs == 0 , f"Expected numAttrs=0, got { native .numAttrs } "
142+ assert list (native .attrs ) == [], f"Expected empty attrs, got { list (native .attrs )} "
143+
144+
145+ def test_launch_config_cooperative_unsupported (monkeypatch ):
146+ """LaunchConfig(is_cooperative=True) raises when device does not support it."""
147+ from cuda .core import _launch_config as _lc_mod
148+
149+ class _FakeProps :
150+ cooperative_launch = False
151+
152+ class _FakeDev :
153+ properties = _FakeProps ()
154+
155+ monkeypatch .setattr (_lc_mod , "Device" , lambda : _FakeDev ())
156+ with pytest .raises (CUDAError , match = "cooperative kernels are not supported" ):
157+ LaunchConfig (grid = 1 , block = 1 , is_cooperative = True )
158+
159+
160+ def test_to_native_launch_config_cooperative (monkeypatch ):
161+ """Covers the is_cooperative branch of _to_native_launch_config; Device is mocked so it runs on any GPU."""
162+ from cuda .bindings import driver
163+ from cuda .core import _launch_config as _lc_mod
164+ from cuda .core ._launch_config import _to_native_launch_config
165+
166+ class _FakeProps :
167+ cooperative_launch = True
168+
169+ class _FakeDev :
170+ properties = _FakeProps ()
171+
172+ monkeypatch .setattr (_lc_mod , "Device" , lambda : _FakeDev ())
173+
174+ config = LaunchConfig (grid = 2 , block = 4 , is_cooperative = True )
175+ native = _to_native_launch_config (config )
176+ assert native .gridDimX == 2
177+ assert native .blockDimX == 4
178+ assert native .numAttrs == 1
179+ attr = native .attrs [0 ]
180+ assert attr .id == driver .CUlaunchAttributeID .CU_LAUNCH_ATTRIBUTE_COOPERATIVE , (
181+ f"Expected CU_LAUNCH_ATTRIBUTE_COOPERATIVE, got { attr .id } "
182+ )
183+ assert attr .value .cooperative == 1 , f"Expected cooperative=1, got { attr .value .cooperative } "
184+
185+
128186def test_launch_invalid_values (init_cuda ):
129187 code = 'extern "C" __global__ void my_kernel() {}'
130188 program = Program (code , SourceCodeType .CXX )
@@ -403,28 +461,42 @@ class MyFloat(ctypes.c_float):
403461 class MyBool (ctypes .c_bool ):
404462 pass
405463
406- # These should NOT raise — they should be handled via isinstance fallback
464+ # These should NOT raise; they should be handled via isinstance fallback
407465 holder = ParamHolder ([MyInt32 (42 ), MyFloat (3.14 ), MyBool (True )])
408466 assert holder .ptr != 0
409467
410468
411469@requires_module (np , "2.1" )
412- def test_launch_scalar_argument_ctypes_subclass_fallback ():
413- """Subclassed ctypes scalars survive the launch path and reach the kernel correctly."""
470+ @pytest .mark .parametrize (
471+ ("scalar_kind" , "np_dtype" , "cpp_type" , "raw_value" ),
472+ [
473+ ("ctypes" , np .int32 , "signed int" , - 123456 ),
474+ ("numpy" , np .float32 , "float" , 3.14 ),
475+ ],
476+ ids = ["ctypes_subclass" , "numpy_subclass" ],
477+ )
478+ def test_launch_scalar_argument_subclass_fallback (scalar_kind , np_dtype , cpp_type , raw_value ):
479+ """Subclassed scalar arguments survive fallback handling and reach the kernel."""
480+ if scalar_kind == "ctypes" :
414481
415- class MyInt32 (ctypes .c_int32 ):
416- pass
482+ class Subclassed (ctypes .c_int32 ):
483+ pass
484+ else :
485+
486+ class Subclassed (np .float32 ):
487+ pass
488+
489+ scalar = Subclassed (raw_value )
490+ expected = np_dtype (raw_value )
417491
418492 dev = Device ()
419493 dev .set_current ()
420494
421495 mr = LegacyPinnedMemoryResource ()
422- b = mr .allocate (np .dtype (np . int32 ).itemsize )
423- arr = np .from_dlpack (b ).view (np . int32 )
496+ b = mr .allocate (np .dtype (np_dtype ).itemsize )
497+ arr = np .from_dlpack (b ).view (np_dtype )
424498 arr [:] = 0
425499
426- scalar = MyInt32 (- 123456 )
427-
428500 code = r"""
429501 template <typename T>
430502 __global__ void write_scalar(T* arr, T val) {
@@ -435,17 +507,16 @@ class MyInt32(ctypes.c_int32):
435507 arch = "" .join (f"{ i } " for i in dev .compute_capability )
436508 pro_opts = ProgramOptions (std = "c++17" , arch = f"sm_{ arch } " )
437509 prog = Program (code , code_type = "c++" , options = pro_opts )
438- ker_name = "write_scalar<signed int >"
510+ ker_name = f "write_scalar<{ cpp_type } >"
439511 mod = prog .compile ("cubin" , name_expressions = (ker_name ,))
440512 ker = mod .get_kernel (ker_name )
441513
442- # This exercises the prepare_ctypes_arg isinstance fallback through a real launch.
443514 stream = dev .default_stream
444515 config = LaunchConfig (grid = 1 , block = 1 )
445516 launch (stream , config , ker , arr .ctypes .data , scalar )
446517 stream .sync ()
447518
448- assert arr [0 ] == scalar . value
519+ assert arr [0 ] == expected
449520
450521
451522def test_kernel_arg_numpy_subclass_isinstance_fallback ():
@@ -462,46 +533,6 @@ class MyFloat32(np.float32):
462533 assert holder .ptr != 0
463534
464535
465- @requires_module (np , "2.1" )
466- def test_launch_scalar_argument_numpy_subclass_fallback ():
467- """Subclassed numpy scalars survive the launch path and reach the kernel correctly."""
468-
469- class MyFloat32 (np .float32 ):
470- pass
471-
472- dev = Device ()
473- dev .set_current ()
474-
475- mr = LegacyPinnedMemoryResource ()
476- b = mr .allocate (np .dtype (np .float32 ).itemsize )
477- arr = np .from_dlpack (b ).view (np .float32 )
478- arr [:] = 0.0
479-
480- scalar = MyFloat32 (3.14 )
481-
482- code = r"""
483- template <typename T>
484- __global__ void write_scalar(T* arr, T val) {
485- arr[0] = val;
486- }
487- """
488-
489- arch = "" .join (f"{ i } " for i in dev .compute_capability )
490- pro_opts = ProgramOptions (std = "c++17" , arch = f"sm_{ arch } " )
491- prog = Program (code , code_type = "c++" , options = pro_opts )
492- ker_name = "write_scalar<float>"
493- mod = prog .compile ("cubin" , name_expressions = (ker_name ,))
494- ker = mod .get_kernel (ker_name )
495-
496- # This exercises the prepare_numpy_arg isinstance fallback through a real launch.
497- stream = dev .default_stream
498- config = LaunchConfig (grid = 1 , block = 1 )
499- launch (stream , config , ker , arr .ctypes .data , scalar )
500- stream .sync ()
501-
502- assert arr [0 ] == scalar
503-
504-
505536def test_kernel_arg_python_isinstance_fallbacks ():
506537 """Subclassed Python builtins hit the isinstance fallback in ParamHolder."""
507538 from cuda .core ._kernel_arg_handler import ParamHolder
0 commit comments