From 260ff3532232917be7c5af747229e837c6a42d3a Mon Sep 17 00:00:00 2001 From: Yaoyao Ding Date: Mon, 4 May 2026 13:29:46 -0400 Subject: [PATCH 1/2] [Test] Compile-only mode for tests on non-supporting GPUs Adds InstantiatedScript.compile(*args, **kwargs) -> JitInstance, a public API that transpiles + builds every schedule for the given arguments without executing the kernel, benchmarking, or persisting a dispatch choice. Adds tilus.target.scope(target) as a context manager for temporarily overriding the build target. Changes tilus.testing.requires.X behavior: when the current GPU does not support X, the test now runs in compile-only mode instead of being hard-skipped -- the build target is scoped to X, InstantiatedScript.__call__ is patched to delegate to compile() and raise an internal sentinel, and the wrapper catches the sentinel so a successful compile counts as a passing test. Lets CI on older arches (e.g. sm89) cover compilation paths for newer arches (e.g. sm100a) without requiring matching hardware. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yaoyao Ding --- python/tilus/lang/instantiated_script.py | 32 +++++++++-- python/tilus/lang/script.py | 8 +-- python/tilus/target.py | 30 ++++++++++- python/tilus/testing/_requires.py | 68 +++++++++++++++++------- 4 files changed, 110 insertions(+), 28 deletions(-) diff --git a/python/tilus/lang/instantiated_script.py b/python/tilus/lang/instantiated_script.py index 222cd8b7..becefe23 100644 --- a/python/tilus/lang/instantiated_script.py +++ b/python/tilus/lang/instantiated_script.py @@ -782,6 +782,33 @@ def __call__(self, *args, **kwargs): return ret + def compile(self, *args: Any, **kwargs: Any) -> JitInstance: + """Compile the script for the given arguments without executing it. + + This transpiles every schedule in the autotune space into a Program and builds each Program + to a shared library, but does not run the kernel and does not benchmark/persist a dispatch + choice. Useful in CI to validate that a kernel compiles for a target architecture (e.g., + sm100a) on a machine that does not support running it. Combine with + :func:`tilus.target.scope` to override the build target. + + Parameters + ---------- + args: + The positional arguments to ``__call__``. + + kwargs: + The keyword arguments to ``__call__``. + + Returns + ------- + jit_instance: JitInstance + The JIT instance for the script with the given arguments. The compiled programs are + available as ``jit_instance.valid_programs`` and ``jit_instance.compiled_programs``. + """ + jit_instance = self._jit_instance_for(*args, **kwargs) + jit_instance.programs() + return jit_instance + def _jit_instance_for(self, *args: Any, **kwargs: Any) -> JitInstance: if kwargs or self.with_default: # we allow the user to pass the keyword arguments to the script instance, or use the default values @@ -794,10 +821,7 @@ def _jit_instance_for(self, *args: Any, **kwargs: Any) -> JitInstance: "The number of arguments should be {}, but got {}.".format(len(self.params.param_names), len(args)) ) - # extract the JIT key and the tuning key - keys = extract_keys(args, self.const_params, self.tuning_params) - - jit_key, tuning_key = keys + jit_key, _ = extract_keys(args, self.const_params, self.tuning_params) jit_instance: Optional[JitInstance] = self.jit_instances.get(jit_key, None) if jit_instance is None: jit_instance = JitInstance(self.script_cls, self.params, self.build_options, self.schedules, jit_key) diff --git a/python/tilus/lang/script.py b/python/tilus/lang/script.py index d12ffc0e..849262fc 100644 --- a/python/tilus/lang/script.py +++ b/python/tilus/lang/script.py @@ -21,7 +21,7 @@ from tilus.lang.modules.cuda import cuda if TYPE_CHECKING: - from tilus.lang.instantiated_script import InstantiatedScript, JitInstance + from tilus.lang.instantiated_script import InstantiatedScript, JitInstance # noqa: F401 Int: TypeAlias = int | Expr @@ -70,9 +70,9 @@ def __init__(self) -> None: def __call__(self, *args, **kwargs): raise RuntimeError("This method should never be called.") - def jit_instance_for(self, *args: object, **kwargs: object) -> JitInstance: + def compile(self, *args: object, **kwargs: object) -> JitInstance: """ - Instantiate the script program with the specified arguments and keyword arguments. + Transpile and build the script for the given arguments without executing it. Parameters ---------- @@ -86,7 +86,7 @@ def jit_instance_for(self, *args: object, **kwargs: object) -> JitInstance: ret: JitInstance The JIT instance for the script with given arguments. """ - raise RuntimeError("This method should never be called. See InstantiatedScript.jit_instance instead.") + raise RuntimeError("This method should never be called. See InstantiatedScript.compile instead.") # the following properties should only be access in the __call__ function @property diff --git a/python/tilus/target.py b/python/tilus/target.py index 4161faae..51af9457 100644 --- a/python/tilus/target.py +++ b/python/tilus/target.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import functools +from contextlib import contextmanager from dataclasses import dataclass -from typing import Optional, Sequence, Tuple +from typing import Iterator, Optional, Sequence, Tuple @dataclass(frozen=True) @@ -293,6 +294,33 @@ def set_current_target(target: Target) -> None: _target = target +@contextmanager +def scope(target: Target) -> Iterator[Target]: + """Temporarily set the current compilation target. + + Useful to compile a kernel for a specific architecture (e.g., sm100a) on a machine that does not + support running it. Restores the previous target on exit. + + Parameters + ---------- + target: Target + The target to use within the scope. + + Yields + ------ + target: Target + The target that is now active. + """ + global _target + assert isinstance(target, Target) + prev = _target + _target = target + try: + yield target + finally: + _target = prev + + @functools.cache def get_default_target() -> Target: import torch diff --git a/python/tilus/testing/_requires.py b/python/tilus/testing/_requires.py index 83785504..856aa01a 100644 --- a/python/tilus/testing/_requires.py +++ b/python/tilus/testing/_requires.py @@ -12,40 +12,70 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable +import functools +from typing import Any, Callable -from tilus.target import Target, get_current_target, nvgpu_sm80, nvgpu_sm90, nvgpu_sm100, nvgpu_sm100a +from tilus.target import Target, get_current_target, nvgpu_sm80, nvgpu_sm90, nvgpu_sm100, nvgpu_sm100a, scope + + +class _CompileOnlyDone(Exception): + """Raised inside a compile-only test to short-circuit execution after a successful compile.""" def _requires(target: Target) -> Callable[[Callable], Callable]: """ - Pytest fixture decorator that skips tests if the current GPU doesn't support the required architecture. + Pytest decorator that adapts test behavior to the current GPU. + + If the current GPU supports the required target, the test runs unchanged. + + Otherwise, the test runs in *compile-only* mode: + - The current compilation target is overridden to ``target`` for the duration of the test. + - The first ``InstantiatedScript.__call__`` invocation is redirected to + :py:meth:`InstantiatedScript.compile `, which transpiles + + builds every schedule in the autotune space without running the kernel. + - After the compile succeeds, a sentinel exception is raised to short-circuit the rest of the + test body; the decorator catches the sentinel and treats the test as passed. Parameters ---------- target : Target - The required target architecture. Examples include 'sm_90a', 'sm_80', + The required target architecture, e.g. ``nvgpu_sm100a``. """ - def decorator(test_func): - import pytest - + def decorator(test_func: Callable) -> Callable: try: - required_target = target current_target = get_current_target() - current_capability = current_target.properties.compute_capability + supports_target = current_target.supports(target) + except Exception: + # Could not determine the current target (e.g. no GPU available). + # Fall through to compile-only mode -- compilation does not need a runtime GPU. + supports_target = False - if not current_target.supports(required_target): - return pytest.mark.skip( - f"Test requires architecture {required_target}, but current GPU capability is {current_capability}" - )(test_func) + if supports_target: return test_func - except ValueError as e: - # If we can't parse the architecture string, skip the test - return pytest.mark.skip(f"Invalid architecture requirement: {e}")(test_func) - except Exception as e: - # If we can't determine current capability, skip the test - return pytest.mark.skip(f"Cannot determine current GPU capability: {e}")(test_func) + + @functools.wraps(test_func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + # Importing here avoids a top-level circular import: tilus.testing is imported eagerly + # by tests but tilus.lang.instantiated_script depends on the rest of the package. + from tilus.lang.instantiated_script import InstantiatedScript + + original_call = InstantiatedScript.__call__ + + def compile_only_call(self: InstantiatedScript, *call_args: Any, **call_kwargs: Any) -> Any: + self.compile(*call_args, **call_kwargs) + raise _CompileOnlyDone() + + InstantiatedScript.__call__ = compile_only_call # type: ignore[method-assign] + try: + with scope(target): + test_func(*args, **kwargs) + except _CompileOnlyDone: + pass + finally: + InstantiatedScript.__call__ = original_call # type: ignore[method-assign] + + return wrapper return decorator From 666ff235de09202a79854ff1f5fa569d98f403ed Mon Sep 17 00:00:00 2001 From: Yaoyao Ding Date: Mon, 4 May 2026 18:03:22 -0400 Subject: [PATCH 2/2] [CI] Bump CUDA toolkit to 13.0 and fix sm100/sm100a test annotations The CI runner has an L4 (sm89) but tests for newer-arch instructions need to compile against compute_100 / compute_100a. The docker image was nvidia/cuda:12.6.2-devel-ubuntu22.04, whose nvcc is 12.6 and does not know compute_100. Bump to nvidia/cuda:13.0.0-devel-ubuntu22.04 so the compile-only paths can build sm_100/sm_100a kernels (matches the torch 13.0 binaries already pulled at runtime). Also tighten two test annotations whose kernels emit instructions unsupported below sm_100a: - test_copy_async_tensor_cta uses cp.async.bulk.tensor with the .cta_group::1 modifier (sm_100+); was annotated sm_90. - test_cluster_launch_control uses clusterlaunchcontrol.try_cancel with the multicast variant (sm_100a only); was annotated sm_100. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yaoyao Ding --- .github/workflows/deploy-wheel-to-pypi.yaml | 2 +- .github/workflows/tests.yaml | 6 +++--- tests/instructions/test_cluster_launch_control.py | 2 +- tests/instructions/test_copy_async_tensor.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/deploy-wheel-to-pypi.yaml b/.github/workflows/deploy-wheel-to-pypi.yaml index be54e067..fcc713dc 100644 --- a/.github/workflows/deploy-wheel-to-pypi.yaml +++ b/.github/workflows/deploy-wheel-to-pypi.yaml @@ -18,7 +18,7 @@ jobs: if: github.repository == 'NVIDIA/tilus' runs-on: linux-amd64-gpu-l4-latest-1 container: - image: nvidia/cuda:12.6.2-devel-ubuntu22.04 + image: nvidia/cuda:13.0.0-devel-ubuntu22.04 options: --gpus all outputs: wheel-path: ${{ steps.setup-and-install.outputs.wheel-path }} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index b7bc4499..d9fd4c63 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -48,7 +48,7 @@ jobs: if: github.repository == 'NVIDIA/tilus' && needs.check-changes.outputs.should_run_tests == 'true' runs-on: linux-amd64-gpu-l4-latest-1 container: - image: nvidia/cuda:12.6.2-devel-ubuntu22.04 + image: nvidia/cuda:13.0.0-devel-ubuntu22.04 options: --gpus all steps: - name: Checkout Repository @@ -106,7 +106,7 @@ jobs: - linux-amd64-gpu-l4-latest-1 runs-on: ${{ matrix.runner }} container: - image: nvidia/cuda:12.6.2-devel-ubuntu22.04 + image: nvidia/cuda:13.0.0-devel-ubuntu22.04 options: --gpus all steps: - name: Checkout Repository @@ -128,7 +128,7 @@ jobs: if: github.repository == 'NVIDIA/tilus' && needs.check-changes.outputs.should_run_examples == 'true' runs-on: linux-amd64-gpu-l4-latest-1 container: - image: nvidia/cuda:12.6.2-devel-ubuntu22.04 + image: nvidia/cuda:13.0.0-devel-ubuntu22.04 options: --gpus all steps: - name: Checkout Repository diff --git a/tests/instructions/test_cluster_launch_control.py b/tests/instructions/test_cluster_launch_control.py index ed0e7f0d..5a249998 100644 --- a/tests/instructions/test_cluster_launch_control.py +++ b/tests/instructions/test_cluster_launch_control.py @@ -99,7 +99,7 @@ def __call__(self, n: int32, p_out: ~int32) -> None: break -@tilus.testing.requires.nvgpu_sm100 +@tilus.testing.requires.nvgpu_sm100a @pytest.mark.parametrize("cluster_blocks", [2, 4]) @pytest.mark.parametrize("num_stages", [2, 3, 4]) @pytest.mark.parametrize("warps", [4, 8]) diff --git a/tests/instructions/test_copy_async_tensor.py b/tests/instructions/test_copy_async_tensor.py index de6cbb50..4115108d 100644 --- a/tests/instructions/test_copy_async_tensor.py +++ b/tests/instructions/test_copy_async_tensor.py @@ -71,7 +71,7 @@ def __call__(self, m_size: int32, n_size: int, x_ptr: ~float16, y_ptr: ~float16) self.tma.wait_group(0) -@requires.nvgpu_sm90 +@requires.nvgpu_sm100a def test_copy_async_tensor_cta(): m = 123 n = 64 * 8