From 4fc0c3848ed174ec70dcf920937ef999cdc411c9 Mon Sep 17 00:00:00 2001 From: Min Wang Date: Wed, 15 Apr 2026 18:42:42 +0000 Subject: [PATCH 1/2] fix(callconv): align alloca slots to CUDA vector type ABI requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA vector types (float2, float4, etc.) carry __align__(N) attributes that require N-byte alignment (float2 → 8 B, float4 → 16 B). LLVM represents them as anonymous structs whose ABI alignment defaults to the element alignment (4 B for float), not the vector alignment. When FunctionCallConv._lower_impl allocates stack slots via builder.alloca / cgutils.alloca_once without an explicit alignment, LLVM emits 4-byte-aligned allocas. The NVRTC-compiled shim then tries to perform a vector load/store (e.g. ld/st.v2.f32 for float2) on that 4-byte-aligned pointer, which violates the 8-byte alignment requirement and raises cudaErrorMisalignedAddress at runtime. Fix: after every alloca in _lower_impl, set alloca.align = max(dl.abi_alignment(ty), min(dl.abi_size(ty), 16)) The cap at 16 bytes covers the 16-byte alignment of float4/int4 (the strictest alignment among the classic CUDA vector types) without over-aligning large user-defined structs. The four sites fixed are: 1. retval_ptr — the function return-value slot 2. visible-arg ptrs (no-intent-plan path) 3. out_return ptrs (intent-plan path) 4. visible-arg ptrs (intent-plan path) Fixes optixGetTriangleBarycentrics() (float2 return) and any other Numbast binding that returns or accepts a CUDA vector type. --- numbast/src/numbast/callconv.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/numbast/src/numbast/callconv.py b/numbast/src/numbast/callconv.py index 5e10eadd..695704c7 100644 --- a/numbast/src/numbast/callconv.py +++ b/numbast/src/numbast/callconv.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +# NUMBAST_RETVAL_ALIGN_FIX_APPLIED from numbast.args import prepare_ir_types from numbast.intent import IntentPlan @@ -107,6 +108,34 @@ def _lower_impl(self, builder, context, sig, args): else: retval_ty = context.get_value_type(cxx_return_type) retval_ptr = builder.alloca(retval_ty, name="retval") + # Enforce CUDA vector-type alignment on every alloca. + # + # Background: CUDA vector types (float2, float4, int4, …) are + # declared with __align__(N) in the CUDA headers, meaning the + # hardware and PTX ISA require the pointer to be N-byte aligned + # (float2 → 8 B, float4 → 16 B). When Numbast lowers these types + # into LLVM IR they become anonymous structs, e.g. float2 becomes + # {float, float}. LLVM computes the ABI alignment of a struct as + # the maximum alignment of its members — 4 B for a struct of + # floats — which is *less* than the CUDA-required 8 B. builder.alloca + # without an explicit alignment adopts that 4-byte default. + # + # The NVRTC-compiled shim accesses the slot with a vector + # instruction (e.g. ld/st.v2.f32 for float2) which requires + # 8-byte alignment. Passing a 4-byte-aligned pointer causes + # cudaErrorMisalignedAddress at runtime. + # + # Fix: after every alloca, raise the alignment to + # max(abi_alignment(type), min(sizeof(type), 16)) + # Taking the max with abi_alignment never reduces alignment below + # what LLVM already computed. min(sizeof, 16) captures the natural + # alignment for all standard CUDA vectors (float2=8, float4=16) + # while the cap at 16 prevents over-aligning large user structs + # that happen to be bigger than 16 bytes. + _dl = context.target_data + retval_ptr.align = max( + _dl.abi_alignment(retval_ty), min(_dl.abi_size(retval_ty), 16) + ) # 2. 
Prepare arguments if self._intent_plan is None: @@ -154,6 +183,10 @@ def _lower_impl(self, builder, context, sig, args): ptrs.append(arg) else: ptr = cgutils.alloca_once(builder, vty) + _dl = context.target_data + ptr.align = max( # see retval_ptr comment above + _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) + ) builder.store( arg, ptr, align=getattr(argty, "alignof_", None) ) @@ -175,6 +208,10 @@ def _lower_impl(self, builder, context, sig, args): out_nbty = self._out_return_types[out_pos] vty = context.get_value_type(out_nbty) ptr = cgutils.alloca_once(builder, vty) + _dl = context.target_data + ptr.align = max( # see retval_ptr comment above + _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) + ) ptrs.append(ptr) arg_pointer_types.append(ir.PointerType(vty)) out_return_ptrs.append((out_nbty, ptr)) @@ -194,6 +231,10 @@ def _lower_impl(self, builder, context, sig, args): arg_pointer_types.append(vty) else: ptr = cgutils.alloca_once(builder, vty) + _dl = context.target_data + ptr.align = max( # see retval_ptr comment above + _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) + ) builder.store( arg, ptr, align=getattr(argty, "alignof_", None) ) From f32141d3d936bf644927f6ab9119b2e0ac922c79 Mon Sep 17 00:00:00 2001 From: Min Wang Date: Wed, 15 Apr 2026 19:10:36 +0000 Subject: [PATCH 2/2] refactor(callconv): use alignof_ from Numba type instead of sizeof heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix used max(abi_alignment, min(sizeof, 16)) to guess the required alloca alignment for CUDA vector types. This heuristic works for power-of-2 sized types (float2, float4) but is incorrect for non-power-of-2 types like float3/uint3 (sizeof=12 → would produce alignment=12, which is not a valid power-of-2 LLVM alignment). Numbast already propagates alignof_ from ast_canopy onto user-defined bound structs, and already uses getattr(argty, "alignof_", None) for load/store instructions. 
Apply the same convention to alloca: read alignof_ from the Numba type and, when present, use it as the alloca alignment; when absent, keep LLVM's default ABI alignment (correct for scalars and structs without an explicit __align__ attribute). Callers registering built-in CUDA vector types in CTYPE_MAPS must set alignof_ on the Numba type to match the __align__(N) in the CUDA headers (e.g. float32x2.alignof_ = 8 for float2's __align__(8)). This mirrors how ast_canopy-derived struct types already work. --- numbast/src/numbast/callconv.py | 69 ++++++++++++++------------------- 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/numbast/src/numbast/callconv.py b/numbast/src/numbast/callconv.py index 695704c7..94df7b40 100644 --- a/numbast/src/numbast/callconv.py +++ b/numbast/src/numbast/callconv.py @@ -108,34 +108,28 @@ def _lower_impl(self, builder, context, sig, args): else: retval_ty = context.get_value_type(cxx_return_type) retval_ptr = builder.alloca(retval_ty, name="retval") - # Enforce CUDA vector-type alignment on every alloca. + # Use the Numba type's alignof_ to set the alloca alignment. # - # Background: CUDA vector types (float2, float4, int4, …) are - # declared with __align__(N) in the CUDA headers, meaning the - # hardware and PTX ISA require the pointer to be N-byte aligned - # (float2 → 8 B, float4 → 16 B). When Numbast lowers these types - # into LLVM IR they become anonymous structs, e.g. float2 becomes - # {float, float}. LLVM computes the ABI alignment of a struct as - # the maximum alignment of its members — 4 B for a struct of - # floats — which is *less* than the CUDA-required 8 B. builder.alloca - # without an explicit alignment adopts that 4-byte default. - # - # The NVRTC-compiled shim accesses the slot with a vector - # instruction (e.g. ld/st.v2.f32 for float2) which requires - # 8-byte alignment. Passing a 4-byte-aligned pointer causes + # LLVM computes struct ABI alignment as the max alignment of its + # members. 
For CUDA vector types (float2, float4, uchar4, …) + # declared with __align__(N) in the CUDA headers, N can exceed + # the member alignment: float2 is {float,float} with member + # alignment 4 B but __align__(8). LLVM therefore assigns a 4 B + # alloca, while the NVRTC shim uses a vector instruction + # (ld/st.v2.f32) that requires 8 B alignment, causing # cudaErrorMisalignedAddress at runtime. # - # Fix: after every alloca, raise the alignment to - # max(abi_alignment(type), min(sizeof(type), 16)) - # Taking the max with abi_alignment never reduces alignment below - # what LLVM already computed. min(sizeof, 16) captures the natural - # alignment for all standard CUDA vectors (float2=8, float4=16) - # while the cap at 16 prevents over-aligning large user structs - # that happen to be bigger than 16 bytes. - _dl = context.target_data - retval_ptr.align = max( - _dl.abi_alignment(retval_ty), min(_dl.abi_size(retval_ty), 16) - ) + # Numbast's struct binder already sets alignof_ on user-defined + # bound structs (propagated from ast_canopy). For built-in CUDA + # vector types, callers must set alignof_ on the Numba type when + # registering it in CTYPE_MAPS. When alignof_ is present it is + # used here, matching the convention already applied to loads and + # stores (getattr(argty, "alignof_", None)). When absent, LLVM's + # default ABI alignment is used, which is correct for scalars and + # structs without an explicit __align__ attribute. + _nb_align = getattr(cxx_return_type, "alignof_", None) + if _nb_align is not None: + retval_ptr.align = _nb_align # 2. 
Prepare arguments if self._intent_plan is None: @@ -183,12 +177,11 @@ def _lower_impl(self, builder, context, sig, args): ptrs.append(arg) else: ptr = cgutils.alloca_once(builder, vty) - _dl = context.target_data - ptr.align = max( # see retval_ptr comment above - _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) - ) + _nb_align = getattr(argty, "alignof_", None) + if _nb_align is not None: + ptr.align = _nb_align # see retval_ptr comment above builder.store( - arg, ptr, align=getattr(argty, "alignof_", None) + arg, ptr, align=_nb_align ) ptrs.append(ptr) else: @@ -208,10 +201,9 @@ def _lower_impl(self, builder, context, sig, args): out_nbty = self._out_return_types[out_pos] vty = context.get_value_type(out_nbty) ptr = cgutils.alloca_once(builder, vty) - _dl = context.target_data - ptr.align = max( # see retval_ptr comment above - _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) - ) + _nb_align = getattr(out_nbty, "alignof_", None) + if _nb_align is not None: + ptr.align = _nb_align # see retval_ptr comment above ptrs.append(ptr) arg_pointer_types.append(ir.PointerType(vty)) out_return_ptrs.append((out_nbty, ptr)) @@ -231,12 +223,11 @@ def _lower_impl(self, builder, context, sig, args): arg_pointer_types.append(vty) else: ptr = cgutils.alloca_once(builder, vty) - _dl = context.target_data - ptr.align = max( # see retval_ptr comment above - _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16) - ) + _nb_align = getattr(argty, "alignof_", None) + if _nb_align is not None: + ptr.align = _nb_align # see retval_ptr comment above builder.store( - arg, ptr, align=getattr(argty, "alignof_", None) + arg, ptr, align=_nb_align ) ptrs.append(ptr) arg_pointer_types.append(ir.PointerType(vty))