From 4fc0c3848ed174ec70dcf920937ef999cdc411c9 Mon Sep 17 00:00:00 2001
From: Min Wang <wangm@nvidia.com>
Date: Wed, 15 Apr 2026 18:42:42 +0000
Subject: [PATCH 1/2] fix(callconv): align alloca slots to CUDA vector type ABI
 requirements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CUDA vector types (float2, float4, etc.) carry __align__(N) attributes
that require N-byte alignment (float2 → 8 B, float4 → 16 B).  Numbast
lowers them to anonymous LLVM structs, whose ABI alignment defaults to
the element alignment (4 B for float), not the vector alignment.

When FunctionCallConv._lower_impl allocates stack slots via
builder.alloca / cgutils.alloca_once without an explicit alignment,
LLVM emits 4-byte-aligned allocas.  The NVRTC-compiled shim then tries
to perform a vector load/store (e.g. ld/st.v2.f32 for float2) on that
4-byte-aligned stack slot, which violates the 8-byte alignment
requirement and raises cudaErrorMisalignedAddress at runtime.

Fix: after every alloca in _lower_impl, set
  alloca.align = max(dl.abi_alignment(ty), min(dl.abi_size(ty), 16))

The cap at 16 bytes matches the alignment of float4/int4 (the largest
alignment among the classic CUDA vector types) without over-aligning
large user-defined structs.

The four sites fixed are:
  1. retval_ptr — the function return-value slot
  2. visible-arg ptrs (no-intent-plan path)
  3. out_return ptrs (intent-plan path)
  4. visible-arg ptrs (intent-plan path)

Fixes optixGetTriangleBarycentrics() (float2 return) and any other
Numbast binding that returns or accepts a CUDA vector type.
---
 numbast/src/numbast/callconv.py | 41 +++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/numbast/src/numbast/callconv.py b/numbast/src/numbast/callconv.py
index 5e10eadd..695704c7 100644
--- a/numbast/src/numbast/callconv.py
+++ b/numbast/src/numbast/callconv.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+# NUMBAST_RETVAL_ALIGN_FIX_APPLIED
 from numbast.args import prepare_ir_types
 from numbast.intent import IntentPlan
 
@@ -107,6 +108,34 @@ def _lower_impl(self, builder, context, sig, args):
         else:
             retval_ty = context.get_value_type(cxx_return_type)
             retval_ptr = builder.alloca(retval_ty, name="retval")
+            # Enforce CUDA vector-type alignment on every alloca.
+            #
+            # Background: CUDA vector types (float2, float4, int4, …) are
+            # declared with __align__(N) in the CUDA headers, meaning the
+            # hardware and PTX ISA require the pointer to be N-byte aligned
+            # (float2 → 8 B, float4 → 16 B). When Numbast lowers these types
+            # into LLVM IR they become anonymous structs, e.g. float2 becomes
+            # {float, float}. LLVM computes the ABI alignment of a struct as
+            # the maximum alignment of its members — 4 B for a struct of
+            # floats — which is *less* than the CUDA-required 8 B. builder.alloca
+            # without an explicit alignment adopts that 4-byte default.
+            #
+            # The NVRTC-compiled shim accesses the slot with a vector
+            # instruction (e.g. ld/st.v2.f32 for float2) which requires
+            # 8-byte alignment. Passing a 4-byte-aligned pointer causes
+            # cudaErrorMisalignedAddress at runtime.
+            #
+            # Fix: after every alloca, raise the alignment to
+            #   max(abi_alignment(type), min(sizeof(type), 16))
+            # Taking the max with abi_alignment never reduces alignment below
+            # what LLVM already computed. min(sizeof, 16) captures the natural
+            # alignment for all standard CUDA vectors (float2=8, float4=16)
+            # while the cap at 16 prevents over-aligning large user structs
+            # that happen to be bigger than 16 bytes.
+            _dl = context.target_data
+            retval_ptr.align = max(
+                _dl.abi_alignment(retval_ty), min(_dl.abi_size(retval_ty), 16)
+            )
 
         # 2. Prepare arguments
         if self._intent_plan is None:
@@ -154,6 +183,10 @@ def _lower_impl(self, builder, context, sig, args):
                     ptrs.append(arg)
                 else:
                     ptr = cgutils.alloca_once(builder, vty)
+                    _dl = context.target_data
+                    ptr.align = max(  # see retval_ptr comment above
+                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
+                    )
                     builder.store(
                         arg, ptr, align=getattr(argty, "alignof_", None)
                     )
@@ -175,6 +208,10 @@ def _lower_impl(self, builder, context, sig, args):
                     out_nbty = self._out_return_types[out_pos]
                     vty = context.get_value_type(out_nbty)
                     ptr = cgutils.alloca_once(builder, vty)
+                    _dl = context.target_data
+                    ptr.align = max(  # see retval_ptr comment above
+                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
+                    )
                     ptrs.append(ptr)
                     arg_pointer_types.append(ir.PointerType(vty))
                     out_return_ptrs.append((out_nbty, ptr))
@@ -194,6 +231,10 @@ def _lower_impl(self, builder, context, sig, args):
                     arg_pointer_types.append(vty)
                 else:
                     ptr = cgutils.alloca_once(builder, vty)
+                    _dl = context.target_data
+                    ptr.align = max(  # see retval_ptr comment above
+                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
+                    )
                     builder.store(
                         arg, ptr, align=getattr(argty, "alignof_", None)
                     )

From f32141d3d936bf644927f6ab9119b2e0ac922c79 Mon Sep 17 00:00:00 2001
From: Min Wang <wangm@nvidia.com>
Date: Wed, 15 Apr 2026 19:10:36 +0000
Subject: [PATCH 2/2] refactor(callconv): use alignof_ from Numba type instead
 of sizeof heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous fix used max(abi_alignment, min(sizeof, 16)) to guess the
required alloca alignment for CUDA vector types. This heuristic works
for power-of-2 sized types (float2, float4) but is incorrect for
non-power-of-2 types like float3/uint3 (sizeof=12 → would produce
alignment=12, which is not a valid power-of-2 LLVM alignment).

Numbast already propagates alignof_ from ast_canopy onto user-defined
bound structs, and already uses getattr(argty, "alignof_", None) for
load/store instructions. Apply the same convention to alloca: check
alignof_ on the Numba type and set it when present; when absent, leave
LLVM's default ABI alignment (correct for scalars and structs without
an explicit __align__ attribute).

Callers registering built-in CUDA vector types in CTYPE_MAPS must set
alignof_ on the Numba type to match the __align__(N) in the CUDA
headers (e.g. float32x2.alignof_ = 8 for float2's __align__(8)).
This mirrors how ast_canopy-derived struct types already work.
---
 numbast/src/numbast/callconv.py | 69 ++++++++++++++-------------------
 1 file changed, 30 insertions(+), 39 deletions(-)

diff --git a/numbast/src/numbast/callconv.py b/numbast/src/numbast/callconv.py
index 695704c7..94df7b40 100644
--- a/numbast/src/numbast/callconv.py
+++ b/numbast/src/numbast/callconv.py
@@ -108,34 +108,28 @@ def _lower_impl(self, builder, context, sig, args):
         else:
             retval_ty = context.get_value_type(cxx_return_type)
             retval_ptr = builder.alloca(retval_ty, name="retval")
-            # Enforce CUDA vector-type alignment on every alloca.
+            # Use the Numba type's alignof_ to set the alloca alignment.
             #
-            # Background: CUDA vector types (float2, float4, int4, …) are
-            # declared with __align__(N) in the CUDA headers, meaning the
-            # hardware and PTX ISA require the pointer to be N-byte aligned
-            # (float2 → 8 B, float4 → 16 B). When Numbast lowers these types
-            # into LLVM IR they become anonymous structs, e.g. float2 becomes
-            # {float, float}. LLVM computes the ABI alignment of a struct as
-            # the maximum alignment of its members — 4 B for a struct of
-            # floats — which is *less* than the CUDA-required 8 B. builder.alloca
-            # without an explicit alignment adopts that 4-byte default.
-            #
-            # The NVRTC-compiled shim accesses the slot with a vector
-            # instruction (e.g. ld/st.v2.f32 for float2) which requires
-            # 8-byte alignment. Passing a 4-byte-aligned pointer causes
+            # LLVM computes struct ABI alignment as the max alignment of its
+            # members.  For CUDA vector types (float2, float4, uchar4, …)
+            # declared with __align__(N) in the CUDA headers, N can exceed
+            # the member alignment: float2 is {float,float} with member
+            # alignment 4 B but __align__(8).  LLVM therefore assigns a 4 B
+            # alloca, while the NVRTC shim uses a vector instruction
+            # (ld/st.v2.f32) that requires 8 B alignment, causing
             # cudaErrorMisalignedAddress at runtime.
             #
-            # Fix: after every alloca, raise the alignment to
-            #   max(abi_alignment(type), min(sizeof(type), 16))
-            # Taking the max with abi_alignment never reduces alignment below
-            # what LLVM already computed. min(sizeof, 16) captures the natural
-            # alignment for all standard CUDA vectors (float2=8, float4=16)
-            # while the cap at 16 prevents over-aligning large user structs
-            # that happen to be bigger than 16 bytes.
-            _dl = context.target_data
-            retval_ptr.align = max(
-                _dl.abi_alignment(retval_ty), min(_dl.abi_size(retval_ty), 16)
-            )
+            # Numbast's struct binder already sets alignof_ on user-defined
+            # bound structs (propagated from ast_canopy).  For built-in CUDA
+            # vector types, callers must set alignof_ on the Numba type when
+            # registering it in CTYPE_MAPS.  When alignof_ is present it is
+            # used here, matching the convention already applied to loads and
+            # stores (getattr(argty, "alignof_", None)).  When absent, LLVM's
+            # default ABI alignment is used, which is correct for scalars and
+            # structs without an explicit __align__ attribute.
+            _nb_align = getattr(cxx_return_type, "alignof_", None)
+            if _nb_align is not None:
+                retval_ptr.align = _nb_align
 
         # 2. Prepare arguments
         if self._intent_plan is None:
@@ -183,12 +177,11 @@ def _lower_impl(self, builder, context, sig, args):
                     ptrs.append(arg)
                 else:
                     ptr = cgutils.alloca_once(builder, vty)
-                    _dl = context.target_data
-                    ptr.align = max(  # see retval_ptr comment above
-                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
-                    )
+                    _nb_align = getattr(argty, "alignof_", None)
+                    if _nb_align is not None:
+                        ptr.align = _nb_align  # see retval_ptr comment above
                     builder.store(
-                        arg, ptr, align=getattr(argty, "alignof_", None)
+                        arg, ptr, align=_nb_align
                     )
                     ptrs.append(ptr)
         else:
@@ -208,10 +201,9 @@ def _lower_impl(self, builder, context, sig, args):
                     out_nbty = self._out_return_types[out_pos]
                     vty = context.get_value_type(out_nbty)
                     ptr = cgutils.alloca_once(builder, vty)
-                    _dl = context.target_data
-                    ptr.align = max(  # see retval_ptr comment above
-                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
-                    )
+                    _nb_align = getattr(out_nbty, "alignof_", None)
+                    if _nb_align is not None:
+                        ptr.align = _nb_align  # see retval_ptr comment above
                     ptrs.append(ptr)
                     arg_pointer_types.append(ir.PointerType(vty))
                     out_return_ptrs.append((out_nbty, ptr))
@@ -231,12 +223,11 @@ def _lower_impl(self, builder, context, sig, args):
                     arg_pointer_types.append(vty)
                 else:
                     ptr = cgutils.alloca_once(builder, vty)
-                    _dl = context.target_data
-                    ptr.align = max(  # see retval_ptr comment above
-                        _dl.abi_alignment(vty), min(_dl.abi_size(vty), 16)
-                    )
+                    _nb_align = getattr(argty, "alignof_", None)
+                    if _nb_align is not None:
+                        ptr.align = _nb_align  # see retval_ptr comment above
                     builder.store(
-                        arg, ptr, align=getattr(argty, "alignof_", None)
+                        arg, ptr, align=_nb_align
                     )
                     ptrs.append(ptr)
                     arg_pointer_types.append(ir.PointerType(vty))