From ca7c920d162fe85d0785cb74e86d5d45f51dd4f6 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:52:22 +0200 Subject: [PATCH 01/87] handle `nnkSym` in `getInnerPointerType` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 45af3782..5157fec7 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -92,6 +92,8 @@ proc getInnerPointerType(n: NimNode): GpuType = # VarTy # Sym "BigInt" result = nimToGpuType(n[0]) + elif n.kind == nnkSym: # symbol of e.g. `ntyVar` + result = nimToGpuType(n.getTypeInst()) else: raiseAssert "Found what: " & $n.treerepr From 561328eb469f68d8fcbc6ecc3880b4d24814a016 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:52:33 +0200 Subject: [PATCH 02/87] fix `constructPtrSignature` after change from nil -> gtVoid for iTyp The type of the identifier is now always `gtVoid`, so the previous check `not idTyp.isNil` does not work anylonger. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 0b6552bd..2ff1e336 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -56,7 +56,7 @@ proc fromAddressSpace(addrSpace: AddressSpace): GpuSymbolKind = proc constructPtrSignature(addrSpace: AddressSpace, idTyp: GpuType, ptrStr, typStr: string): string = ## Constructs the `ptr` string, which only includes ## the RW string if the address space is `storage` - let rw = if not idTyp.isNil: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) + let rw = if idTyp.kind != gtVoid: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) let rwStr = if rw: "read_write" else: "read" case addrSpace of asStorage: result = &"{ptrStr}<{addrSpace}, {typStr}, {rwStr}>" From d7de955815f05832b9146a1b6777505a765e7314 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:53:34 +0200 Subject: [PATCH 03/87] handle `gpuCast` in determineSymKind/Mutability/Ident --- constantine/math_compiler/experimental/backends/wgsl.nim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2ff1e336..26edc9c1 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -205,6 +205,7 @@ proc determineSymKind(arg: GpuAst): GpuSymbolKind = of gpuBlock: arg.statements[^1].determineSymKind() # look at last element of gpuPrefix: gsLocal # equivalent to constructing a local var of gpuConv: gsLocal # a converted value will be a local var + of gpuCast: arg.cExpr.determineSymKind() # symbol kind of the 
thing we cast else: raiseAssert "Not implemented to determine symbol kind from node: " & $arg @@ -225,6 +226,7 @@ proc determineMutability(arg: GpuAst): bool = of gpuBlock: arg.statements[^1].determineMutability() # look at last element of gpuPrefix: false # equivalent to constructing a local var of gpuConv: false # a converted value will be immutable + of gpuCast: arg.cExpr.determineMutability() # mutability of the thing we cast else: raiseAssert "Not implemented to determine mutability from node: " & $arg @@ -250,8 +252,9 @@ proc determineIdent(arg: GpuAst): GpuAst = of gpuBlock: arg.statements[^1].determineIdent() of gpuPrefix: dfl() of gpuConv: dfl() + of gpuCast: arg.cExpr.determineIdent() # ident of the thing we cast else: - raiseAssert "Not implemented to determine mutability from node: " & $arg + raiseAssert "Not implemented to determine ident from node: " & $arg proc getGenericArguments(args: seq[GpuAst], params: seq[GpuParam], callerParams: Table[string, GpuParam]): seq[GenericArg] = ## If an argument is not a ptr argument in the original function (`params`) then From 4277fcf9bc999e0902aa516354b3aeae8f94e365 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 14:16:03 +0200 Subject: [PATCH 04/87] allow determination of GPU type to fail in `nimToGpuType` This is only for the optional helper used on the WebGPU backend, where we try to determine the type in an infix expression. However, for some arguments to the infix this is not uniquely possible. I.e. we might encounter `SomeInteger`, which is not a unique type. In this case we just fall back to not assigning a known type. 
--- .../math_compiler/experimental/nim_to_gpu.nim | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 5157fec7..2e8a42d3 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has import ./gpu_types import ./backends/backends -proc nimToGpuType(n: NimNode): GpuType +proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType proc initGpuType(kind: GpuTypeKind): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to @@ -75,25 +75,25 @@ proc unpackGenericInst(t: NimNode): NimNode = proc toGpuTypeKind(t: NimNode): GpuTypeKind = result = t.unpackGenericInst().typeKind.toGpuTypeKind() -proc getInnerPointerType(n: NimNode): GpuType = +proc getInnerPointerType(n: NimNode, allowToFail: bool = false): GpuType = doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind if n.typeKind in {ntyPointer, ntyUncheckedArray}: let typ = n.getTypeInst() doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr doAssert typ[0].kind in {nnkIdent, nnkSym} doAssert typ[0].strVal in ["ptr", "UncheckedArray"] - result = nimToGpuType(typ[1]) + result = nimToGpuType(typ[1], allowToFail) elif n.kind == nnkPtrTy: - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail) elif n.kind == nnkAddr: let typ = n.getTypeInst() - result = getInnerPointerType(typ) + result = getInnerPointerType(typ, allowToFail) elif n.kind == nnkVarTy: # VarTy # Sym "BigInt" - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail) elif n.kind == nnkSym: # symbol of e.g. 
`ntyVar` - result = nimToGpuType(n.getTypeInst()) + result = nimToGpuType(n.getTypeInst(), allowToFail) else: raiseAssert "Found what: " & $n.treerepr @@ -130,34 +130,38 @@ proc getTypeName(n: NimNode): string = else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc parseTypeFields(node: NimNode): seq[GpuTypeField] -proc nimToGpuType(n: NimNode): GpuType = +proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Maps a Nim type to a type on the GPU + ## + ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases + ## where we would otherwise raise. This is so that in some cases where + ## we only _attempt_ to determine a type, we can do so safely. case n.kind of nnkIdentDefs: # extract type for let / var based on explicit or implicit type if n[n.len - 2].kind != nnkEmpty: # explicit type - result = nimToGpuType(n[n.len - 2]) + result = nimToGpuType(n[n.len - 2], allowToFail) else: # take from last element - result = nimToGpuType(n[n.len - 1].getTypeInst()) + result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail) of nnkConstDef: if n[1].kind != nnkEmpty: # has an explicit type - result = nimToGpuType(n[1]) + result = nimToGpuType(n[1], allowToFail) else: - result = nimToGpuType(n[2]) # derive from the RHS literal + result = nimToGpuType(n[2], allowToFail) # derive from the RHS literal else: if n.kind == nnkEmpty: return initGpuType(gtVoid) case n.typeKind of ntyBool, ntyInt .. ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = false) + result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = false) of ntyVar: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = true) + result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = true) of ntyPointer: result = initGpuVoidPtr() of ntyUncheckedArray: ## Note: this is just the internal type of the array. 
It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - result = initGpuUAType(getInnerPointerType(n)) + result = initGpuUAType(getInnerPointerType(n, allowToFail)) of ntyObject: let impl = n.getTypeImpl let flds = impl.parseTypeFields() @@ -166,7 +170,7 @@ proc nimToGpuType(n: NimNode): GpuType = of ntyArray: # For a generic, static array type, e.g.: if n.kind == nnkSym: - return nimToGpuType(getTypeImpl(n)) + return nimToGpuType(getTypeImpl(n), allowToFail) if n.len == 3: # BracketExpr # Sym "array" @@ -189,8 +193,12 @@ proc nimToGpuType(n: NimNode): GpuType = result = initGpuType(gtVoid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: - result = n.unpackGenericInst().nimToGpuType() - else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr + result = n.unpackGenericInst().nimToGpuType(allowToFail) + else: + if allowToFail: + result = GpuType(kind: gtVoid) + else: + raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr proc assignOp(op: string, isBoolean: bool): string = ## Returns the correct CUDA operation given the Nim operator. @@ -533,13 +541,14 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.bLeft = ctx.toGpuAst(node[1]) result.bRight = ctx.toGpuAst(node[2]) # We patch the types of int / float literals. WGSL does not automatically convert literals - # to the target type. + # to the target type. Determining the type here _can_ fail. In that case the + # `lType` field will just be `gtVoid`, like the default. 
if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit: # determine literal type based on `bRight` - result.bLeft.lType = nimToGpuType(node[2]) + result.bLeft.lType = nimToGpuType(node[2], allowToFail = true) elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit: # determine literal type based on `bLeft` - result.bRight.lType = nimToGpuType(node[1]) + result.bRight.lType = nimToGpuType(node[1], allowToFail = true) of nnkDotExpr: ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT From ba89dcbc983ed906ab0da958f2f3b714dc21b000 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 14:17:19 +0200 Subject: [PATCH 05/87] handle `UncheckedArray` in `gpuTypeToString` on CUDA backend --- constantine/math_compiler/experimental/backends/cuda.nim | 1 + 1 file changed, 1 insertion(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 227bde79..1077b92c 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -43,6 +43,7 @@ proc gpuTypeToString*(t: GpuTypeKind): string = of gtVoidPtr: "void*" of gtObject: "struct" of gtString: "const char*" + of gtUA: "" # `UncheckedArray` by itself is nothing in CUDA else: raiseAssert "Invalid type : " & $t From a2bbf982e5df16dc47578f97444b645ad673cae9 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 15:17:27 +0200 Subject: [PATCH 06/87] rename constant to set workgroup size --- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 26edc9c1..dc41cb12 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -825,5 +825,5 @@ proc 
codegen*(ctx: var GpuContext): string = for fnIdent, fn in ctx.fnTab: if fn.isGlobal(): ## XXX: make adjustable! - result.add "@compute @workgroup_size(NUM_WORKGROUPS)\n" + result.add "@compute @workgroup_size(WORKGROUP_SIZE)\n" result.add ctx.genWebGpu(fn) & "\n\n" From 1c5ff0c0925f660c2108e2c3e6a42986d77e9a88 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 15:19:51 +0200 Subject: [PATCH 07/87] minor cleanup --- .../math_compiler/experimental/backends/wgsl.nim | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index dc41cb12..a6ecf6d4 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -552,26 +552,17 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = case ast.kind of gpuVoid: return # nothing to emit of gpuProc: - - ## XXX: if a {.global.} / attGlobal proc, lift arguments - ## Store all arguments in the `GpuContext` - ## *AFTER* processing all of the code, generate header and place at beginning - ## Most difficult: - ## - track identifiers from {.global.} functions into arbitrary layers and remove - ## BUT, we can also have a full preprocessing pass. - let attrs = collect: for att in ast.pAttributes: $att - # Parameters var params: seq[string] for p in ast.pParams: params.add gpuTypeToString(p.typ, p.ident, allowEmptyIdent = false) var fnArgs = params.join(", ") if $attGlobal in attrs: doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!" - ## XXX: clean this up. Add the global id builtin + ## XXX: make this more flexible. 
In theory can be any name fnArgs = "@builtin(global_invocation_id) global_id: vec3" let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) From b98f06985ccfc7d8ce3b15b4b3a4ef0b81586579 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 18:49:15 +0200 Subject: [PATCH 08/87] correctly handle `gtUA` (UncheckedArray) on CUDA backend The `gtUA` type enum element is new and was not correctly handled yet on the CUDA backend. --- constantine/math_compiler/experimental/backends/cuda.nim | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 1077b92c..9ee253ca 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -58,6 +58,10 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, var skipIdent = false case t.kind of gtPtr: + var t = t # if `ptr UncheckedArray`, remove the `gtUA` layer. No meaning on CUDA + if t.to.kind == gtUA: + t.to = t.to.uaTo + if t.to.kind == gtArray: # ptr to array type # need to pass `*` for the pointer into the identifier, i.e. 
# `state: var array[4, BigInt]` From c0e1eb88fa111405e5e2b8ad26037acfa2201ab3 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 12 Aug 2025 18:02:45 +0200 Subject: [PATCH 09/87] fix access of type for left ident in assignment --- constantine/math_compiler/experimental/backends/wgsl.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index a6ecf6d4..865996ba 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -637,8 +637,8 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuConv: dfl() else: raiseAssert "Not implemented to determine mutability from node: " & $arg - let leftTyp = ast.aLeft.determineIdent().iTyp - if leftTyp.kind == gtPtr and leftTyp.to.kind == gtInt32: + let leftId = ast.aLeft.determineIdent() + if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32: # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and # we patched the `bool -> i32` and thus need to convert it. From 24afcfe883ae3f2dff11c9aebc0c370bcbd5c425 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 12 Aug 2025 18:03:17 +0200 Subject: [PATCH 10/87] rewrite compound assignment operators in all functions i.e. 
x += 5 becomes x = x + 5 etc for any prefix `foo=` in x foo= y we generate x = x foo y --- .../experimental/backends/wgsl.nim | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 865996ba..1fc86f0d 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -542,6 +542,31 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.injectAddressOf(fn) + proc rewriteCompoundAssignment(n: GpuAst): GpuAst = + doAssert n.kind == gpuBinOp + + template genAssign(left, rnode, op: typed): untyped = + let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) + GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) + + let op = n.bOp + if op.len >= 2 and op[^1] == '=': + result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last + else: + # leave untouched + result = n + + proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = + case n.kind + of gpuBinOp: n = rewriteCompoundAssignment(n) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. `x += 5` -> `x = x + 5` + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn) + + proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From 18d582f271675da3363b6fc18457ee52d28884cc Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 16:57:53 +0200 Subject: [PATCH 11/87] extend `GpuFieldInit` by a type field We need that type to determine information about what type we actually assign in a `gpuObjConstr`. 
--- constantine/math_compiler/experimental/gpu_types.nim | 9 ++++++++- constantine/math_compiler/experimental/nim_to_gpu.nim | 11 ++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 448a783e..569682ff 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -212,6 +212,7 @@ type GpuFieldInit* = object name*: string value*: GpuAst + typ*: GpuType ## XXX: UNUSED TemplateInfo* = object @@ -385,7 +386,13 @@ proc clone*(ast: GpuAst): GpuAst = result = GpuAst(kind: gpuObjConstr) result.ocName = ast.ocName for f in ast.ocFields: - result.ocFields.add(GpuFieldInit(name: f.name, value: f.value.clone())) + result.ocFields.add( + GpuFieldInit( + name: f.name, + value: f.value.clone(), + typ: f.typ.clone() + ) + ) of gpuInlineAsm: result = GpuAst(kind: gpuInlineAsm) result.stmt = ast.stmt diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 2e8a42d3..ce460e1a 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -658,16 +658,21 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = for i in 1 ..< node.len: # all fields to be init'd doAssert node[i].kind == nnkExprColonExpr ocFields.add GpuFieldInit(name: node[i][0].strVal, - value: ctx.toGpuAst(node[i][1])) + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + # now add fields in order of the type declaration for i in 0 ..< flds.len: let idx = findIdx(ocFields, flds[i].name) if idx >= 0: - result.ocFields.add ocFields[idx] + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f else: let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) result.ocFields.add GpuFieldInit(name: flds[i].name, - value: dfl) + value: 
dfl, + typ: flds[i].typ) of nnkAsmStmt: doAssert node.len == 2 From fbfeab6cc97afe52897aa2e4ac7f19e3d7156f11 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 16:59:45 +0200 Subject: [PATCH 12/87] make sure to update symbols in global functions too Otherwise we don't have up to date type / symbol kind information in globals. --- .../experimental/backends/wgsl.nim | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 1fc86f0d..78436f9b 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -498,6 +498,20 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = for ch in mitems(n): ctx.injectAddressOf(ch) +proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = + ## Update symbols in global functions to have same mutability and symbolkind as + ## parameters + case n.kind + of gpuIdent: + if n.iSym in ctx.globals: + n.symbolKind = gsGlobalKernelParam + if n.iTyp.kind == gtPtr: + let g = ctx.globals[n.iSym] + n.iTyp.mutable = g.typ.kind == gtPtr # arguments as pointers == mutable + else: + for ch in n: + ctx.updateSymsInGlobals(ch) + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -518,6 +532,9 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for p in fn.pParams: ctx.globals[p.ident.iSym] = p # copy all parameters over to globals fn.pParams.setLen(0) # delete function's parameters + # now update all appearances of the parameters, now globals, such that they reflect + # the correct symbol kind and mutability + ctx.updateSymsInGlobals(fn) else: discard From 1f04a82aee993af78ed66860d6044826d8783c61 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:00:49 +0200 Subject: [PATCH 13/87] remove local `determineIdent` from `genWebGpu` --- .../experimental/backends/wgsl.nim | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 78436f9b..66bda610 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -655,30 +655,6 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight), ctx.size(ast.aLeft)) else: - proc determineIdent(arg: GpuAst): GpuAst = - ## Tries to determine the underlying ident that is contained in this node. - ## The issue is the argument to a `gpuCall` can be a complicated expression. - ## Depending on the node it may be possible to extract a simple identifier, - ## e.g. for `addr(foo)` (`gpuAddr` of `gpuIdent` node) we can get the ident. - ## If this fails, we return a `gpuVoid` node. - ## - ## TODO: Think about if it ever makes sense to extract the ident underlying - ## e.g. `deref` and use _that_ to determine mutability & address space. 
- template dfl(): untyped = GpuAst(kind: gpuVoid) - case arg.kind - of gpuIdent: arg - of gpuAddr: arg.aOf.determineIdent() - of gpuDeref: arg.dOf.determineIdent() - of gpuCall: dfl() - of gpuIndex: arg.iArr.determineIdent() - of gpuDot: arg.dParent.determineIdent() - of gpuLit: dfl() - of gpuBinOp: dfl() - of gpuBlock: arg.statements[^1].determineIdent() - of gpuPrefix: dfl() - of gpuConv: dfl() - else: - raiseAssert "Not implemented to determine mutability from node: " & $arg let leftId = ast.aLeft.determineIdent() if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32: # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was From ebcff83187377238fce18fb57145a3dea0e56a0b Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:01:17 +0200 Subject: [PATCH 14/87] fix tree representation of GpuAst --- .../math_compiler/experimental/gpu_types.nim | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 569682ff..52b4b668 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -559,10 +559,10 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.vName, indent + 2) result.add pretty(n.vInit, indent + 2) if n.vAttributes.len > 0: - result.add id("Attributes") + result.add idd("Attributes") for attr in n.vAttributes: let indent = indent + 2 - result.add id(attr) + result.add idd(attr) of gpuAssign: result.add pretty(n.aLeft, indent + 2) result.add pretty(n.aRight, indent + 2) @@ -599,11 +599,13 @@ proc pretty*(n: GpuAst, indent: int = 0): string = let indent = indent + 2 result.add id(t.name) of gpuObjConstr: - result.add id("Ident", n.ocName) - result.add id("Fields") + result.add idd("Ident", n.ocName) + result.add idd("Fields") for f in n.ocFields: - let indent = 
indent + 2 - result.add id("Name", f.name) + var indent = indent + 2 + result.add idd("Field") + indent = indent + 2 + result.add idd("Name", f.name) result.add pretty(f.value, indent + 2) of gpuInlineAsm: result.add id(n.stmt) From 3a32d2b9a8e93b1e6de6d1bf0dda999d3bc250c0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:01:25 +0200 Subject: [PATCH 15/87] add `mpairs`, `pairs` iterator for `GpuAst` --- .../math_compiler/experimental/gpu_types.nim | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 52b4b668..0b6b560d 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -703,6 +703,19 @@ iterator mitems*(ast: var GpuAst): var GpuAst = iterator items*(ast: GpuAst): GpuAst = iterImpl(ast, mutable = false) +iterator mpairs*(ast: var GpuAst): (int, var GpuAst) = + ## Iterate over all child nodes of the given AST and the index + var i = 0 + for el in mitems(ast): + yield (i, el) + inc i + +iterator pairs*(ast: GpuAst): (int, GpuAst) = + var i = 0 + for el in items(ast): + yield (i, el) + inc i + ## General utility helpers From a6186ae5174f051f340c4c152f5cfc479ccb44a5 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:04:38 +0200 Subject: [PATCH 16/87] support default initialization in obj constr fields As structs only support 'constructible types' in their fields anyway, we can just default initialize all fields the user leaves out in an object constructor. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 66bda610..27036005 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -764,7 +764,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuObjConstr: result = ast.ocName & "(" for i, el in ast.ocFields: - result.add ctx.genWebGpu(el.value) + if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": + # use type to construct a default value + let typStr = gpuTypeToString(el.typ, allowEmptyIdent = true) + result.add typStr & "()" + else: + result.add ctx.genWebGpu(el.value) if i < ast.ocFields.len - 1: result.add ", " result.add ")" From f485deb1c32cd0097ecf1efb5eb6882cd4e78fea Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:06:11 +0200 Subject: [PATCH 17/87] =?UTF-8?q?correctly=20handle=20`var=20foo=20{.const?= =?UTF-8?q?ant.}`=20variables=20by=20=E2=87=92=20globals?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those are intended for runtime constants, i.e. further storage buffers in the context of WGSL. In CUDA we'd use `copyToSymbol` to copy the data to them before execution. 
--- .../experimental/backends/wgsl.nim | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 27036005..96c5fcba 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -512,6 +512,31 @@ proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.updateSymsInGlobals(ch) +proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = + ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to + ## the `globals` of the context. Such variables are *not* regular global constants, but rather + ## `storage` buffers, which are filled before the kernel is executed. + ## + ## XXX: Document current not ideal behavior that one needs to be careful to pass data into + ## `wgsl.fakeExecute` precisely in the order in which the `var foo {.constant.}` are defined + ## *AND* after all kernel parameters! + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + var i = 0 + while i < blk.len: + doAssert blk.kind == gpuBlock + let g = blk.statements[i] + if g.kind == gpuVar and atvConstant in g.vAttributes: + # remove this from `globalBlocks` and add to `globals` + doAssert g.vInit.kind == gpuVoid, "A variable annotated with `{.constant.}` must not have an initialization!" + # we construct a fake parameter from it + ## XXX: `storage` address space is probably what we want, but think more about it + let param = GpuParam(ident: g.vName, typ: g.vType, addressSpace: asStorage) + ctx.globals[param.ident.iSym] = param + blk.statements.delete(i) + # no need to increase `i` + else: + inc i + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -538,6 +563,12 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = else: discard + # 2.b filter out all `var foo {.constant.}: dtype` from the `globalBlocks` and add them to + # the `globals` + # `globalBlocks` has two entries: + # 0: variables + # 1: types + ctx.pullConstantPragmaVars(ctx.globalBlocks[0]) # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. If they have pointer arguments we # generate a generic instantiation for the exact pointer types used. From 08e43c5fd9b51304375b3c5220b28adca275b86c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:13:52 +0200 Subject: [PATCH 18/87] move rewriting of compound assignment out of `storagePass` --- .../experimental/backends/wgsl.nim | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 96c5fcba..7e41aa2a 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -498,6 +498,32 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = for ch in mitems(n): ctx.injectAddressOf(ch) +proc rewriteCompoundAssignment(n: GpuAst): GpuAst = + doAssert n.kind == gpuBinOp + if n.bOp in ["<=", "==", ">=", "!="]: return n + + template genAssign(left, rnode, op: typed): untyped = + let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) + GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) + + let op = n.bOp + if op.len >= 2 and op[^1] == '=': + result = genAssign(n.bLeft, n.bRight, op[0 .. 
^2]) # all but last + else: + # leave untouched + result = n + +proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = + case n.kind + of gpuBinOp: + n = rewriteCompoundAssignment(n) + for ch in mitems(n): # now go over children + ctx.makeCodeValid(ch, inGlobal) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = ## Update symbols in global functions to have same mutability and symbolkind as ## parameters @@ -589,33 +615,11 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = if fn.isGlobal(): # non global functions don't need to be mutated ctx.injectAddressOf(fn) - - proc rewriteCompoundAssignment(n: GpuAst): GpuAst = - doAssert n.kind == gpuBinOp - - template genAssign(left, rnode, op: typed): untyped = - let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) - GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) - - let op = n.bOp - if op.len >= 2 and op[^1] == '=': - result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last - else: - # leave untouched - result = n - - proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = - case n.kind - of gpuBinOp: n = rewriteCompoundAssignment(n) - else: - for ch in mitems(n): - ctx.makeCodeValid(ch) # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. 
`x += 5` -> `x = x + 5` for (fnIdent, fn) in mpairs(ctx.fnTab): - ctx.makeCodeValid(fn) + ctx.makeCodeValid(fn, inGlobal = fn.isGlobal()) -proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From 00c4b16d0c2dd5cb022fd6c6e272643bdfee008d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:18:38 +0200 Subject: [PATCH 19/87] implement lifting of struct pointer fields This gives us the convenience of passing around a struct with a pointer field, while preserving the restrictions of WebGPU. Because of the fact that storage buffers are global anyway, we can "pretend" the pointers are part of an object and simply replace them by the global when the fields are used. We throw a CT error if one tries to assign a non storage buffer pointer to a field. --- .../experimental/backends/wgsl.nim | 161 ++++++++++++++++++ .../math_compiler/experimental/gpu_types.nim | 4 + 2 files changed, 165 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 7e41aa2a..542e060f 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -388,6 +388,11 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp ## of that function (hence the name `scanGenerics`). The generic instance will be added ## to `fnTab` instead. The name of the generic will be derived based on the types ## of arguments with respect to mutability and address space. + ## + ## In addition this function records any time a `struct` is constructed in a `gpuObjConstr` + ## node and a pointer field assigned to it. As pointer fields are not valid in WGSL, we + ## record them here to replace them by their arguments passed to the constructor later. 
+ ## The pointers _must_ be pointers passed into a global kernel (i.e. `storage` address space). case n.kind of gpuCall: let fn = n.cName @@ -438,6 +443,18 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp # Harvest generics from arguments to this call! for arg in n.cArgs: ctx.scanGenerics(arg, callerParams) + of gpuObjConstr: + # If pointer argument of `storage`, strip out, if pointer type field + # otherwise, raise CT error + for f in n.ocFields: + if f.typ.kind == gtPtr: + doAssert f.value.kind in [gpuAddr, gpuIdent], "Constructing a pointer field " & + "from a more complex expression than an ident or an address-of operation " & + "is currently not supported." + let id = f.value.determineIdent() + doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " & + "variable (i.e. an argument to a global kernel) is not supported: " & $f + ctx.structsWithPtrs[(n.ocName, f.name)] = id else: for ch in n: ctx.scanGenerics(ch, callerParams) @@ -513,13 +530,129 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = # leave untouched result = n +proc getStructName(n: GpuAst): string = + ## Given an identifier `gpuIdent` (or `Deref` of one), return the name of the struct type + ## the ident is of or an empty string if it is not (pointing to) a struct. + doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind + var p = n + if p.kind == gpuDeref: + p = n.dOf + result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject: + p.iTyp.to.name + elif p.iTyp.kind == gtObject: + p.iTyp.name + else: "" + proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = + ## Addresses other AST patterns that need to be rewritten on WGSL. 
Aspects + ## that are rewritten include: + ## + ## - (`gpuBinOp`) rewriting compound assignment operators as regular assignments, `x += y` ↦ `x = x + y` + ## + ## - (`gpuDot`) replace field access of struct pointer fields by the pointers passed into the object + ## constructor (ref `scanGenerics`). `inGlobal` is used to decide what exactly we replace + ## it by. Inside of a global function the variables won't be pointers, hence we insert `&foo`. + ## In device functions, the globals will have been passed into the function as a parameter, + ## `ptr`. Thus, we replace by `foo`. + ## NOTE: We could consider to move this into `scanGenerics`, but for the moment I prefer to + ## do code transformations here and `scanGenerics` being only about data collection. + ## + ## - (`gpuAssign`) compile time errors, if a user tries to assign a pointer to a struct pointer field + ## outside the constructor. + ## + ## - (`gpuCall`) potentially update signatures of our custom generic functions. In `scanGenerics` if we + ## have a call like `foo(bar.ptrField)` we will determine the signature of `foo` to have + ## a `function` pointer, because `bar` will be a local struct instance. However, due to + ## our replacement rules and fact that *only* storage pointers may be assigned to constructors + ## the correct signature would be `storage` for the first argument after replacing `bar.ptrField` + ## by its value in the constructor. + ## + ## - (`gpuObjConstr`) delete arguments to object constructors, which assign pointer fields. + ## + ## - (`gpuVar`) update types of new variables based on the RHS. May have changed since Nim -> GpuAst, + ## due to `gpuDot` replacement further up. + ## + ## NOTE: A few cases already raise compile time errors _here_ and not in `checkCodeValid`, + ## as some transformations otherwise break the detection. 
case n.kind of gpuBinOp: n = rewriteCompoundAssignment(n) for ch in mitems(n): # now go over children ctx.makeCodeValid(ch, inGlobal) + of gpuObjConstr: # strip out arguments that are pointer types + let t = n.ocName + var i = 0 + while i < n.ocFields.len: + let f = n.ocFields[i] + if (t, f.name) in ctx.structsWithPtrs: + if f.typ.kind == gtPtr: + n.ocFields.delete(i) + else: + inc i + else: + inc i + of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. `foo.bar` -> `&res` + var p = n.dParent + let id = getStructName(p) + doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind + let field = n.dField.ident() + if id.len > 0 and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer + let v = ctx.structsWithPtrs[(id, field)] + ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions, + ## we will have passed the parameter + if inGlobal: + n = GpuAst(kind: gpuAddr, aOf: v) # overwrite with the address of value passed in to the object constructor + else: + n = v + of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field + if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]: + let dot = n.aLeft + let id = getStructName(dot.dParent) + if id.len > 0: + doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind + let field = dot.dField.ident() + if (id, field) in ctx.structsWithPtrs: + raiseAssert "Assignment of a struct (`" & id & "`) field of a pointer type is not supported. " & + "Assign pointer fields in the constructor only. 
In code: " & $ctx.genWebGpu(n) + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + of gpuCall: + # we might need to update the type of generics, if we did the replacement in `gpuDot`, because + # a struct ptr field will have had the wrong storage type + for ch in mitems(n): # first process children + ctx.makeCodeValid(ch, inGlobal) + # now check if any argument's type mismatches against the generic we recorded + let fnName = n.cName + if fnName in ctx.fnTab: # otherwise will not be generated by us, so irrelevant + # NOTE: theoretically, if we had struct pointer field replacements with symbols that had + # *different* address spaces, we'd need to split one generic into multiple again here. + # But that shouldn't be possible, because our entire replacement is currently only + # sane if we store a *storage pointer* in a struct. We would have raised in `scanGenerics` + # because of invalid pointer assignment in an object constructor. + let fn = ctx.fnTab[fnName] + let params = fn.pParams + for i, arg in n: # walk the parameters again and compare + let argId = arg.determineIdent() + if argId.kind != gpuVoid and argId.ident().len > 0: + var p = params[i] + ## XXX: update anything else? We mostly care about the address space here, because + ## the rest _should_ be the same anyway. + if p.addressSpace != argId.symbolKind.toAddressSpace(): + p.addressSpace = argId.symbolKind.toAddressSpace() + p.ident.symbolKind = argId.symbolKind + fn.pParams[i] = p # write back, not a ref type! + of gpuVar: + # first recurse on the `gpuVar` to get possible replacements + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + # update LHS with info from RHS by copying over its symbol kind. Different types are + # possible after replacements of `gpuDot` nodes above. 
+ if n.vType.kind == gtPtr: + let rightId = n.vInit.determineIdent() + n.vName.symbolKind = rightId.symbolKind + n.vType.mutable = rightId.iTyp.mutable + n.vName.iTyp.mutable = rightId.iTyp.mutable else: for ch in mitems(n): ctx.makeCodeValid(ch, inGlobal) @@ -563,6 +696,31 @@ proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = else: inc i +proc removeStructPointerFields(blk: var GpuAst) = + ## Filters out `ptr` fields from all structs. + ## + ## If a type is used with `storage` pointer arguments, we will later perform replacement of field + ## access to the pointer field by the value we assign. + ## + ## If the user assigns a local (`function`) pointer, we raise a CT error. We _could_ in theory support + ## replacement for local pointer types too, but it requires a more careful analysis of which + ## local to replace by and the name in other scopes. I.e. passing a local pointer to a constructor + ## which has a different name than in the calling scope would require us to traverse the AST up to + ## the calling scope. + ## + ## Given the extreme limitations on `let` variables with pointers anyway, I don't think there is much + ## purpose in supporting such features. + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + for typ in mitems(blk): + doAssert typ.kind == gpuTypeDef + var i = 0 + while i < typ.tFields.len: + let f = typ.tFields[i] + if f.typ.kind == gtPtr: # delete + typ.tFields.delete(i) + else: + inc i + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -595,6 +753,9 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 0: variables # 1: types ctx.pullConstantPragmaVars(ctx.globalBlocks[0]) + # 2.c remove all fields of structs, which have pointer type + removeStructPointerFields(ctx.globalBlocks[1]) + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. If they have pointer arguments we # generate a generic instantiation for the exact pointer types used. diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 0b6b560d..ff402991 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -246,6 +246,10 @@ type # ## when we finish, we pop. Before we pop, we assign the variable definitions to the `gpuBlock` # ## `locals` genSymCount*: int ## increases for every generated identifier (currently only underscore `_`), hence the basic solution + ## Maps a struct type and field name, which is of pointer type to the value the user assigns + ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` + ## constructor (WebGPU only). + structsWithPtrs*: Table[(string, string), GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters From 4046396c4f5d1b16067b33832a699de979cf10dd Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:20:18 +0200 Subject: [PATCH 20/87] add helper to check code for validity Currently only checks if we assign a pointer to a `var` variable (which is not allowed in WGSL). 
--- .../experimental/backends/wgsl.nim | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 542e060f..336d1797 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -671,6 +671,23 @@ proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.updateSymsInGlobals(ch) +proc checkCodeValid(ctx: var GpuContext, n: GpuAst) = + ## Checks if the code is valid according to WGSL spec. + ## So far handles: + ## - variables (`var`) to pointer types are not allowed + ## + ## Some code is already rejected in earlier passes, if a compiler pass would transform + ## the code in such a way as making a detection of illegal code invalid. + case n.kind + of gpuVar: + if n.vType.kind == gtPtr and n.vMutable: # `vMutable == var` -> not allowed to store pointers + let code = ctx.genWebGpu(n) + raiseAssert "The node: `" & $code & "` constructs a variable (`var`) to a pointer type. This " & + "is invalid in WGSL. Use `let`." + else: + for ch in n: + ctx.checkCodeValid(ch) + proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to ## the `globals` of the context. Such variables are *not* regular global constants, but rather @@ -780,6 +797,10 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for (fnIdent, fn) in mpairs(ctx.fnTab): ctx.makeCodeValid(fn, inGlobal = fn.isGlobal()) + # 6. 
finally raise error if we find anything that is not allowed in WGSL after our transformations + for (fnIdent, fn) in pairs(ctx.fnTab): + ctx.checkCodeValid(fn) + proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From f27b0a1aeecf6195907ae41997440db3de13e027 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:21:12 +0200 Subject: [PATCH 21/87] add support for full Nim generics in the context of GPU code This means we can finally write something like: ``` proc foo[N: static int](ar: array[N, BigInt], ...) ``` We generate generic instantiations for every instantiation the Nim compiler produces (plus potentially additional ones for every pointer type argument the user passes into such a function). --- .../experimental/backends/wgsl.nim | 4 + .../experimental/gpu_compiler.nim | 15 ++- .../math_compiler/experimental/gpu_types.nim | 14 +++ .../math_compiler/experimental/nim_to_gpu.nim | 115 +++++++++++------- 4 files changed, 101 insertions(+), 47 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 336d1797..728ab57c 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -751,6 +751,10 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.globalBlocks.add varBlock ctx.globalBlocks.add typBlock + # Now add the generics to the `allFnTab` + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # 2. 
Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table if (fn.isGlobal() and kernel.len > 0 and fn.pName.ident() == kernel) or diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 0c68e88e..d71c4298 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -94,16 +94,17 @@ proc malloc*(size: csize_t): pointer = discard proc free*(p: pointer) = discard proc syncthreads*() {.cudaName: "__syncthreads".} = discard -macro toGpuAst*(body: typed): GpuAst = +macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## WARNING: The following are *not* supported: ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between ## what is a field access and a function call. Hence we assume any `nnkDotExpr` ## is actually a field access! ## - most regular Nim features :) - echo body.treerepr - echo body.repr var ctx = GpuContext() - newLit(ctx.toGpuAst(body)) + let ast = ctx.toGpuAst(body) + let gen = toSeq(ctx.genericInsts.values) + let g = GpuGenericsInfo(data: gen) + newLit((g, ast)) macro cuda*(body: typed): string = ## WARNING: The following are *not* supported: @@ -119,13 +120,15 @@ macro cuda*(body: typed): string = let body = ctx.codegen(gpuAst) result = newLit(body) -proc codegen*(ast: GpuAst, kernel: string = ""): string = +proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = ## Generates the code based on the given AST (optionally at runtime) and restricts ## it to a single global kernel (WebGPU) if any given. 
- let ast = ast.clone() ## XXX: remove clone var ctx = GpuContext() + for fn in gen.data: # assign generics info to correct table + ctx.genericInsts[fn.pName] = fn result = ctx.codegen(ast, kernel) + when isMainModule: # Mini example let kernel = cuda: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index ff402991..87b1a22a 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -250,6 +250,20 @@ type ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` ## constructor (WebGPU only). structsWithPtrs*: Table[(string, string), GpuAst] + ## Set of all generic proc names we have encountered in Nim -> GpuAst. When + ## we see an `nnkCall` we check if we call a generic function. If so, look up + ## the instantiated generic, parse it and store in `genericInsts` below. + generics*: HashSet[string] + + ## Stores the unique identifiers (keys) and the implementations of the + ## precise generic instantiations that are called. + genericInsts*: OrderedTable[GpuAst, GpuAst] + + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we + ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` + ## causes trouble. + GpuGenericsInfo* = object + data*: seq[GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ce460e1a..fe814ffa 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -105,7 +105,9 @@ proc determineArrayLength(n: NimNode): int = of nnkIdent: let msg = """Found array with length given by identifier: $#! 
You might want to create a typed template taking a typed parameter for this -constant to force the Nim compiler to bind the symbol. +constant to force the Nim compiler to bind the symbol. In theory though this +error should not appear anymore though, as we don't try to parse generic +functions. """ % n[1].strVal raiseAssert msg else: @@ -248,7 +250,8 @@ proc requiresMemcpy(n: NimNode): bool = proc collectProcAttributes(n: NimNode): set[GpuAttribute] = doAssert n.kind == nnkPragma for pragma in n: - doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind == nnkCall: pragma[0] else: pragma case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal @@ -374,42 +377,55 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = ctx.toGpuAst(node[0]) of nnkProcDef, nnkFuncDef: - result = GpuAst(kind: gpuProc) - result.pName = ctx.toGpuAst(node.name) - result.pName.symbolKind = gsProc ## This is a procedure identifier - doAssert node[3].kind == nnkFormalParams - result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type - # Process pragmas - if node.pragma.kind != nnkEmpty: - doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
- result.pAttributes = collectProcAttributes(node.pragma) - if result.pAttributes.len == 0: # means `nimonly` was applied - return GpuAst(kind: gpuVoid) - # Process parameters - for i in 1 ..< node[3].len: - let param = node[3][i] - let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: - let typIdx = param.len - 2 # second to last is the type - # IdentDefs - # Ident "x" - # Ident "y" - # Ident "res" - # PtrTy - # Ident "float32" # `param.len - 2` - # Empty # `param.len - 1` - let paramType = nimToGpuType(param[typIdx]) - #echo "Argument: ", param.treerepr, " has tpye: ", paramType - for i in 0 ..< numParams: - var p = ctx.toGpuAst(param[i]) - let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam - else: gsDeviceKernelParam - p.iTyp = paramType ## Update the type of the symbol - p.symbolKind = symKind ## and the symbol kind - let param = GpuParam(ident: p, typ: paramType) - result.pParams.add(param) - - result.pBody = ctx.toGpuAst(node.body) - .ensureBlock() # single line procs should be a block to generate `;` + # if it is a _generic_ function, we don't actually process it here. instead we add it to + # the `generics` set. When we encounter a `gpuCall` we will then check if the function + # being called is part of the generic set and look up its _instantiated_ implementation + # to parse it. The parsed generics are stored in the `genericInsts` table. + let name = ctx.toGpuAst(node.name) + if node[2].kind == nnkGenericParams: # is a generic + ctx.generics.incl name.iName # need to use raw name, *not* symbol + result = GpuAst(kind: gpuVoid) + else: + result = GpuAst(kind: gpuProc) + result.pName = name + result.pName.symbolKind = gsProc ## This is a procedure identifier + doAssert node[3].kind == nnkFormalParams + result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type + # Process pragmas + if node.pragma.kind != nnkEmpty: + doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
+ result.pAttributes = collectProcAttributes(node.pragma) + if result.pAttributes.len == 0: # means `nimonly` was applied + return GpuAst(kind: gpuVoid) + # Process parameters + echo "Node: ", node.treerepr + if node[2].kind == nnkGenericParams: + echo node[2][0].getImpl().treerepr + echo node[2][0].treerepr + for i in 1 ..< node[3].len: + let param = node[3][i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = nimToGpuType(param[typIdx]) + #echo "Argument: ", param.treerepr, " has tpye: ", paramType + for i in 0 ..< numParams: + var p = ctx.toGpuAst(param[i]) + let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam + else: gsDeviceKernelParam + p.iTyp = paramType ## Update the type of the symbol + p.symbolKind = symKind ## and the symbol kind + let param = GpuParam(ident: p, typ: paramType) + result.pParams.add(param) + + result.pBody = ctx.toGpuAst(node.body) + .ensureBlock() # single line procs should be a block to generate `;` of nnkLetSection, nnkVarSection: # For a section with multiple declarations, create a block @@ -518,8 +534,22 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuVoid) of nnkCall, nnkCommand: - # Check if this is a template call + # `name` below is name + signature hash. Check if this is a generic based on node repr let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice + if node[0].repr in ctx.generics: # process the generic instantiaton and store + # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature + # Only the latter contains e.g. 
correct instantiation of static array sizes + let inst = node[0].getImpl() + let sig = node[0].getTypeInst() + inst.params = sig.params # copy over the parameters + let fn = ctx.toGpuAst(inst) + doAssert fn.pName.iSym == name.iSym, "Not matching" + # now overwrite the identifier's `iName` field by its `iSym` so that different + # generic insts have different + fn.pName.iName = fn.pName.iSym + name.iName = fn.pName.iSym + ctx.genericInsts[fn.pName] = fn + let args = node[1..^1].mapIt(ctx.toGpuAst(it)) # Producing a template call something like this (but problematic due to overloads etc) # we could then perform manual replacement of the template in the CUDA generation pass. @@ -571,7 +601,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkSym: let s = node.repr & "_" & node.signatureHash() # NOTE: The reason we have a tab of known symbols is not to keep the same _reference_ to each - # symbol, but rather to allow having the same symbol kind (set in the caller of this call). + # symbol, but rather to allow having the same symbol kind and appropriate type for each + # symbol `GpuAst` (of kind `gpuIdent`), which is set in the caller of this call. # For example in `nnkCall` nodes returning the value from the table automatically means the # `symbolKind` is local / function argument etc. if s notin ctx.sigTab: @@ -581,7 +612,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.iName == "_": result.iName = "tmp_" & $ctx.genSymCount inc ctx.genSymCount - #ctx.sigTab[s] = result + ctx.sigTab[s] = result else: result = ctx.sigTab[s] @@ -746,6 +777,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkConstDef result.statements.add ctx.toGpuAst(el) + of nnkWhenStmt: + raiseAssert "We shouldn't be seeing a `when` statement after sem check of the Nim code." 
else: echo "Unhandled node kind in toGpuAst: ", node.kind raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr From d3e0c456d03dd50f9424fd427185618c6bb39207 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Sun, 17 Aug 2025 18:06:34 +0200 Subject: [PATCH 22/87] implement support for type aliases So far only the code generation for WGSL is done, but the CUDA code gen is simple. --- .../math_compiler/experimental/backends/wgsl.nim | 6 +++++- .../math_compiler/experimental/gpu_types.nim | 12 ++++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 16 +++++++++++++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 728ab57c..c7ce0107 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -162,7 +162,7 @@ proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, ty ctx.farmTopLevel(ch, kernel, varBlock, typBlock) of gpuVar, gpuConstexpr: varBlock.statements.add ast - of gpuTypeDef: + of gpuTypeDef, gpuAlias: typBlock.statements.add ast else: discard @@ -729,6 +729,7 @@ proc removeStructPointerFields(blk: var GpuAst) = ## purpose on supporting such features. doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind for typ in mitems(blk): + if typ.kind == gpuAlias: continue # don't need to mutate aliases! 
doAssert typ.kind == gpuTypeDef var i = 0 while i < typ.tFields.len: @@ -982,6 +983,9 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" result.add "}" + of gpuAlias: + result = "alias " & ast.aName & " = " & ctx.genWebGpu(ast.aTo) + of gpuObjConstr: result = ast.ocName & "(" for i, el in ast.ocFields: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 87b1a22a..4adf8941 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -33,6 +33,7 @@ type gpuDot # Member access (a.b) gpuIndex # Array indexing (a[b]) gpuTypeDef # Type definition + gpuAlias # A type alias gpuObjConstr # Object (struct) constructor gpuInlineAsm # Inline assembly (PTX) gpuAddr # Address of an expression @@ -161,6 +162,10 @@ type of gpuTypeDef: tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols tFields*: seq[GpuTypeField] + of gpuAlias: + aName*: string ## Name of the type alias + aTo*: GpuAst ## Type the alias maps to + aDistinct*: bool ## If the alias is a distinct type in Nim. 
of gpuObjConstr: ocName*: string # type we construct ## XXX: it would be better if we already fill the fields with default values here @@ -400,6 +405,10 @@ proc clone*(ast: GpuAst): GpuAst = result.tName = ast.tName for f in ast.tFields: result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone())) + of gpuAlias: + result = GpuAst(kind: gpuAlias) + result.aName = ast.aName + result.aTo = ast.aTo.clone() of gpuObjConstr: result = GpuAst(kind: gpuObjConstr) result.ocName = ast.ocName @@ -616,6 +625,9 @@ proc pretty*(n: GpuAst, indent: int = 0): string = for t in n.tFields: let indent = indent + 2 result.add id(t.name) + of gpuAlias: + result.add id("Alias", n.aName) + result.add pretty(n.aTo, indent + 2) of gpuObjConstr: result.add idd("Ident", n.ocName) result.add idd("Fields") diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index fe814ffa..9c1df7af 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -164,7 +164,9 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. result = initGpuUAType(getInnerPointerType(n, allowToFail)) - of ntyObject: + of ntyObject, ntyAlias: + # for aliases, treat them identical to regular object types, but + # `getTypeName` returns the alias! 
let impl = n.getTypeImpl let flds = impl.parseTypeFields() let typName = getTypeName(n) # might be an object construction @@ -677,8 +679,16 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkTypeDef result.statements.add ctx.toGpuAst(el) of nnkTypeDef: - result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) - result.tFields = parseTypeFields(node[2]) + doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len + case node[2].kind + of nnkObjectTy: # regular `type foo = object` + result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) + result.tFields = parseTypeFields(node[2]) + of nnkSym: # a type alias `type foo = bar` + result = GpuAst(kind: gpuAlias, aName: node[0].strVal, + aTo: ctx.toGpuAst(node[2])) + else: + raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind of nnkObjConstr: let typName = getTypeName(node) result = GpuAst(kind: gpuObjConstr, ocName: typName) From 0d111035b7404ef001d2768a5637e0d3af969296 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Sun, 17 Aug 2025 18:07:02 +0200 Subject: [PATCH 23/87] allow "pulling in" procs from outside `cuda` scope The generic logic is essentially what we need to pull in a proc defined outside the scope of the `cuda` macro into the code. Essentially when we encounter a function that is not known to us yet, we simply look up its implementation from the symbol. This is the first step towards untying the current `cuda` macro code from relying on having _everything_ be defined under that macro. In the future the idea is that essentially only the `{.global.}` procs strictly need to be defined in the macro. These then pull in everything they need (i.e. everything that is used and has been checked by the compiler to be used). 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 9c1df7af..8316b411 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -429,6 +429,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` + # Add to table of known functions + if result.pName notin ctx.allFnTab: + ctx.allFnTab[result.pName] = result + of nnkLetSection, nnkVarSection: # For a section with multiple declarations, create a block result = GpuAst(kind: gpuBlock) @@ -538,7 +542,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkCall, nnkCommand: # `name` below is name + signature hash. Check if this is a generic based on node repr let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice - if node[0].repr in ctx.generics: # process the generic instantiaton and store + if node[0].repr in ctx.generics or name notin ctx.allFnTab: + # process the generic instantiaton and store *or* pull in a proc defined outside + # the `cuda` macro by its implementation. + ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`! # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature # Only the latter contains e.g. correct instantiation of static array sizes let inst = node[0].getImpl() From 2c3f8f62b23d5ac658c6f572cd13afe37b673794 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:22:02 +0200 Subject: [PATCH 24/87] add pretty printer for GpuType This is lossy so it is not the default representation. 
--- .../math_compiler/experimental/gpu_types.nim | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 4adf8941..b2f5293c 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -530,6 +530,31 @@ proc removePrefix(s, p: string): string = result = s result.removePrefix(p) +proc pretty*(t: GpuType): string = + ## returns a flat (but lossy) string representation of the type + if t == nil: + result = "GpuType(nil)" + else: + case t.kind + of gtPtr: + result = if t.implicit: "var " else: "ptr " + result.add pretty(t.to) + of gtUA: + result = "UncheckedArray[" & t.uaTo.pretty() & "]" + of gtObject: + result = t.name # just the name + of gtArray: + result = "array[" & $t.aLen & ", " & t.aTyp.pretty() & "]" + of gtGenericInst: + result = t.gName & "[" + for i, g in t.gArgs: + result.add pretty(g) + if i < t.gArgs.high: + result.add ", " + result.add "]" + else: + result = ($t.kind).removePrefix("gt") + proc pretty*(n: GpuAst, indent: int = 0): string = template id(): untyped = repeat(" ", indent) template idn(x): untyped = repeat(" ", indent) & $x From 0bf280739f555fe4d8a7a2a62db9f0f1ca8d3185 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:32:37 +0200 Subject: [PATCH 25/87] store all types in `ctx.types` table, support 'pulling' types... from outside the `cuda` scope. This is the second step towards making it so that the `cuda` macro only needs to contain the calling code (`{.global.}` proc in general). 
--- .../experimental/backends/cuda.nim | 2 +- .../experimental/backends/wgsl.nim | 37 +++++---- .../experimental/gpu_compiler.nim | 9 +- .../math_compiler/experimental/gpu_types.nim | 28 ++++--- .../math_compiler/experimental/nim_to_gpu.nim | 82 ++++++++++++++----- 5 files changed, 105 insertions(+), 53 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 9ee253ca..ae8307f6 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -256,7 +256,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genCuda(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & "{\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" result.add "}" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index c7ce0107..1c95ea5f 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -454,7 +454,7 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp let id = f.value.determineIdent() doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " & "variable (i.e. 
an argument to a global kernel) is not supported: " & $f - ctx.structsWithPtrs[(n.ocName, f.name)] = id + ctx.structsWithPtrs[(n.ocType, f.name)] = id else: for ch in n: ctx.scanGenerics(ch, callerParams) @@ -530,18 +530,18 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = # leave untouched result = n -proc getStructName(n: GpuAst): string = - ## Given an identifier `gpuIdent` (or `Deref` of one), return the name of the struct type - ## the ident is of or an empty string if it is not (pointing to) a struct. +proc getStructType(n: GpuAst): GpuType = + ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type + ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct. doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind var p = n if p.kind == gpuDeref: p = n.dOf result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject: - p.iTyp.to.name + p.iTyp.to elif p.iTyp.kind == gtObject: - p.iTyp.name - else: "" + p.iTyp + else: GpuType(kind: gtVoid) proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = @@ -581,7 +581,7 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = for ch in mitems(n): # now go over children ctx.makeCodeValid(ch, inGlobal) of gpuObjConstr: # strip out arguments that are pointer types - let t = n.ocName + let t = n.ocType var i = 0 while i < n.ocFields.len: let f = n.ocFields[i] @@ -594,10 +594,10 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = inc i of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. 
`foo.bar` -> `&res` var p = n.dParent - let id = getStructName(p) + let id = getStructType(p) doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind let field = n.dField.ident() - if id.len > 0 and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer + if id.kind != gtVoid and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer let v = ctx.structsWithPtrs[(id, field)] ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions, ## we will have passed the parameter @@ -608,12 +608,12 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]: let dot = n.aLeft - let id = getStructName(dot.dParent) - if id.len > 0: + let id = getStructType(dot.dParent) + if id.kind != gtVoid: doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind let field = dot.dField.ident() if (id, field) in ctx.structsWithPtrs: - raiseAssert "Assignment of a struct (`" & id & "`) field of a pointer type is not supported. " & + raiseAssert "Assignment of a struct (`" & pretty(id) & "`) field of a pointer type is not supported. " & "Assign pointer fields in the constructor only. In code: " & $ctx.genWebGpu(n) for ch in mitems(n): ctx.makeCodeValid(ch, inGlobal) @@ -751,10 +751,15 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.farmTopLevel(ast, kernel, varBlock, typBlock) ctx.globalBlocks.add varBlock ctx.globalBlocks.add typBlock + ## XXX: `typBlock` should now always be empty, as we pass all + ## found types into `ctx.types` # Now add the generics to the `allFnTab` for k, v in pairs(ctx.genericInsts): ctx.allFnTab[k] = v + # And all the known types + for k, typ in pairs(ctx.types): + ctx.globalBlocks.add typ # 2. 
Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table @@ -978,16 +983,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genWebGpu(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & " {\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" result.add "}" of gpuAlias: - result = "alias " & ast.aName & " = " & ctx.genWebGpu(ast.aTo) + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) of gpuObjConstr: - result = ast.ocName & "(" + result = gpuTypeToString(ast.ocType) & "(" for i, el in ast.ocFields: if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": # use type to construct a default value diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index d71c4298..0fd28760 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -102,8 +102,9 @@ macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## - most regular Nim features :) var ctx = GpuContext() let ast = ctx.toGpuAst(body) - let gen = toSeq(ctx.genericInsts.values) - let g = GpuGenericsInfo(data: gen) + let genProcs = toSeq(ctx.genericInsts.values) + let genTypes = toSeq(ctx.types.values) + let g = GpuGenericsInfo(procs: genProcs, types: genTypes) newLit((g, ast)) macro cuda*(body: typed): string = @@ -124,8 +125,10 @@ proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = ## Generates the code based on the given AST (optionally at runtime) and restricts ## it to a single global kernel (WebGPU) if any given. 
var ctx = GpuContext() - for fn in gen.data: # assign generics info to correct table + for fn in gen.procs: # assign generics info to correct table ctx.genericInsts[fn.pName] = fn + for typ in gen.types: # assign generics info to correct table + ctx.types[typ.tTyp] = typ result = ctx.codegen(ast, kernel) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index b2f5293c..9b3f4a70 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -160,14 +160,14 @@ type pOp*: string pVal*: GpuAst of gpuTypeDef: - tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols + tTyp*: GpuType ## the actual type. Used to generate the name tFields*: seq[GpuTypeField] of gpuAlias: - aName*: string ## Name of the type alias + aTyp*: GpuType ## Name of the type alias aTo*: GpuAst ## Type the alias maps to aDistinct*: bool ## If the alias is a distinct type in Nim. of gpuObjConstr: - ocName*: string # type we construct + ocType*: GpuType # type we construct ## XXX: it would be better if we already fill the fields with default values here ocFields*: seq[GpuFieldInit] # the fields we initialize of gpuInlineAsm: @@ -254,7 +254,7 @@ type ## Maps a struct type and field name, which is of pointer type to the value the user assigns ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` ## constructor (WebGPU only). - structsWithPtrs*: Table[(string, string), GpuAst] + structsWithPtrs*: Table[(GpuType, string), GpuAst] ## Set of all generic proc names we have encountered in Nim -> GpuAst. When ## we see an `nnkCall` we check if we call a generic function. If so, look up ## the instantiated generic, parse it and store in `genericInsts` below. @@ -264,11 +264,17 @@ type ## precise generic instantiations that are called. 
genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Table of all known types. Filled during Nim -> GpuAst. Includes generic + ## instantiations, but also all other types. + ## Key: the raw type. Value: a full `gpuTypeDef` + types*: OrderedTable[GpuType, GpuAst] + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` ## causes trouble. GpuGenericsInfo* = object - data*: seq[GpuAst] + procs*: seq[GpuAst] + types*: seq[GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters @@ -402,16 +408,16 @@ proc clone*(ast: GpuAst): GpuAst = result.iIndex = ast.iIndex.clone() of gpuTypeDef: result = GpuAst(kind: gpuTypeDef) - result.tName = ast.tName + result.tTyp = ast.tTyp.clone() for f in ast.tFields: result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone())) of gpuAlias: result = GpuAst(kind: gpuAlias) - result.aName = ast.aName + result.aTyp = ast.aTyp.clone() result.aTo = ast.aTo.clone() of gpuObjConstr: result = GpuAst(kind: gpuObjConstr) - result.ocName = ast.ocName + result.ocType = ast.ocType.clone() for f in ast.ocFields: result.ocFields.add( GpuFieldInit( @@ -645,16 +651,16 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add id("Op", n.pOp) result.add pretty(n.pVal, indent + 2) of gpuTypeDef: - result.add id("Type", n.tName) + result.add id("Type", pretty(n.tTyp)) result.add id("Fields") for t in n.tFields: let indent = indent + 2 result.add id(t.name) of gpuAlias: - result.add id("Alias", n.aName) + result.add id("Alias", pretty(n.aTyp)) result.add pretty(n.aTo, indent + 2) of gpuObjConstr: - result.add idd("Ident", n.ocName) + result.add idd("Ident", pretty(n.ocType)) result.add idd("Fields") for f in n.ocFields: var indent = indent + 2 diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim 
index 8316b411..62c43a6b 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -343,6 +343,30 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = # ctx.sigTab[sig] = result result.symbolKind = gsProc # make sure it's a proc +proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = + ## Looks up the implementation of the given function and stores it in our table + ## of generic instantiations. + ## + ## For any looked up procedure, we attach the `{.device.}` pragma. + ## + ## Mutates the `name` of the given function to match its generic name. + # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature + # Only the latter contains e.g. correct instantiation of static array sizes + let inst = node[0].getImpl() + let sig = node[0].getTypeInst() + inst.params = sig.params # copy over the parameters + let fn = ctx.toGpuAst(inst) + if fn.kind == gpuVoid: # should be an inbuilt proc, i.e. annotated with `{.builtin.}` + doAssert inst.isBuiltIn() + else: + fn.pAttributes.incl attDevice # make sure this is interpreted as a device function + doAssert fn.pName.iSym == name.iSym, "Not matching" + # now overwrite the identifier's `iName` field by its `iSym` so that different + # generic insts have different + fn.pName.iName = fn.pName.iSym + name.iName = fn.pName.iSym ## update the name of the called function + ctx.genericInsts[fn.pName] = fn + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. Maybe we will won't @@ -521,6 +545,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## NOTE: Currently we process templates, but we expect them to be already ## expanded by the Nim compiler. Thus we could in theory expand them manually ## but fortunately we don't need to. 
+ return GpuAst(kind: gpuVoid) let tName = node[0].strVal # Extract parameters @@ -546,18 +571,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # process the generic instantiaton and store *or* pull in a proc defined outside # the `cuda` macro by its implementation. ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`! - # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature - # Only the latter contains e.g. correct instantiation of static array sizes - let inst = node[0].getImpl() - let sig = node[0].getTypeInst() - inst.params = sig.params # copy over the parameters - let fn = ctx.toGpuAst(inst) - doAssert fn.pName.iSym == name.iSym, "Not matching" - # now overwrite the identifier's `iName` field by its `iSym` so that different - # generic insts have different - fn.pName.iName = fn.pName.iSym - name.iName = fn.pName.iSym - ctx.genericInsts[fn.pName] = fn + ctx.addProcToGenericInsts(node, name) let args = node[1..^1].mapIt(ctx.toGpuAst(it)) # Producing a template call something like this (but problematic due to overloads etc) @@ -687,18 +701,42 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.statements.add ctx.toGpuAst(el) of nnkTypeDef: doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len - case node[2].kind - of nnkObjectTy: # regular `type foo = object` - result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) - result.tFields = parseTypeFields(node[2]) - of nnkSym: # a type alias `type foo = bar` - result = GpuAst(kind: gpuAlias, aName: node[0].strVal, - aTo: ctx.toGpuAst(node[2])) + let name = ctx.toGpuAst(node[0]) + if node[1].kind == nnkGenericParams: # if this is a generic, only store existence of it + # will store the instantiatons in `nnkObjConstr` + result = GpuAst(kind: gpuVoid) else: - raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind + let typ = nimToGpuType(node[0]) + case node[2].kind + of nnkObjectTy: # regular 
`type foo = object` + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + result.tFields = parseTypeFields(node[2]) + of nnkSym: # a type alias `type foo = bar` + result = GpuAst(kind: gpuAlias, aTyp: typ, + aTo: ctx.toGpuAst(node[2])) + else: + raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind + + # include this the set of known types to not generate duplicates + ctx.types[typ] = result + # Reset the type we return to void. We now generate _all_ types from the + # `types`. + result = GpuAst(kind: gpuVoid) of nnkObjConstr: - let typName = getTypeName(node) - result = GpuAst(kind: gpuObjConstr, ocName: typName) + ## this should never see `genericParam` I think + let typ = nimToGpuType(node) + if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type + # store the type instantiation + let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: typDef.tFields = typ.oFields + of gtGenericInst: typDef.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
+ + ctx.types[typ] = typDef + + result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type let flds = node[0].getTypeImpl.parseTypeFields() # sym # find all fields that have been defined by the user From d1a73906d16e3aaffc721d8c9b7b4438ef6428f1 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:33:45 +0200 Subject: [PATCH 26/87] support generic instantiations, producing unique WGSL types --- .../experimental/backends/wgsl.nim | 8 +++++ .../math_compiler/experimental/gpu_types.nim | 36 ++++++++++++++++--- .../math_compiler/experimental/nim_to_gpu.nim | 29 +++++++++++++-- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 1c95ea5f..9180b055 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -112,6 +112,14 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = else: result = &"{identPrefix}array<{typ}, {t.aLen}>" skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. 
+ result = t.gName & "_" + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add "_" of gtObject: result = t.name of gtUA: result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">" else: result = gpuTypeToString(t.kind) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 9b3f4a70..fe893028 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -46,12 +46,13 @@ type GpuTypeKind* = enum gtVoid, gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics - gtArray, # Static array `array[N, dtype]` -> `dtype[N]` + gtArray, # Static array `array[N, dtype]` -> `dtype[N]` gtString, - gtObject, # Struct types - gtPtr, # Pointer type, carries inner type - gtUA, # UncheckedArray (UA) mapped to runtime sized arrays - gtVoidPtr # Opaque void pointer + gtObject, # Struct types + gtPtr, # Pointer type, carries inner type + gtUA, # UncheckedArray (UA) mapped to runtime sized arrays + gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!) + gtVoidPtr # Opaque void pointer GpuTypeField* = object name*: string @@ -75,6 +76,10 @@ type aLen*: int # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time # On both CUDA and WebGPU a length of `0` is also used to generate `int foo[]` (CUDA) # `array` (WebGPU) (runtime sized arrays), which are generated from `ptr UncheckedArray[float32]` for example. + of gtGenericInst: + gName*: string # name of the generic type + gArgs*: seq[GpuType] # list of the instantiated generic arguments e.g. 
`vec3` on WGSL backend + gFields*: seq[GpuTypeField] # same as `oFields` for `gtObject` else: discard GpuAttribute* = enum @@ -304,6 +309,12 @@ proc clone*(typ: GpuType): GpuType = of gtArray: result.aTyp = typ.aTyp.clone() result.aLen = typ.aLen + of gtGenericInst: + result.gName = typ.gName + for g in typ.gArgs: + result.gArgs.add g.clone() + for f in typ.gFields: + result.gFields.add GpuTypeField(name: f.name, typ: f.typ.clone()) else: discard proc clone*(ast: GpuAst): GpuAst = @@ -463,6 +474,12 @@ proc hash*(t: GpuType): Hash = of gtArray: h = h !& hash(t.aTyp) h = h !& hash(t.aLen) + of gtGenericInst: + h = h !& hash(t.gName) + for g in t.gArgs: + h = h !& hash(g) + for f in t.gFields: + h = h !& hash(f) else: discard result = !$ h @@ -491,6 +508,15 @@ proc `==`*(a, b: GpuType): bool = else: for i in 0 ..< a.oFields.len: result = result and (a.oFields[i] == b.oFields[i]) + of gtGenericInst: + result = a.gName == b.gName + if a.gArgs.len != b.gArgs.len: result = false + elif a.gFields.len != b.gFields.len: result = false + else: + for i in 0 ..< a.gArgs.len: + result = result and (a.gArgs[i] == b.gArgs[i]) + for i in 0 ..< a.gFields.len: + result = result and (a.gFields[i] == b.gFields[i]) of gtArray: result = a.aTyp == b.aTyp and a.aLen == b.aLen else: discard diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 62c43a6b..4b7dd9a8 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -60,6 +60,25 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = else: raiseAssert "Not supported yet: " & $t +proc parseTypeFields(node: NimNode): seq[GpuTypeField] +proc initGpuGenericInst(t: NimNode): GpuType = + doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind + case t.kind + of nnkBracketExpr: # regular generic instantiation + result = GpuType(kind: 
gtGenericInst, gName: t[0].repr) + for i in 1 ..< t.len: # grab all generic arguments + let typ = nimToGpuType(t[i]) + result.gArgs.add typ + # now parse the object fields + let impl = t.getTypeImpl() # impl for the `gFields` + result.gFields = parseTypeFields(impl) + of nnkObjConstr: + doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr + result = initGpuGenericInst(t[0]) + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr + echo "Got generic inst: ", result + proc unpackGenericInst(t: NimNode): NimNode = let tKind = t.typeKind if tKind == ntyGenericInst: @@ -131,7 +150,6 @@ proc getTypeName(n: NimNode): string = result = n[0].strVal # type is the first node else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr -proc parseTypeFields(node: NimNode): seq[GpuTypeField] proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Maps a Nim type to a type on the GPU ## @@ -197,7 +215,14 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = result = initGpuType(gtVoid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: - result = n.unpackGenericInst().nimToGpuType(allowToFail) + result = initGpuGenericInst(n) + #result = n.unpackGenericInst().nimToGpuType(allowToFail) + of ntyTypeDesc: + # `getType` returns a `BracketExpr` of eg: + # BracketExpr + # Sym "typeDesc" + # Sym "float32" + result = n.getType[1].nimToGpuType(allowToFail) # for a type desc we need to recurse using the type of it else: if allowToFail: result = GpuType(kind: gtVoid) From 3482a45a4328556f763eb1bd4da0d24c056a7cba Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:34:17 +0200 Subject: [PATCH 27/87] add `{.builtin.}` pragma intended to specify builtin procs in CUDA/WGSL This is fundamentally doing the same as `{.nimonly.}`, but for this purpose the different name makes the intent clearer. 
--- .../experimental/gpu_compiler.nim | 28 +++++++++++-------- .../math_compiler/experimental/gpu_types.nim | 1 + .../math_compiler/experimental/nim_to_gpu.nim | 19 +++++++++++-- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 0fd28760..08378417 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -23,6 +23,11 @@ template global*() {.pragma.} template device*() {.pragma.} template forceinline*() {.pragma.} +## If attached to a function, type or variable it will refer to a built in +## in the target backend. This is used for all the functions, types and variables +## defined below to indicate that we do not intend to generate code for them. +template builtin*() {.pragma.} + # If attached to a `var` it will be treated as a # `__constant__`! Only useful if you want to define a # constant without initializing it (and then use @@ -54,25 +59,24 @@ type y*: DimWgsl z*: DimWgsl - ## These are dummy elements to make CUDA block / thread index / dim ## access possible in the *typed* `cuda` macro. It cannot be `const`, ## because then the typed code would evaluate the values before we ## can work with it from the typed macro. -let blockIdx* = NvBlockIdx() -let blockDim* = NvBlockDim() -let gridDim* = NvGridDim() -let threadIdx* = NvThreadIdx() +let blockIdx* {.builtin.} = NvBlockIdx() +let blockDim* {.builtin.} = NvBlockDim() +let gridDim* {.builtin.} = NvGridDim() +let threadIdx* {.builtin.} = NvThreadIdx() ## WebGPU specific -let global_id* = WgslGridDim() +let global_id* {.builtin.} = WgslGridDim() ## Similar for procs. They don't need any implementation, as they won't ever be actually called. 
-proc printf*(fmt: string) {.varargs.} = discard -proc memcpy*(dst, src: pointer, size: int) = discard +proc printf*(fmt: string) {.varargs, builtin.} = discard +proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard ## WebGPU select -proc select*[T](f, t: T, cond: bool): T = +proc select*[T](f, t: T, cond: bool): T {.builtin.} = # Implementation to run WebGPU code on CPU if cond: t else: f @@ -90,9 +94,9 @@ template private*(): untyped {.pragma.} ## While you can use `malloc` on device with small sizes, it is usually not ## recommended to do so. -proc malloc*(size: csize_t): pointer = discard -proc free*(p: pointer) = discard -proc syncthreads*() {.cudaName: "__syncthreads".} = discard +proc malloc*(size: csize_t): pointer {.builtin.} = discard +proc free*(p: pointer) {.builtin.} = discard +proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## WARNING: The following are *not* supported: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index fe893028..3cd47a4c 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -59,6 +59,7 @@ type typ*: GpuType GpuType* = ref object + builtin*: bool ## Whether the type refers to a builtin type or not case kind*: GpuTypeKind of gtPtr: to*: GpuType # `ptr T` points to `to` diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 4b7dd9a8..0d8ab8ca 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -274,18 +274,31 @@ proc requiresMemcpy(n: NimNode): bool = ## At the moment we only emit a `memcpy` statement for array types result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy +proc isBuiltIn(n: NimNode): bool = + ## Checks if the 
given proc is a `{.builtin.}` (or if it is a Nim "built in" + ## proc that uses `importc`, as we cannot emit those; they _need_ to have a + ## WGSL / CUDA equivalent built in) + doAssert n.kind in [nnkProcDef, nnkFuncDef], "Argument is not a proc: " & $n.treerepr + for pragma in n.pragma: + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma + if pragma.strVal in ["builtin", "importc"]: + return true + proc collectProcAttributes(n: NimNode): set[GpuAttribute] = doAssert n.kind == nnkPragma for pragma in n: - doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall], "Unexpected node kind: " & $pragma.treerepr - let pragma = if pragma.kind == nnkCall: pragma[0] else: pragma + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal of "forceinline": result.incl attForceInline - of "nimonly": + of "nimonly", "builtin": # used to fully ignore functions! return + of "importc": # encountered if we analyze a proc from outside `cuda` scope + return # this _should_ be a builtin function that has a counterpart in Nim, e.g. 
`math.ceil` else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From cd20189ea82572cb7c6782f27840953280844163 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 19:20:39 +0200 Subject: [PATCH 28/87] fix context version for `cuModuleGetGlobal` --- constantine/platforms/abis/nvidia_abi.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim index 8e22f6ef..cfc8d2c9 100644 --- a/constantine/platforms/abis/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -855,11 +855,11 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi {.pop.} proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult {.v2.} +proc cuCtxDestroy*(ctx: CUcontext): CUresult {.v2.} proc cuCtxSynchronize*(ctx: CUcontext): CUresult {.v2.} {.push noconv, importc, dynlib: libCuda.} -proc cuCtxDestroy*(ctx: CUcontext): CUresult proc cuCtxSynchronize*(): CUresult proc cuCtxGetCurrent*(ctx: var CUcontext): CUresult proc cuCtxSetCurrent*(ctx: CUcontext): CUresult @@ -875,7 +875,6 @@ proc cuModuleUnload*(module: CUmodule): CUresult proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.} proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult -proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult proc cuLaunchKernel*( kernel: CUfunction, @@ -889,6 +888,8 @@ proc cuLaunchKernel*( {.pop.} # {.push noconv, importc, dynlib: "libcuda.so"..} +proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult {.v2.} + proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult {.v2.} proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: 
Flag[CUmemAttach_flags]): CUresult {.v1.} proc cuMemFree*(devptr: CUdeviceptr): CUresult {.v2.} From dff79875eeb4e8456ac43aa065961286bc248cd0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:19:19 +0200 Subject: [PATCH 29/87] allow to compile with `-d:debugCuda` to compile in debug mode --- .../experimental/runtime_compile.nim | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim index dcfde9c1..95928e2b 100644 --- a/constantine/math_compiler/experimental/runtime_compile.nim +++ b/constantine/math_compiler/experimental/runtime_compile.nim @@ -86,15 +86,17 @@ proc log*(nvrtc: var NVRTC) = proc compile*(nvrtc: var NVRTC) = # Compile the program with fmad disabled. # Note: Can specify GPU target architecture explicitly with '-arch' flag. - const - Options = [ - cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is - # "--fmad=false", # and whatever other options for example - ] - - NumberOfOptions = cint Options.len - let compileResult = nvrtcCompileProgram(nvrtc.prog, NumberOfOptions, - cast[cstringArray](addr Options[0])) + var options = @[ + cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is + # "--fmad=false", # and whatever other options for example + ] + when defined(debugCuda): + options.add cstring "--device-debug" # Equivalent to -g + options.add cstring "--generate-line-info" # Equivalent to -lineinfo + + let numberOfOptions = cint options.len + let compileResult = nvrtcCompileProgram(nvrtc.prog, numberOfOptions, + cast[cstringArray](addr options[0])) nvrtc.log() ## XXX: only in `DebugCuda`? 
From b834dc9a3cf11b0665c270f1fca0baa0034a2684 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:44:54 +0200 Subject: [PATCH 30/87] ignore `varargs` pragma in procs --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 0d8ab8ca..812f7591 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -299,6 +299,8 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = return of "importc": # encountered if we analyze a proc from outside `cuda` scope return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil` + of "varargs": # attached to some builtins, e.g. `printf` on CUDA backend + continue else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From 9fb4900005af4ea530ed73cad3c699f6304c8497 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:45:25 +0200 Subject: [PATCH 31/87] make sure statically sized arrays are copied As they are passed by pointer in C/C++/CUDA, e.g. a `BigInt state[2]` decays to `BigInt *state` in a function argument behind the scenes. --- constantine/math_compiler/experimental/cuda_execute_dsl.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim index 8f1aa879..f75c899f 100644 --- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -41,13 +41,15 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = case n.typeKind of ntyBool, ntyChar, ntyInt .. 
ntyUint64: # range includes all floats result = false - of ntyObject, ntyArray: + of ntyObject: if passStructByPointer: result = false # regular objects can just be copied! else: result = true # struct passing by pointer forbidden ## NOTE: strictly speaking this is not the case of course! If the object ## contains refs, it won't hold! + of ntyArray: # statically sized arrays are passed by pointer in CUDA / C++ / C! + result = true of ntyGenericInst: if passStructByPointer: let impl = n.getTypeImpl() From cbbc41554a9ac668956d5d58fd6afd0661c336f9 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:46:57 +0200 Subject: [PATCH 32/87] make sure to catch `CUdeviceptr` arguments After our recent Nvidia API wrapper changes, it is now an alias and not a distinct type anymore. --- constantine/math_compiler/experimental/cuda_execute_dsl.nim | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim index f75c899f..ae5732ad 100644 --- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -62,6 +62,12 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = result = false else: result = true + of ntyAlias: + let impl = n.getTypeInst() + if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr": + result = false + else: + result = true else: result = true From d44da46940939a118aedfc102f1e876adc932e7a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:27:23 +0200 Subject: [PATCH 33/87] change CUDA codegen to similar style as WGSL This allows us to make use of the Nim generic instantiation and regular procs we 'pick up' as well as types. 
In principle it also allow us to do dead code elimination by only generating what's actually called in the global functions (or single kernel, if one uses `toGpuAst` first and then manually calls `codegen`). --- .../experimental/backends/backends.nim | 3 +- .../experimental/backends/common_utils.nim | 31 ++++++- .../experimental/backends/cuda.nim | 80 +++++++++++++++++-- .../experimental/backends/wgsl.nim | 28 ------- .../math_compiler/experimental/gpu_types.nim | 1 + 5 files changed, 107 insertions(+), 36 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index f2b71152..c7b45740 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -34,7 +34,8 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: - result = ctx.genCuda(ast) + ctx.preprocess(ast, kernel) + result = cuda.codegen(ctx) of bkWGSL: ctx.storagePass(ast, kernel) result = wgsl.codegen(ctx) diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim index 1def1b1f..01ee5d33 100644 --- a/constantine/math_compiler/experimental/backends/common_utils.nim +++ b/constantine/math_compiler/experimental/backends/common_utils.nim @@ -6,9 +6,36 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
+import std / tables import ../gpu_types -# import ./backends proc address*(a: string): string = "&" & a - proc size*(a: string): string = "sizeof(" & a & ")" + +proc isGlobal*(fn: GpuAst): bool = + doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind + result = attGlobal in fn.pAttributes + +proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = + ## Farms the top level of the code for functions, variable and type definition. + ## All functions are added to the `allFnTab`, while only global ones (or even only + ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining + ## logic. + ## Variables and types are collected in `varBlock` and `typBlock`. + case ast.kind + of gpuProc: + ctx.allFnTab[ast.pName] = ast + if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + elif kernel.len == 0 and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + of gpuBlock: + # could be a type definition or global variable + for ch in ast: + ctx.farmTopLevel(ch, kernel, varBlock, typBlock) + of gpuVar, gpuConstexpr: + varBlock.statements.add ast + of gpuTypeDef, gpuAlias: + typBlock.statements.add ast + else: + discard diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index ae8307f6..04d77449 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import std / [macros, strformat, strutils, sugar, sequtils] +import std / [macros, strformat, strutils, sugar, sequtils, tables, algorithm] import ../gpu_types import ./common_utils @@ -119,11 +119,62 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc genMemcpy(lhs, rhs, size: string): string = result = &"memcpy({lhs}, {rhs}, {size})" +proc scanFunctions(ctx: var GpuContext, n: GpuAst) = + ## Iterates over the given function and checks for all `gpuCall` nodes. Any function + ## called in the scope is added to `fnTab`. This is a form of dead code elimination. + case n.kind + of gpuCall: + let fn = n.cName + if fn in ctx.allFnTab: + # Check if any of the parameters are pointers (otherwise non generic) + if fn notin ctx.fnTab: # function not known, add to `fnTab` (i.e. avoid code elimination) + let fnCalled = ctx.allFnTab[fn] + ctx.fnTab[fn] = fnCalled + # still "scan for functions", i.e. fill `fnTab` from inner calls + for ch in fnCalled: + ctx.scanFunctions(ch) + # else we don't do anything for this function + # Harvest functions from arguments to this call! + for arg in n.cArgs: + ctx.scanFunctions(arg) + else: + for ch in n: + ctx.scanFunctions(ch) proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = + + # 1. Add all data from `genericInsts` and `types` tables + # In CUDA the types have to be before any possible global variables using + # them! + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # And all the known types + for k, typ in pairs(ctx.types): + ctx.globalBlocks.add typ + + # 2. 
Fill table with all *global* functions or *only* the specific `kernel` + # if any given + var varBlock = GpuAst(kind: gpuBlock) + var typBlock = GpuAst(kind: gpuBlock) + ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.globalBlocks.add varBlock + ctx.globalBlocks.add typBlock + ## XXX: `typBlock` should now always be empty, as we pass all + ## found types into `ctx.types` + + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect + # the functions called and record them in `fnTab`. + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: # everything in `fnTab` at this point is a global function + # Get the original arguments (before lifting them) of this function. Needed in scan + # to check if `gpuCall` argument is a parameter. + let fnOrig = ctx.allFnTab[fnIdent] + ctx.scanFunctions(fn) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ## The actual CUDA code generator. let indentStr = " ".repeat(indent) @@ -143,10 +194,13 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # extern "C" is needed to avoid name mangling result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & - fnSig & "{\n" - - result &= ctx.genCuda(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + fnSig + if ast.forwardDeclare: + result.add ";" + else: + result.add "{\n" + result &= ctx.genCuda(ast.pBody, indent + 1) + result &= "\n" & indentStr & "}" of gpuBlock: result = "" @@ -296,3 +350,19 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = echo "Unhandled node kind in genCuda: ", ast.kind raiseAssert "Unhandled node kind in genCuda: " & ast.repr result = "" + +proc codegen*(ctx: var GpuContext): string = + ## Generate the actual code for all pieces of the puzzle + # 1. generate code for the global blocks (types, global vars etc) + for blk in ctx.globalBlocks: + result.add ctx.genCuda(blk) & ";\n\n" + + # 2. 
generate all regular functions + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: + let fnC = fn.clone() + fnC.forwardDeclare = true + result.add ctx.genCuda(fnC) & "\n" + + for fnIdent, fn in ctx.fnTab: + result.add ctx.genCuda(fn) & "\n\n" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 9180b055..20dc01e0 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -147,34 +147,6 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = if typ.len > 0: result.add &" -> {typ}" -proc isGlobal(fn: GpuAst): bool = - doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind - result = attGlobal in fn.pAttributes - -proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = - ## Farms the top level of the code for functions, variable and type definition. - ## All functions are added to the `allFnTab`, while only global ones (or even only - ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining - ## logic. - ## Variables and types are collected in `varBlock` and `typBlock`. - case ast.kind - of gpuProc: - ctx.allFnTab[ast.pName] = ast - if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - elif kernel.len == 0 and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - of gpuBlock: - # could be a type definition or global variable - for ch in ast: - ctx.farmTopLevel(ch, kernel, varBlock, typBlock) - of gpuVar, gpuConstexpr: - varBlock.statements.add ast - of gpuTypeDef, gpuAlias: - typBlock.statements.add ast - else: - discard - proc patchType(t: GpuType): GpuType = ## Applies patches needed for WGSL support. E.g. `bool` cannot be a storage variable. 
result = t diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 3cd47a4c..9b4d9293 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -104,6 +104,7 @@ type pParams*: seq[GpuParam] pBody*: GpuAst pAttributes*: set[GpuAttribute] # order not important, hence set + forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration of gpuCall: cName*: GpuAst ## Will be a `GpuIdent` cArgs*: seq[GpuAst] From 5e383fd2858cb18352b4f238d56b08f30d8cda45 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:30:15 +0200 Subject: [PATCH 34/87] improve error message for `Dot` node if not ident/deref in WGSL --- constantine/math_compiler/experimental/backends/wgsl.nim | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 20dc01e0..19582a00 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -513,7 +513,8 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = proc getStructType(n: GpuAst): GpuType = ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct. 
- doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind + doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & + $n.kind & " for node: " & $n var p = n if p.kind == gpuDeref: p = n.dOf From 7eaa65f45c4e16b423cf0ac357adc983ba3aec7c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:30:52 +0200 Subject: [PATCH 35/87] always also inject `num_workgroups` argument into WGSL globals When working with workgroups dispatched in an e.g. 2D grid, `passEncoder.dispatchWorkgroups(N, M)` one needs the number of workgroups to compute the unique thread ID. --- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- constantine/math_compiler/experimental/gpu_compiler.nim | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 19582a00..2f548ad5 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -813,7 +813,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = if $attGlobal in attrs: doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!" ## XXX: make this more flexible. 
In theory can be any name - fnArgs = "@builtin(global_invocation_id) global_id: vec3" + fnArgs = "@builtin(global_invocation_id) global_id: vec3, @builtin(num_workgroups) num_workgroups: vec3" let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) result = indentStr & "fn " & fnSig & " {\n" diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 08378417..51207212 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -70,6 +70,7 @@ let threadIdx* {.builtin.} = NvThreadIdx() ## WebGPU specific let global_id* {.builtin.} = WgslGridDim() +let num_workgroups* {.builtin.} = WgslGridDim() ## Similar for procs. They don't need any implementation, as they won't ever be actually called. proc printf*(fmt: string) {.varargs, builtin.} = discard From 151e0250369c6d9078fc51a03a538caaab76fc77 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 10:38:59 +0200 Subject: [PATCH 36/87] rename WGSL `storagePass` to `preprocess` --- constantine/math_compiler/experimental/backends/backends.nim | 4 ++-- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index c7b45740..76d63a72 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -34,8 +34,8 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: - ctx.preprocess(ast, kernel) + cuda.preprocess(ctx, ast, kernel) result = cuda.codegen(ctx) of bkWGSL: - ctx.storagePass(ast, kernel) + wgsl.preprocess(ctx, ast, kernel) result = wgsl.codegen(ctx) diff --git 
a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2f548ad5..9043e3a0 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -720,7 +720,7 @@ proc removeStructPointerFields(blk: var GpuAst) = else: inc i -proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. ## This is useful if your GPU code contains multiple kernels with differing ## parameters to avoid having to fill dummy buffers for all the unused parameters From 987482956b09ebaa55d75e37b9770e45b96a8251 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 10:39:16 +0200 Subject: [PATCH 37/87] remove `gpuTypeToString`, `genFunctionType` from `backends` In the end we never used those variants anyway, as the code is fully separate in their respective submodules now. 
--- .../experimental/backends/backends.nim | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index 76d63a72..95fbda87 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -14,23 +14,6 @@ when defined(cuda): else: const Backend* = bkWGSL -proc gpuTypeToString*(t: GpuTypeKind): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t) - of bkWGSL: wgsl.gpuTypeToString(t) - -proc gpuTypeToString*(t: GpuType, ident = newGpuIdent(), allowArrayToPtr = false, - allowEmptyIdent = false, - ): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t, ident.ident(), allowArrayToPtr, allowEmptyIdent) - of bkWGSL: wgsl.gpuTypeToString(t, ident, allowArrayToPtr, allowEmptyIdent) - -proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = - case Backend - of bkCuda: cuda.genFunctionType(typ, fn, fnArgs) - of bkWGSL: wgsl.genFunctionType(typ, fn, fnArgs) - proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: From c617379d8cb69ccbddce21c4aff2bfa4af95c06c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 10:57:54 +0200 Subject: [PATCH 38/87] [CUDA] fix indentation for variable declarations --- constantine/math_compiler/experimental/backends/cuda.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 04d77449..c117471f 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -216,7 +216,9 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" of gpuVar: - result = indentStr & 
ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName.ident()) + let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & " " + else: "" + result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident()) # If there is an initialization, the type might require a memcpy if ast.vInit.kind != gpuVoid and not ast.vRequiresMemcpy: result &= " = " & ctx.genCuda(ast.vInit) From 25c5b13f962941eda9d9b8f7821a0abb795f7779 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:10:34 +0200 Subject: [PATCH 39/87] improve type deduction for types that look generic I.e. an array type with a constant in place for the array size (i.e. *not* a `static int` generic argument) will look like a generic in the sense that depending on how you look at the type with `getType*`, you will end up seeing `Ident "Foo"` for the constant. Therefore, we allow array length determination to fail and return `gtInvalid`. If we see that happens for a variable or parameter, we try to determine the type from the symbol of the parameter or variable. For some reason that tends to have the fully instantiated type. However, for *return types* this is not possible, as there is no easy symbol to look at (unless one were to go into the proc body and look at symbol that is being returned). Therefore, if we encounter this type of array as a return type (NOTE: most backends won't anyway allow to return arrays, but that's beside the point), we treat the function as a generic and look up its type when we encounter it in a `gpuCall`. Then, the type will be fully instantiated again. 
--- .../math_compiler/experimental/gpu_types.nim | 3 + .../math_compiler/experimental/nim_to_gpu.nim | 111 ++++++++++++------ 2 files changed, 81 insertions(+), 33 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 9b4d9293..bd12a1ee 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -53,6 +53,9 @@ type gtUA, # UncheckedArray (UA) mapped to runtime sized arrays gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!) gtVoidPtr # Opaque void pointer + gtInvalid # Can be returned to indicate a call to `nimToGpuType` failed to determine a type + ## XXX: make this the default value and replace all `gtVoid` placeholders by it + GpuTypeField* = object name*: string diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 812f7591..040f5907 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has import ./gpu_types import ./backends/backends -proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType proc initGpuType(kind: GpuTypeKind): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to @@ -20,11 +20,17 @@ proc initGpuType(kind: GpuTypeKind): GpuType = proc initGpuPtrType(to: GpuType, implicitPtr: bool): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to - result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) proc initGpuUAType(to: 
GpuType): GpuType = ## Initializes a GPU type for an unchecked array (ptr wraps this) - result = GpuType(kind: gtUA, uaTo: to) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtUA, uaTo: to) proc initGpuVoidPtr(): GpuType = result = GpuType(kind: gtVoidPtr) @@ -94,45 +100,61 @@ proc unpackGenericInst(t: NimNode): NimNode = proc toGpuTypeKind(t: NimNode): GpuTypeKind = result = t.unpackGenericInst().typeKind.toGpuTypeKind() -proc getInnerPointerType(n: NimNode, allowToFail: bool = false): GpuType = +proc getInnerPointerType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind if n.typeKind in {ntyPointer, ntyUncheckedArray}: let typ = n.getTypeInst() doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr doAssert typ[0].kind in {nnkIdent, nnkSym} doAssert typ[0].strVal in ["ptr", "UncheckedArray"] - result = nimToGpuType(typ[1], allowToFail) + result = nimToGpuType(typ[1], allowToFail, allowArrayIdent) elif n.kind == nnkPtrTy: - result = nimToGpuType(n[0], allowToFail) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) elif n.kind == nnkAddr: let typ = n.getTypeInst() - result = getInnerPointerType(typ, allowToFail) + result = getInnerPointerType(typ, allowToFail, allowArrayIdent) elif n.kind == nnkVarTy: # VarTy # Sym "BigInt" - result = nimToGpuType(n[0], allowToFail) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) elif n.kind == nnkSym: # symbol of e.g. 
`ntyVar` - result = nimToGpuType(n.getTypeInst(), allowToFail) + result = nimToGpuType(n.getTypeInst(), allowToFail, allowArrayIdent) else: raiseAssert "Found what: " & $n.treerepr -proc determineArrayLength(n: NimNode): int = +proc determineArrayLength(n: NimNode, allowArrayIdent: bool): int = + ## If `allowArrayIdent` is true, we do not emit the error message when + ## encountering an ident. This is the case for procs taking arrays + ## with a static array where the constant comes from outside the + ## macro. In that case we return `-1` indicating + ## `proc mdsRowShfNaive(r: int, v: array[SPONGE_WIDTH, BigInt]): BigInt {.device.} =` case n[1].kind of nnkSym: # likely a constant, try to get its value result = n[1].getImpl.intVal of nnkIdent: - let msg = """Found array with length given by identifier: $#! + if not allowArrayIdent: + let msg = """Found array with length given by identifier: $#! You might want to create a typed template taking a typed parameter for this constant to force the Nim compiler to bind the symbol. In theory though this error should not appear anymore though, as we don't try to parse generic functions. """ % n[1].strVal - raiseAssert msg + raiseAssert msg + else: + result = -1 # return -1 to indicate caller should look at symbol else: case n[1].kind of nnkIntLit: result = n[1].intVal else: + # E.g. + # BracketExpr + # Sym "array" + # Infix + # Ident ".." 
+ # IntLit 0 + # IntLit 11 + # Sym "BigInt" #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr @@ -150,7 +172,7 @@ proc getTypeName(n: NimNode): string = result = n[0].strVal # type is the first node else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr -proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = ## Maps a Nim type to a type on the GPU ## ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases @@ -159,30 +181,30 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = case n.kind of nnkIdentDefs: # extract type for let / var based on explicit or implicit type if n[n.len - 2].kind != nnkEmpty: # explicit type - result = nimToGpuType(n[n.len - 2], allowToFail) + result = nimToGpuType(n[n.len - 2], allowToFail, allowArrayIdent) else: # take from last element - result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail) + result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail, allowArrayIdent) of nnkConstDef: if n[1].kind != nnkEmpty: # has an explicit type - result = nimToGpuType(n[1], allowToFail) + result = nimToGpuType(n[1], allowToFail, allowArrayIdent) else: - result = nimToGpuType(n[2], allowToFail) # derive from the RHS literal + result = nimToGpuType(n[2], allowToFail, allowArrayIdent) # derive from the RHS literal else: if n.kind == nnkEmpty: return initGpuType(gtVoid) case n.typeKind of ntyBool, ntyInt .. 
ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: - result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = false) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false) of ntyVar: - result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = true) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = true) of ntyPointer: result = initGpuVoidPtr() of ntyUncheckedArray: ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - result = initGpuUAType(getInnerPointerType(n, allowToFail)) of ntyObject, ntyAlias: + result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent)) # for aliases, treat them identical to regular object types, but # `getTypeName` returns the alias! let impl = n.getTypeImpl @@ -192,7 +214,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = of ntyArray: # For a generic, static array type, e.g.: if n.kind == nnkSym: - return nimToGpuType(getTypeImpl(n), allowToFail) + return nimToGpuType(getTypeImpl(n), allowToFail, allowArrayIdent) if n.len == 3: # BracketExpr # Sym "array" @@ -200,8 +222,16 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # Sym "uint32" doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr doAssert n[0].strVal == "array" - let len = determineArrayLength(n) - result = initGpuArrayType(n[2], len) + let len = determineArrayLength(n, allowArrayIdent) + if len < 0: + # indicates we found an array with an ident, e.g. 
+ # BracketExpr + # Sym "array" + # Ident "SPONGE_WIDTH" + # Sym "BigInt" + return GpuType(kind: gtInvalid) + else: + result = initGpuArrayType(n[2], len) else: # just an array literal # Bracket @@ -212,7 +242,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # echo n.getTypeImpl.treerepr # error("o") of ntyGenericInvocation: - result = initGpuType(gtVoid) + result = initGpuType(gtInvalid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: result = initGpuGenericInst(n) @@ -222,7 +252,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # BracketExpr # Sym "typeDesc" # Sym "float32" - result = n.getType[1].nimToGpuType(allowToFail) # for a type desc we need to recurse using the type of it + result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it else: if allowToFail: result = GpuType(kind: gtVoid) @@ -407,6 +437,15 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = name.iName = fn.pName.iSym ## update the name of the called function ctx.genericInsts[fn.pName] = fn +proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = + ## Returns the type from a given Nim node `t` representing a type. + ## If that fails due to an identifier in the type, we instead try + ## to look up the type from the associated symbol, `n`. + result = nimToGpuType(t, allowArrayIdent = true) + if result.kind == gtInvalid: + # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means + # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead + result = n.getTypeInst.nimToGpuType() proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. 
Maybe we will won't @@ -456,7 +495,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pName = name result.pName.symbolKind = gsProc ## This is a procedure identifier doAssert node[3].kind == nnkFormalParams - result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type + let retType = node[3][0] # arg 0 is return type + if retType.kind == nnkEmpty: + result.pRetType = GpuType(kind: gtVoid) # actual void return + else: + # attempt to get type. If fails, we need to wait for a caller to this function to get types + # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside + # the macro. We then rely on our generics logic to later look this up when called + result.pRetType = nimToGpuType(retType, allowArrayIdent = true) + if result.pRetType.kind == gtVoid: # stop parsing this function + ctx.generics.incl name.iName # need to use raw name, *not* symbol + return GpuAst(kind: gpuVoid) + # Process pragmas if node.pragma.kind != nnkEmpty: doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
@@ -464,10 +514,6 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.pAttributes.len == 0: # means `nimonly` was applied return GpuAst(kind: gpuVoid) # Process parameters - echo "Node: ", node.treerepr - if node[2].kind == nnkGenericParams: - echo node[2][0].getImpl().treerepr - echo node[2][0].treerepr for i in 1 ..< node[3].len: let param = node[3][i] let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: @@ -479,8 +525,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # PtrTy # Ident "float32" # `param.len - 2` # Empty # `param.len - 1` - let paramType = nimToGpuType(param[typIdx]) - #echo "Argument: ", param.treerepr, " has tpye: ", paramType + let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) for i in 0 ..< numParams: var p = ctx.toGpuAst(param[i]) let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam @@ -522,7 +567,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert declaration[0][1].kind == nnkPragma varNode.vAttributes = collectAttributes(declaration[0][1]) else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr - varNode.vType = nimToGpuType(declaration) + varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0]) varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later # This is a *local* variable (i.e. 
`function` address space on WGSL) unless it is # annotated with `{.shared.}` (-> `workspace` in WGSL) From ed5efd217f40e1ff4750fcdc95fba5dd0f16aab5 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:16:47 +0200 Subject: [PATCH 40/87] clone `forwardDeclare` in GpuAst --- constantine/math_compiler/experimental/gpu_types.nim | 1 + 1 file changed, 1 insertion(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index bd12a1ee..ccc839b7 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -339,6 +339,7 @@ proc clone*(ast: GpuAst): GpuAst = result.pParams.add(clonedParam) result.pBody = ast.pBody.clone() result.pAttributes = ast.pAttributes + result.forwardDeclare = result.forwardDeclare of gpuCall: result = GpuAst(kind: gpuCall) result.cName = ast.cName.clone() From d1cefd12a9ee9b203f1a1974d322c87c973c9c45 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:18:32 +0200 Subject: [PATCH 41/87] support tuples by mapping to object types Tuples are essentially just objects anyway. So we make the transformation explicit and thus support tuples (including anonymous). Note: The Nim compiler already transforms tuple unpacking into temporary variable + statements to assign the fields. --- .../math_compiler/experimental/nim_to_gpu.nim | 163 ++++++++++++++++-- 1 file changed, 151 insertions(+), 12 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 040f5907..e1744756 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -161,6 +161,67 @@ functions. 
doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr result = n[1][2].intVal + 1 +proc constructTupleTypeName(n: NimNode): string = + ## XXX: overthink if this should really be here and not somewhere else + ## + ## Given a tuple, generate a name from the field names and types, e.g. + ## `Tuple_lo_BaseType_hi_BaseType` + ## + ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying + ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types. + result = "Tuple_" + doAssert n.kind in [nnkTupleTy, nnkTupleConstr] + for i, ch in n: + case ch.kind + of nnkIdentDefs: + let typName = ch[ch.len - 2].getTypeImpl.repr # second to last is type name of field(s) + for j in 0 ..< ch.len - 2: + # Example: + # IdentDefs + # Ident "hi" + # Ident "lo" `..< ch.len - 2 ` + # Sym "BaseType" `..< ch.len - 1` + # Empty `..< ch.len` + result.add ch[j].strVal & "_" & typName + if j < ch.len - 3: + result.add "_" + if i < n.len - 1: + result.add "_" + of nnkExprColonExpr: + # ExprColonExpr + # Sym "hi" + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)` + # give them a slightly different name + let typName = ch[0].getTypeImpl.repr ## XXX + doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr + result.add ch[0].strVal & "_" & typName + if i < n.len - 1: + result.add "_" + of nnkSym: + # TupleConstr + # Sym "BaseType" <-- e.g. here + # Sym "BaseType" + let typName = ch.getTypeImpl.repr + result.add "Field" & $i & "_" & typName + if i < n.len - 1: + result.add "_" + else: + # TupleConstr e.g. 
a tuple constr like this + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # Infix + # Sym "and" + # Sym "n" + # UInt32Lit 65535 + # -> Try again with type impl + return constructTupleTypeName(getTypeImpl(n)) + proc getTypeName(n: NimNode): string = ## Returns the name of the type case n.kind @@ -170,6 +231,8 @@ proc getTypeName(n: NimNode): string = result = n.getTypeInst.strVal else: result = n[0].strVal # type is the first node + of nnkTupleTy, nnkTupleConstr: + result = constructTupleTypeName(n) else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = @@ -203,11 +266,12 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = of ntyUncheckedArray: ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - of ntyObject, ntyAlias: result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent)) + of ntyObject, ntyAlias, ntyTuple: # for aliases, treat them identical to regular object types, but # `getTypeName` returns the alias! 
- let impl = n.getTypeImpl + let impl = if n.kind == nnkTupleConstr: n # might actually _lose_ information if used getTypeImpl + else: n.getTypeImpl let flds = impl.parseTypeFields() let typName = getTypeName(n) # might be an object construction result = initGpuObjectType(typName, flds) @@ -279,12 +343,34 @@ proc assignPrefixOp(op: string): string = else: result = op proc parseTypeFields(node: NimNode): seq[GpuTypeField] = - doAssert node.kind == nnkObjectTy - doAssert node[2].kind == nnkRecList - for ch in node[2]: - doAssert ch.kind == nnkIdentDefs and ch.len == 3 - result.add GpuTypeField(name: ch[0].strVal, - typ: nimToGpuType(ch[1])) + case node.kind + of nnkObjectTy: + doAssert node[2].kind == nnkRecList + for ch in node[2]: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleTy: + for ch in node: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleConstr: + # TupleConstr + # Sym "BaseType" + # Sym "BaseType" + for i, ch in node: + case ch.kind + of nnkSym: + result.add GpuTypeField(name: "Field" & $i, + typ: nimToGpuType(ch)) + of nnkExprColonExpr: + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + else: + return parseTypeFields(node.getTypeImpl) # will likely fall back to constr with `nnkSym` + else: + raiseAssert "Unsupported type to parse fields from: " & $node.kind template findIdx(col, el): untyped = var res = -1 @@ -696,9 +782,21 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.dField = ctx.toGpuAst(node[1]) of nnkBracketExpr: - result = GpuAst(kind: gpuIndex) - result.iArr = ctx.toGpuAst(node[0]) - result.iIndex = ctx.toGpuAst(node[1]) + case node[0].typeKind + of ntyTuple: + # need to replace `[idx]` by field access + let typ = nimToGpuType(node[0].getTypeImpl) + #doAssert typ in ctx.types + doAssert node[1].kind == nnkIntLit + let idx 
= node[1].intVal + let field = typ.oFields[idx].name + result = GpuAst(kind: gpuDot, + dParent: ctx.toGpuAst(node[0]), + dField: ctx.toGpuAst(ident(field))) + else: + result = GpuAst(kind: gpuIndex) + result.iArr = ctx.toGpuAst(node[0]) + result.iIndex = ctx.toGpuAst(node[1]) of nnkIdent, nnkOpenSymChoice: result = newGpuIdent() @@ -823,7 +921,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type - let flds = node[0].getTypeImpl.parseTypeFields() # sym + let flds = typ.oFields # find all fields that have been defined by the user var ocFields: seq[GpuFieldInit] for i in 1 ..< node.len: # all fields to be init'd @@ -844,6 +942,47 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.ocFields.add GpuFieldInit(name: flds[i].name, value: dfl, typ: flds[i].typ) + of nnkTupleConstr: + let typ = nimToGpuType(node) + if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type + # store the type instantiation + let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: typDef.tFields = typ.oFields + of gtGenericInst: typDef.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
+ ctx.types[typ] = typDef + + result = GpuAst(kind: gpuObjConstr, ocType: typ) + # get all fields of the type + let flds = typ.oFields + # find all fields that have been defined by the user + var ocFields: seq[GpuFieldInit] + for i in 0 ..< node.len: # all fields to be init'd + case node[i].kind + of nnkExprColonExpr: + ocFields.add GpuFieldInit(name: node[i][0].strVal, + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + else: + ocFields.add GpuFieldInit(name: "Field" & $i, + value: ctx.toGpuAst(node[i]), + typ: GpuType(kind: gtVoid)) + + # now add fields in order of the type declaration + for i in 0 ..< flds.len: + let idx = findIdx(ocFields, flds[i].name) + if idx >= 0: + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f + else: + let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) + result.ocFields.add GpuFieldInit(name: flds[i].name, + value: dfl, + typ: flds[i].typ) + of nnkAsmStmt: doAssert node.len == 2 From 58d4608798355b81ae989b61e10fc77aaed743de Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:21:46 +0200 Subject: [PATCH 42/87] support Nim's implicit `result` variable Finally supports the implicit `result`. :) --- .../math_compiler/experimental/nim_to_gpu.nim | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index e1744756..f39e77dd 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -532,6 +532,51 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means # `allowArrayIdent` triggered due to an ident in the type. 
Use symbol for type instead result = n.getTypeInst.nimToGpuType() +proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = + ## Will insert a `gpuVar` for the implicit `result` variable, unless there + ## is a user defined `var result` that shadows it at the top level of the proc + ## body. + ## + ## Finally adds a `return result` statement if + ## - we add a `result` variable + ## - there is no `return` statement as the _last_ statement in the proc + if retType.kind == gtVoid: return # nothing to do if the proc returns nothing + + proc hasCustomResult(n: GpuAst): bool = + doAssert n.kind == gpuBlock + for ch in n: # iterate all top level statements in the proc body + case ch.kind + of gpuVar: + if ch.vName.ident() == "result": + ## XXX: could maybe consider to emit a CT warning that `result` shadows the implicit + ## result variable + echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`." + return true + else: + discard + + proc lastIsReturn(n: GpuAst): bool = + doAssert n.kind == gpuBlock + if n.statements[^1].kind == gpuReturn: return true + + if not hasCustomResult(ast): + # insert `gpuVar` as the *first* statement + let resId = GpuAst(kind: gpuIdent, iName: "result", + iSym: "result", + iTyp: retType, + symbolKind: gsLocal) + let res = GpuAst(kind: gpuVar, vName: resId, + vType: retType, + vInit: GpuAst(kind: gpuVoid), # no initialization + vRequiresMemcpy: false, + vMutable: true) + ast.statements.insert(res, 0) + # NOTE: The compiler rewrites expressions at the end of a `proc` into + # an assignment to `block: result = ` for us. + if not lastIsReturn(ast): + # insert `return result` + ast.statements.add GpuAst(kind: gpuReturn, rValue: resId) + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. 
Maybe we will won't @@ -623,6 +668,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` + result.pBody.maybeInsertResult(result.pRetType, result.pName.ident()) # Add to table of known functions if result.pName notin ctx.allFnTab: @@ -764,6 +810,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name result.bLeft = ctx.toGpuAst(node[1]) result.bRight = ctx.toGpuAst(node[2]) + # We patch the types of int / float literals. WGSL does not automatically convert literals # to the target type. Determining the type here _can_ fail. In that case the # `lType` field will just be `gtVoid`, like the default. From 9d5b02d03971b221174669c34fe7734785c09293 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:24:28 +0200 Subject: [PATCH 43/87] first steps towards supporting expressions This by itself does not do much. But in order to support (statement list / block) expressions, some more information about if something is an expression / if a call is an expression is going to be needed. 
--- .../math_compiler/experimental/gpu_types.nim | 8 ++++ .../math_compiler/experimental/nim_to_gpu.nim | 44 +++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index ccc839b7..1e33f3ee 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -109,6 +109,7 @@ type pAttributes*: set[GpuAttribute] # order not important, hence set forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration of gpuCall: + cIsExpr*: bool ## If the call returns a value cName*: GpuAst ## Will be a `GpuIdent` cArgs*: seq[GpuAst] of gpuTemplateCall: @@ -155,6 +156,7 @@ type aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array aLitType*: GpuType # type of first element of gpuBlock: + isExpr*: bool ## Whether this block represents an expression, i.e. it returns something blockLabel*: string # optional name of the block. If any given, will open a `{ }` scope in CUDA statements*: seq[GpuAst] ## XXX: we could add a `locals` argument here, which would refer to all local variables @@ -274,6 +276,10 @@ type ## precise generic instantiations that are called. genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Storse all builtin / nimonly / importc / ... functions we encounter so that we can + ## check if they return a value when we encounter them in a `gpuCall` + builtins*: OrderedTable[GpuAst, GpuAst] + ## Table of all known types. Filled during Nim -> GpuAst. Includes generic ## instantiations, but also all other types. ## Key: the raw type. 
Value: a full `gpuTypeDef` @@ -409,6 +415,7 @@ proc clone*(ast: GpuAst): GpuAst = result.pVal = ast.pVal.clone() of gpuBlock: result = GpuAst(kind: gpuBlock) + result.isExpr = ast.isExpr result.blockLabel = ast.blockLabel for stmt in ast.statements: result.statements.add(stmt.clone()) @@ -458,6 +465,7 @@ proc clone*(ast: GpuAst): GpuAst = result.convExpr = ast.convExpr.clone() of gpuCast: result = GpuAst(kind: gpuCast) + result.cIsExpr = ast.cIsExpr result.cTo = ast.cTo.clone() result.cExpr = ast.cExpr.clone() of gpuComment: diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index f39e77dd..fba048ee 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -532,6 +532,18 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead result = n.getTypeInst.nimToGpuType() + +proc isExpression(n: GpuAst): bool = + ## Returns whether the given AST node is an expression + case n.kind + of gpuCall: # only if it returns something! + result = n.cIsExpr + of gpuBinOp, gpuIdent, gpuLit, gpuArrayLit, gpuPrefix, gpuDot, gpuIndex, gpuObjConstr, + gpuAddr, gpuDeref, gpuConv, gpuCast, gpuConstExpr: + result = true + else: + result = false + proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = ## Will insert a `gpuVar` for the implicit `result` variable, unless there ## is a user defined `var result` that shadows it at the top level of the proc @@ -577,6 +589,21 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = # insert `return result` ast.statements.add GpuAst(kind: gpuReturn, rValue: resId) +proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool = + ## Returns true if the given `fn` (gpuIdent) returns a value. 
+ ## The function can either be: + ## - an inbuilt function + ## - a generic instantiation + ## - contained in `allFnTab` + if fn in ctx.allFnTab: + result = ctx.allFnTab[fn].pRetType.kind != gtVoid + elif fn in ctx.genericInsts: + result = ctx.genericInsts[fn].pRetType.kind != gtVoid + elif fn in ctx.builtins: + result = ctx.builtins[fn].pRetType.kind != gtVoid + else: + raiseAssert "The function: " & $fn & " is not known anywhere." + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. Maybe we will won't @@ -602,9 +629,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = blockLabel: blockLabel) for i in 1 ..< node.len: # index 0 is the block label result.statements.add ctx.toGpuAst(node[i]) + of nnkBlockExpr: + ## XXX: For CUDA just a block? + let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal + elif node[0].kind == nnkEmpty: "" + else: raiseAssert "Unexpected node in block label field: " & $node.treerepr + result = GpuAst(kind: gpuBlock, blockLabel: blockLabel, isExpr: true) + for el in node: + if el.kind != nnkEmpty: + result.statements.add ctx.toGpuAst(el) of nnkStmtListExpr: # for statements that return a value. ## XXX: For CUDA just a block? - result = GpuAst(kind: gpuBlock) + result = GpuAst(kind: gpuBlock, isExpr: true) for el in node: if el.kind != nnkEmpty: result.statements.add ctx.toGpuAst(el) @@ -642,7 +678,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if node.pragma.kind != nnkEmpty: doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
result.pAttributes = collectProcAttributes(node.pragma) - if result.pAttributes.len == 0: # means `nimonly` was applied + if result.pAttributes.len == 0: # means `nimonly` was applied / is a `builtin` + ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called return GpuAst(kind: gpuVoid) # Process parameters for i in 1 ..< node[3].len: @@ -798,7 +835,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.tcName = name result.tcArgs = args else: - result = GpuAst(kind: gpuCall) + let fnIsExpr = ctx.fnReturnsValue(name) + result = GpuAst(kind: gpuCall, cIsExpr: fnIsExpr) result.cName = name result.cArgs = args From cc1eaf71135f5f954d7b119b0f5f99edc725cd6a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:05 +0200 Subject: [PATCH 44/87] move `cIsExpr` in clone to correct branch Whoops --- constantine/math_compiler/experimental/gpu_types.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 1e33f3ee..969859fd 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -348,6 +348,7 @@ proc clone*(ast: GpuAst): GpuAst = result.forwardDeclare = result.forwardDeclare of gpuCall: result = GpuAst(kind: gpuCall) + result.cIsExpr = ast.cIsExpr result.cName = ast.cName.clone() for arg in ast.cArgs: result.cArgs.add(arg.clone()) @@ -465,7 +466,6 @@ proc clone*(ast: GpuAst): GpuAst = result.convExpr = ast.convExpr.clone() of gpuCast: result = GpuAst(kind: gpuCast) - result.cIsExpr = ast.cIsExpr result.cTo = ast.cTo.clone() result.cExpr = ast.cExpr.clone() of gpuComment: From 7e43f71b6b20cd3fbcb207cec24cddc59b565082 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:26 +0200 Subject: [PATCH 45/87] also catch tuple types from `nnkBracketExpr` --- .../math_compiler/experimental/nim_to_gpu.nim | 
32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index fba048ee..7df62c00 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -43,6 +43,17 @@ proc initGpuArrayType(aTyp: NimNode, len: int): GpuType = ## Construct an statically sized array type result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len) +proc toTypeDef(typ: GpuType): GpuAst = + ## Converts a given object or generic instantiation type into an AST of a + ## corresponding type def. + # store the type instantiation + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: result.tFields = typ.oFields + of gtGenericInst: result.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." + proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = case t #of ntyBool, ntyChar: @@ -871,6 +882,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of ntyTuple: # need to replace `[idx]` by field access let typ = nimToGpuType(node[0].getTypeImpl) + if typ notin ctx.types: + ctx.types[typ] = toTypeDef(typ) #doAssert typ in ctx.types doAssert node[1].kind == nnkIntLit let idx = node[1].intVal @@ -994,15 +1007,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## this should never see `genericParam` I think let typ = nimToGpuType(node) if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - # store the type instantiation - let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) - case typ.kind - of gtObject: typDef.tFields = typ.oFields - of gtGenericInst: typDef.tFields = typ.gFields - else: - raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
- - ctx.types[typ] = typDef + ctx.types[typ] = toTypeDef(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type @@ -1030,14 +1035,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkTupleConstr: let typ = nimToGpuType(node) if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - # store the type instantiation - let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) - case typ.kind - of gtObject: typDef.tFields = typ.oFields - of gtGenericInst: typDef.tFields = typ.gFields - else: - raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." - ctx.types[typ] = typDef + ctx.types[typ] = toTypeDef(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type From 39c415835d7490177522e31fb55a24698a670986 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:50 +0200 Subject: [PATCH 46/87] handle `inline` pragma same as `forceinline` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7df62c00..13769353 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -420,7 +420,7 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal - of "forceinline": result.incl attForceInline + of "inline", "forceinline": result.incl attForceInline of "nimonly", "builtin": # used to fully ignore functions! 
return From bce536f7f486a630bccf64aeadcbbdc02cc4b121 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:14:18 +0200 Subject: [PATCH 47/87] fix detection of custom `result` by looking into `gpuBlock` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 13769353..4f8648b8 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -575,6 +575,8 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = ## result variable echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`." return true + of gpuBlock: # need to look at `gpuBlock` from top level, because variables are defined in a block + result = result or hasCustomResult(ch) else: discard From dfd4f562db242eaf9fedc8c31d20811e6c7554c4 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 13:57:15 +0200 Subject: [PATCH 48/87] fix handling of `skipSemicolon` for nested usage If nested, we must not reset the `skipSemicolon` back to false once the inner one is done! 
--- constantine/math_compiler/experimental/gpu_types.nim | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 969859fd..2ca42e12 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -827,9 +827,12 @@ proc ident*(n: GpuAst): string = result = n.iName template withoutSemicolon*(ctx: var GpuContext, body: untyped): untyped = - ctx.skipSemicolon = true - body - ctx.skipSemicolon = false + if not ctx.skipSemicolon: # if we are already skipping, leave true + ctx.skipSemicolon = true + body + ctx.skipSemicolon = false + else: + body proc getInnerArrayLengths*(t: GpuType): string = ## Returns the lengths of the inner array types for a nested array. From 8cdf72c97d6ac60040ac9cdcf3181f792d89ce46 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:52:46 +0200 Subject: [PATCH 49/87] if last expression _is_ return, we don't need `result` variable At least not in theory. We _could_ construct a function in which there _is_ a branch that returns `result` and another that returns something else, but uhh, I guess we can leave that for later. 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 4f8648b8..0958378d 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -584,7 +584,7 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = doAssert n.kind == gpuBlock if n.statements[^1].kind == gpuReturn: return true - if not hasCustomResult(ast): + if not hasCustomResult(ast) and not lastIsReturn(ast): # insert `gpuVar` as the *first* statement let resId = GpuAst(kind: gpuIdent, iName: "result", iSym: "result", From a40a87f3b40f27c31dc7139eb7b989818752325a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:54:15 +0200 Subject: [PATCH 50/87] remove debug output --- constantine/math_compiler/experimental/nim_to_gpu.nim | 1 - 1 file changed, 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 0958378d..10593884 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -94,7 +94,6 @@ proc initGpuGenericInst(t: NimNode): GpuType = result = initGpuGenericInst(t[0]) else: raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr - echo "Got generic inst: ", result proc unpackGenericInst(t: NimNode): NimNode = let tKind = t.typeKind From 396de7ba1d744ed31d20e4d052f4117706f2b54e Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:56:39 +0200 Subject: [PATCH 51/87] doc comment updates for procs in gpu_field_ops --- .../experimental/gpu_field_ops.nim | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim 
b/constantine/math_compiler/experimental/gpu_field_ops.nim index c16bc900..6fa550f3 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -189,52 +189,58 @@ template defWGSLHelpers*(): untyped {.dirty.} = ## Global variable to simulate carry flag. Private == one for each thread var carry_flag {.private.}: uint32 = 0'u32 - # Add with carry out (sets carry flag) proc add_co(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry out (sets carry flag) let result = a + b # Check for overflow: carry occurs if result < a (or result < b) carry_flag = select(0'u32, 1'u32, result < a) return result - # Add with carry in and carry out proc add_cio(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in and carry out let temp = a + b let result = temp + carry_flag # Carry out if: temp overflowed OR (temp + carry overflowed) carry_flag = select(0'u32, 1'u32, (temp < a) or (result < temp)) return result - # Add with carry in only proc add_ci(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in only. + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! let temp = a + b let result = temp + carry_flag # Don't update carry flag for this operation return result - # Subtract with borrow out (sets borrow flag) proc sub_bo(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow out (sets borrow flag) let result = a - b # Borrow occurs if a < b carry_flag = select(0'u32, 1'u32, a < b) return result - # Subtract with borrow in only - proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in and borrow out + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! 
let temp = a - b let result = temp - carry_flag - # Don't update carry flag for this operation + # Borrow out if: a < b OR (temp - borrow underflowed) + carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) return result - # Subtract with borrow in and borrow out - proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in only let temp = a - b let result = temp - carry_flag - # Borrow out if: a < b OR (temp - borrow underflowed) - carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) + # Don't update carry flag for this operation return result - # Select based on condition (equivalent to PTX slct) proc slct(a: uint32, b: uint32, pred: int32): uint32 {.device.} = + # Select based on condition (equivalent to PTX slct) return select(b, a, pred >= 0) proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} = @@ -261,8 +267,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = return p3 + (p1 shr 16) + (p2 shr 16) + carry - # r <- a * b + c (multiply-add low) proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- a * b + c (multiply-add low) return mul_lo(a, b) + c proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -280,8 +286,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = let product = mul_lo(a, b) return add_cio(product, c) - # r <- (a * b) >> 32 + c (multiply-add high) proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- (a * b) >> 32 + c (multiply-add high) return mul_hi(a, b) + c proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -582,7 +588,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = mtymul_CIOS_sparebit(a, b, M, true) proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = - ## Conditional copy in CUDA + ## Conditional copy. 
## If condition is true: b is copied into a ## If condition is false: a is left unmodified ## @@ -599,7 +605,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = a[i] = slct(b[i], a[i], cond) proc csetZero(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to zero in CUDA + ## Conditionally set `r` to zero. ## ## Note: This is constant-time var t = BigInt() @@ -607,14 +613,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csetOne(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to one in CUDA + ## Conditionally set `r` to one. ## ## Note: This is constant-time template mOne: untyped = MontyOne r.ccopy(mOne, condition) proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally add `a` to `r` in place in CUDA. + ## Conditionally add `a` to `r` in place.. ## ## Note: This is constant-time var t = BigInt() @@ -622,7 +628,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally subtract `a` from `r` in place in CUDA. + ## Conditionally subtract `a` from `r` in place. ## ## Note: This is constant-time var t = BigInt() @@ -630,14 +636,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc doubleElement(r: var BigInt, a: BigInt) {.device.} = - ## Double `a` and store it in `r` in CUDA. + ## Double `a` and store it in `r`. ## ## Note: This is constant-time r.add(a, a) proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} = - ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r` - ## in CUDA. + ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`. 
## ## Note: This is constant-time ## @@ -649,7 +654,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true) proc isZero(r: var bool, a: BigInt) {.device.} = - ## Checks if `a` is zero in CUDA. Result is written to `r`. + ## Checks if `a` is zero. Result is written to `r`. ## ## Note: This is constant-time #r = true @@ -661,7 +666,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = isZero == 0'u32 proc isOdd(r: var bool, a: BigInt) {.device.} = - ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`. + ## Checks if the Montgomery value of `a` is odd. Result is written to `r`. ## ## IMPORTANT: The canonical value may or may not be odd if the Montgomery ## representation is odd (and vice versa!). @@ -671,7 +676,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = (a[0] and 1'u32).bool proc neg(r: var BigInt, a: BigInt) {.device.} = - ## Computes the negation of `a` and stores it in `r` in CUDA. + ## Computes the negation of `a` and stores it in `r`. ## ## Note: This is constant-time # Check if input is zero @@ -687,14 +692,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} = ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise - ## copy over `a` into `r` in CUDA. + ## copy over `a` into `r`. ## ## Note: This is constant-time r.neg(a) r.ccopy(a, not condition) proc shiftRight(r: var BigInt, k: uint32) {.device.} = - ## Shift `r` right by `k` bits in-nplace in CUDA. + ## Shift `r` right by `k` bits in-place. ## ## k MUST be less than the base word size (2^31) ## @@ -716,7 +721,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r[lastIdx] = r[lastIdx] shr k proc div2(r: var BigInt) {.device.} = - ## Divide `r` by 2 in-place in CUDA. + ## Divide `r` by 2 in-place. 
## ## Note: This is constant-time # check if the input is odd From cd685528915eb23d36dcc5cd0ee4757131e03029 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:57:16 +0200 Subject: [PATCH 52/87] do not emit semicolon in binary operand child nodes --- constantine/math_compiler/experimental/backends/cuda.nim | 9 ++++++--- constantine/math_compiler/experimental/backends/wgsl.nim | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index c117471f..1f12be38 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -285,9 +285,12 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genCuda(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genCuda(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genCuda(ast.bLeft) + let r = ctx.genCuda(ast.bRight) + result = indentStr & "(" & l & " " & + ast.bOp & " " & + r & ")" of gpuIdent: result = ast.ident() diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 9043e3a0..288a4fdd 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -931,9 +931,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genWebGpu(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genWebGpu(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genWebGpu(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genWebGpu(ast.bLeft) + let r = ctx.genWebGpu(ast.bRight) + result = indentStr & "(" & l & " " & + ast.bOp & " " & + r & ")" of gpuIdent: result = ast.ident() From 30a4ac4f93c6b25483085c5007e56c1aecf0a454 Mon Sep 17 
00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 17:17:01 +0200 Subject: [PATCH 53/87] handle function overloads by using `iSym` if encountered --- constantine/math_compiler/experimental/gpu_types.nim | 7 +++++++ .../math_compiler/experimental/nim_to_gpu.nim | 12 +++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 2ca42e12..2ce2be12 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -285,6 +285,13 @@ type ## Key: the raw type. Value: a full `gpuTypeDef` types*: OrderedTable[GpuType, GpuAst] + ## This is _effectively_ just a set of all already produced function symbols. + ## We use it to determine if when encountering another function with the same + ## name, but different arguments to instead of using `iName` to use `iSym` as + ## the function name. This is to avoid overload issues in backends that don't + ## allow overloading by function signatures. + symChoices*: HashSet[string] + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` ## causes trouble. diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 10593884..a9430f64 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -501,6 +501,16 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = result = ctx.toGpuAst(n) # if _no_ pragma else: result = ctx.toGpuAst(n) # if not proc or func + + # handle overloads with different signatures + if n.strVal in ctx.symChoices: + # this is an overload of another function with different signature (not a generic, but + # overloads are not allowed in CUDA/WGSL/...). 
Update `sigTab` entry by using `iSym` + # for `iName` field for unique name + let id = ctx.sigTab[sig] + id.iName = id.iSym + else: + ctx.symChoices.incl result.iName # store this name in `symChoices` else: # else we use the str representation (repr for open / closed sym choice nodes) result = toAst n.repr @@ -665,7 +675,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # the `generics` set. When we encounter a `gpuCall` we will then check if the function # being called is part of the generic set and look up its _instantiated_ implementation # to parse it. The parsed generics are stored in the `genericInsts` table. - let name = ctx.toGpuAst(node.name) + let name = ctx.getFnName(node.name) if node[2].kind == nnkGenericParams: # is a generic ctx.generics.incl name.iName # need to use raw name, *not* symbol result = GpuAst(kind: gpuVoid) From a0daf1637e10b303cf7a83fa582d326de7ea1784 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 19:25:39 +0200 Subject: [PATCH 54/87] ignore `magic` pragma procs --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index a9430f64..d7d5d3d2 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -427,6 +427,8 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil` of "varargs": # attached to some builtins, e.g. 
`printf` on CUDA backend continue + of "magic": + return else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From a260bacfab1e17e1634633d9605356541eeae003 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 19:25:47 +0200 Subject: [PATCH 55/87] ignore forward declarations in Nim -> GpuAst Any forward declaration will also have its regular implementation somewhere. Otherwise the code is invalid anyway. As we generate our own forward declarations, we can ignore them. --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index d7d5d3d2..f36c4f96 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -681,6 +681,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if node[2].kind == nnkGenericParams: # is a generic ctx.generics.incl name.iName # need to use raw name, *not* symbol result = GpuAst(kind: gpuVoid) + elif node.body.kind == nnkEmpty: # just a forward declaration + result = GpuAst(kind: gpuVoid) else: result = GpuAst(kind: gpuProc) result.pName = name From e0a00eec2b811d8edd5abcd7b5a667eedfc854c7 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:11:23 +0200 Subject: [PATCH 56/87] add BigInt comparison and `toCanonical` conversion We could consider to rename it `fromMont` to match the regular Constantine naming, but given that here we (currently) only have a single type to represent both, I picked a different name. 
--- .../experimental/gpu_field_ops.nim | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 6fa550f3..76523366 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -306,6 +306,113 @@ template defWGSLHelpers*(): untyped {.dirty.} = return add_cio(hi_product, c) +template defBigIntCompare*(): untyped {.dirty.} = + ## This template adds a comparison operator for BigInts `<` (which is rewritten to + ## a function call `less`) as well as a `toCanonical` function to turn a Montgomery + ## representation into a canonical representation. + ## It is included in the `defCoreFieldOps` by default, so you need not manually use it. + + proc less(a, b: BigInt): bool {.device.} = + ## Returns true if a < b for two big ints in *canonical* + ## representation. + ## + ## NOTE: The inputs are compared *as is*. That means if they are + ## in Montgomery representation the result will not reflect the + ## ordering relation of their associated canonical values! + ## Call `toCanonical` on field elements in Montgomery order before + ## comparing them. + ## + ## Comparison is constant-time + var borrow: uint32 + # calculate sub with borrows for side effect. Use borrow flag + # at the end to determine if value was smaller + discard sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + discard sub_bio(a[i], b[i]) + borrow = sub_bi(0'u32, 0'u32) + return borrow.bool + + # template to rewrite `<` into a function call. 
Most backends don't allow custom operators + template `<`(b1, b2: BigInt): untyped = less(b1, b2) + + proc muladd1_gpu(hi, lo: var uint32, a, b, c: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd1` + lo = mulloadd_co(a, b, c) # low part of a*b + c with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + + proc muladd2_gpu(hi, lo: var uint32, a, b, c1, c2: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd2` + lo = mulloadd_co(a, b, c1) # low part of a*b + c1 with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + # Add c2 with carry propagation + lo = add_co(lo, c2) + hi = add_ci(hi, 0'u32) + + proc sub_no_mod(a, b: BigInt): BigInt {.device.} = + ## Generate an optimized subtraction kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + ## I.e. this does _not_ perform modular reduction. + var t = BigInt() + t[0] = sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + t[i] = sub_bio(a[i], b[i]) + return t + + proc sub_no_mod(r: var BigInt, a, b: BigInt) {.device.} = + ## Subtraction of two finite field elements stored in `a` and `b` + ## *without* modular reduction. + ## The result is stored in `r`. 
+ r = sub_no_mod(a, b) + + proc csub_no_mod(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally subtract `a` from `r` in place *without* modular + ## reduction. + ## + ## Note: This is constant-time + var t = BigInt() + t.sub_no_mod(r, a) + r.ccopy(t, condition) + + proc fromMont_CIOS(r: var BigInt, a, M: BigInt, m0ninv: uint32) {.device.} = + ## Convert from Montgomery form to canonical BigInt form + # for i in 0 .. n-1: + # m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication) + # C, _ = t[0] + m * M[0] + # for j in 1 ..n-1: + # (C, t[j-1]) <- r[j] + m*M[j] + C + # t[n-1] = C + + var t = a # Ensure working in registers + + staticFor i, 0, N: + let m = t[0] * m0ninv + var C, lo: uint32 + muladd1_gpu(C, lo, m, M[0], t[0]) + staticFor j, 1, N: + muladd2_gpu(C, t[j-1], m, M[j], C, t[j]) + t[N-1] = C + + t.csub_no_mod(M, not (t < M)) + r = t + + proc toCanonical(b: BigInt): BigInt {.device.} = + var canon: BigInt + canon.fromMont_CIOS(b, M, M0NInv) + return canon + template defCoreFieldOps*(T: typed): untyped {.dirty.} = # Need to get the limbs & spare bits data in a static context template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32 From c9d99020681333f03cd7336453b09f35417fdfee Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:14:19 +0200 Subject: [PATCH 57/87] add FIPS Montgomery multiplication for fields without spare bits The Goldilocks field does not have any spare bits. As a result using CIOS leads to the wrong result. 
--- .../experimental/gpu_field_ops.nim | 84 +++++++++++++++++-- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 76523366..46c458f4 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -427,6 +427,10 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2)) const M0NInv = getM0ninv().uint32 + # `ccopy` needed for BigInt comparison logic + proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} + defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts + proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = ## If a >= Modulus: r <- a-M ## else: r <- a @@ -511,7 +515,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = sub_bio(a[i], b[i]) let underflowMask = sub_bi(0'u32, 0'u32) - # If underflow # TODO: predicated mov instead? var maskedM: BigInt = BigInt() @@ -523,7 +526,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = add_cio(t[i], maskedM[i]) when N > 1: t[N-1] = add_ci(t[N-1], maskedM[N-1]) - return t proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} = @@ -689,11 +691,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## The result is stored in `r`. r = modsub(a, b, M) - proc mul(r: var BigInt, a, b: BigInt) {.device.} = - ## Multiplication of two finite field elements stored in `a` and `b`. - ## The result is stored in `r`. - r = mtymul_CIOS_sparebit(a, b, M, true) - proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = ## Conditional copy. 
## If condition is true: b is copied into a @@ -772,6 +769,11 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = isZero = isZero or a[i] r = isZero == 0'u32 + proc isZero(a: BigInt): bool {.device, forceinline.} = + result.isZero(a) + proc isNonZero(a: BigInt): bool {.device, forceinline.} = + result = not isZero(a) + proc isOdd(r: var bool, a: BigInt) {.device.} = ## Checks if the Montgomery value of `a` is odd. Result is written to `r`. ## @@ -840,3 +842,71 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # if it was odd, add `M+1/2` to go 'half-way around' r.cadd(PP1D2, isO) + + proc mul_lohi(hi, lo: var uint32, a, b: uint32) {.device, forceinline.} = + lo = mul_lo(a, b) + hi = mul_hi(a, b) + + proc mulAcc(t, u, v: var uint32, a, b: uint32) {.device, forceinline.} = + ## (t, u, v) <- (t, u, v) + a * b + v = mulloadd_co(a, b, v) # v = (a*b).low + v, with carry out + u = mulhiadd_cio(a, b, u) # u = (a*b).high + u + carry, with carry out + t = add_ci(t, 0'u32) # t = t + carry + + proc mtymul_FIPS(a, b, M: BigInt, lazyReduce: static bool = false): BigInt {.device.} = + ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS). + ## This implementation can be used for fields that do not have any spare bits. + ## + ## This maps + ## - [0, 2p) -> [0, 2p) with lazyReduce + ## - [0, 2p) -> [0, p) without + ## + ## lazyReduce skips the final substraction step. 
+ # - Architectural Enhancements for Montgomery + # Multiplication on Embedded RISC Processors + # Johann Großschädl and Guy-Armand Kamendje, 2003 + # https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf + # + # - New Speed Records for Montgomery Modular + # Multiplication on 8-bit AVR Microcontrollers + # Zhe Liu and Johann Großschädl, 2013 + # https://eprint.iacr.org/2013/882.pdf + template m0ninv: untyped = M0NInv + var z = BigInt() # zero-init, ensure on stack and removes in-place problems in tower fields + const L = a.len + var t, u, v = 0'u32 + + staticFor i, 0, L: + staticFor j, 0, i: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + mulAcc(t, u, v, a[i], b[0]) + z[i] = v * m0ninv + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = 0'u32 + + staticFor i, L, 2*L: + staticFor j, i-L+1, L: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + z[i-L] = v + v = u + u = t + t = 0'u32 + + when not lazyReduce: + let cond = v != 0 or not(z < M) + # conditionally subtract using *non modular subtraction*. If `cond == true`, + # we are in `M <= z <= 2M` and can safely subtract `M`. + z.csub_no_mod(M, cond) + return z + + proc mul(r: var BigInt, a, b: BigInt) {.device.} = + ## Multiplication of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + when spareBits() >= 1: + r = mtymul_CIOS_sparebit(a, b, M, true) + else: # e.g. Goldilocks + r = mtymul_FIPS(a, b, M, false) From b6a63ef2d3e6cc2330a856d106ff1899e5adcec2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:17:36 +0200 Subject: [PATCH 58/87] [cuda] for now emit `constexpr` for a `gpuConstexpr` (i.e. Nim `const`) See the added TODO. The main point is that if we write things like ```nim const M = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` we want to have the Nim compiler evaluate the RHS at compile time. 
If we were to make the user write ``` var M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` instead the Nim compiler would evaluate the RHS at runtime. We _could_ force the user to write ```nim const M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` instead though. It is no problem however, to emit `__constant__` if it's a global and `constexpr` for a local (where `__constant__` is anyhow forbidden in CUDA). The only minor annoyance is that _maybe_ someone wants to emit `constexpr` also in global scope. _Maybe_ we'll go with a design that does the above by default but allows you to overwrite it via ```nim const M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` -> explicitly force `__constant__` ```nim const M {.constexpr.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` -> explicitly force `constexpr` instead. --- constantine/math_compiler/experimental/backends/cuda.nim | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 1f12be38..83220fdf 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -346,10 +346,15 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "(*" & ctx.genCuda(ast.dOf) & ")" of gpuConstexpr: + ## TODO: We need to change the code such that we emit `constexpr` inside of procs and + ## `__constant__` outside of procs. The point is we want to support mapping to `__constant__` + ## for `const foo = bar` Nim declarations to evaluate values at Nim's compile time. + ## Alternatively, make user write `const foo {.constant.} = bar` to produce a global + ## `__constant__` value. 
if ast.cType.kind == gtArray: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) else: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) else: echo "Unhandled node kind in genCuda: ", ast.kind From cf6fcbff0a37a1c91cde1e11b9b49710d9365199 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:37:54 +0200 Subject: [PATCH 59/87] [cuda] handle generic instantiations and `UncheckedArray` --- .../math_compiler/experimental/backends/cuda.nim | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 83220fdf..92f63efa 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -95,7 +95,18 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, else: result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. + result = t.gName + if t.gArgs.len > 0: + result.add "_" + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add "_" of gtObject: result = t.name + of gtUA: result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T? 
else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident @@ -373,6 +384,7 @@ proc codegen*(ctx: var GpuContext): string = let fnC = fn.clone() fnC.forwardDeclare = true result.add ctx.genCuda(fnC) & "\n" + result.add "\n\n" for fnIdent, fn in ctx.fnTab: result.add ctx.genCuda(fn) & "\n\n" From 1e8f84d288df0850ef9d2b617a268194ba0595f1 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:38:12 +0200 Subject: [PATCH 60/87] [wgsl] do not suffix generic inst types if they have no args --- constantine/math_compiler/experimental/backends/wgsl.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 288a4fdd..d908f22b 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -115,7 +115,9 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = of gtGenericInst: # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. - result = t.gName & "_" + result = t.gName + if t.gArgs.len > 0: + result.add "_" for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: From e6cab34f6596dae0c2e2bcd91e42e849d994f385 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:48:18 +0200 Subject: [PATCH 61/87] handle `raises`, `noinit` pragmas And use normalized strings because `noInit` and `noinit` both appear often enough. 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index f36c4f96..a79cb1a4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -429,6 +429,7 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = continue of "magic": return + of "raises": discard # result.incl attDevice #discard # XXX else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr @@ -453,12 +454,13 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes # do not match the CUDA string we need to emit, which is what the string value of # the `GpuVarAttribute` enum stores - case pragma.strVal - of "cuExtern", "extern": result.add atvExtern + case pragma.strVal.normalize + of "cuextern", "extern": result.add atvExtern of "shared": result.add atvShared of "private": result.add atvPrivate of "volatile": result.add atvVolatile of "constant": result.add atvConstant + of "noinit": discard # XXX: ignore for now else: raiseAssert "Unexpected pragma: " & $pragma.treerepr From 91a87e796a1a04e1788d9bbd89383be2e70e0cdb Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:52:54 +0200 Subject: [PATCH 62/87] use `getTypeName` for tuple type fields Otherwise we get a big mess for objects :) --- .../math_compiler/experimental/nim_to_gpu.nim | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index a79cb1a4..7dae2ad7 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -171,6 +171,7 @@ functions. 
doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr result = n[1][2].intVal + 1 +proc getTypeName(n: NimNode, recursedSym: bool = false): string proc constructTupleTypeName(n: NimNode): string = ## XXX: overthink if this should really be here and not somewhere else ## @@ -179,12 +180,14 @@ proc constructTupleTypeName(n: NimNode): string = ## ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types. + ## UPDATE: I changed the implementation to recurse into `getTypeName` + ## TODO: verify that this did not break the tuple test & specifically check for aliases result = "Tuple_" doAssert n.kind in [nnkTupleTy, nnkTupleConstr] for i, ch in n: case ch.kind of nnkIdentDefs: - let typName = ch[ch.len - 2].getTypeImpl.repr # second to last is type name of field(s) + let typName = ch[ch.len - 2].getTypeName() # second to last is type name of field(s) for j in 0 ..< ch.len - 2: # Example: # IdentDefs @@ -206,7 +209,7 @@ proc constructTupleTypeName(n: NimNode): string = # IntLit 16 # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)` # give them a slightly different name - let typName = ch[0].getTypeImpl.repr ## XXX + let typName = ch[0].getTypeName() ## XXX doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr result.add ch[0].strVal & "_" & typName if i < n.len - 1: @@ -215,7 +218,7 @@ proc constructTupleTypeName(n: NimNode): string = # TupleConstr # Sym "BaseType" <-- e.g. 
here # Sym "BaseType" - let typName = ch.getTypeImpl.repr + let typName = ch.getTypeName() result.add "Field" & $i & "_" & typName if i < n.len - 1: result.add "_" @@ -232,10 +235,15 @@ proc constructTupleTypeName(n: NimNode): string = # -> Try again with type impl return constructTupleTypeName(getTypeImpl(n)) -proc getTypeName(n: NimNode): string = +proc getTypeName(n: NimNode, recursedSym: bool = false): string = ## Returns the name of the type case n.kind - of nnkIdent, nnkSym: result = n.strVal + of nnkIdent: result = n.strVal + of nnkSym: + if recursedSym: + result = n.strVal + else: + result = n.getTypeInst.getTypeName(true) of nnkObjConstr: if n[0].kind == nnkEmpty: result = n.getTypeInst.strVal From bb4e8fb1881eba68b77be7a4c5b05a4c27fe821c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:53:14 +0200 Subject: [PATCH 63/87] map `ntyString` to string explicitly in gpu type kinds Not supported on some backends --- constantine/math_compiler/experimental/nim_to_gpu.nim | 3 +++ 1 file changed, 3 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7dae2ad7..c341de44 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -74,6 +74,7 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = of ntyUInt16: gtUint16 of ntyUInt32: gtUint32 of ntyUInt64: gtUint64 + of ntyString: gtString else: raiseAssert "Not supported yet: " & $t @@ -275,6 +276,8 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = case n.typeKind of ntyBool, ntyInt .. ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) + of ntyString: # only supported on some backends! 
+ result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false) of ntyVar: From de2cf85e51c746337ea8327d48958a8672f46186 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:53:48 +0200 Subject: [PATCH 64/87] map ntyUnused2 (lent T) to ptr T --- constantine/math_compiler/experimental/nim_to_gpu.nim | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index c341de44..d21085e4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -328,16 +328,21 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = # error("o") of ntyGenericInvocation: result = initGpuType(gtInvalid) - error("Generics are not supported in the CUDA DSL so far.") + error("Generics are not supported in the CUDA DSL so far.") # Note: this should not appear nowadays of ntyGenericInst: result = initGpuGenericInst(n) - #result = n.unpackGenericInst().nimToGpuType(allowToFail) of ntyTypeDesc: # `getType` returns a `BracketExpr` of eg: # BracketExpr # Sym "typeDesc" # Sym "float32" result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it + of ntyUnused2: + # BracketExpr + # Sym "lent" + # Sym "BigInt" + doAssert n.kind == nnkBracketExpr and n[0].strVal == "lent", "ntyUnused2: " & $n.treerepr + result = initGpuPtrType(nimToGpuType(n[1]), implicitPtr = false) else: if allowToFail: result = GpuType(kind: gtVoid) From 1b0aa55d3ecfb2d3b28654ef8145d3aee63b5561 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:54:10 +0200 Subject: [PATCH 65/87] generate type names for bracket expressions e.g. 
generics --- constantine/math_compiler/experimental/nim_to_gpu.nim | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index d21085e4..6122f601 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -252,6 +252,12 @@ proc getTypeName(n: NimNode, recursedSym: bool = false): string = result = n[0].strVal # type is the first node of nnkTupleTy, nnkTupleConstr: result = constructTupleTypeName(n) + of nnkBracketExpr: + # construct a type name `Foo_Bar_Baz` + for i, ch in n: + result.add ch.getTypeName() + if i < n.len - 1: + result.add "_" else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = From 1a4dc3140790130fdd0f017dab2a2ac83a3aea41 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:54:45 +0200 Subject: [PATCH 66/87] improve generic inst -> gpu type by handling nnkSym This is not perfect yet (I think). Needs some tests for different situations. Works in practice for what I've used it for at least. --- .../math_compiler/experimental/nim_to_gpu.nim | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 6122f601..ab5e45b5 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -93,6 +93,19 @@ proc initGpuGenericInst(t: NimNode): GpuType = of nnkObjConstr: doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr result = initGpuGenericInst(t[0]) + of nnkSym: + let impl = getTypeImpl(t) + case impl.kind + of nnkDistinctTy: + ## XXX: assumes distinct of inbuilt type, not object! 
+ result = nimToGpuType(impl[0]) + of nnkObjectTy: + doAssert impl.kind == nnkObjectTy, "Unexpected node kind for generic inst: " & $impl.treerepr + ## XXX: use signature hash for type name? Otherwise will produce duplicates + result = GpuType(kind: gtGenericInst, gName: t.repr) + result.gFields = parseTypeFields(impl) + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr else: raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr From 8583d2306e0a0ed51bc4d63aa1e03eb63e88d34d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:55:39 +0200 Subject: [PATCH 67/87] overwrite function names of problematic identifiers E.g. `[]` is not a sensible name on GPU backends. We rename it to `get` for example. Note that the binary operators there likely never appear there. We need to handle those via `gpuBinOp` (and call `maybePatchFnName` from there) --- .../math_compiler/experimental/nim_to_gpu.nim | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ab5e45b5..346aaedf 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -501,6 +501,28 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst +proc maybePatchFnName(n: var GpuAst) = + ## Patches the function name for names that are not allowed on most backends, but appear + ## commonly in Nim (custom operators). + ## + ## NOTE: I think that the binary operators don't actually appear as a `gpuCall`, but still + ## as an infix node, even after sem checking by the Nim compiler. 
+ doAssert n.kind == gpuIdent + template patch(arg, by: untyped): untyped = + arg.iSym = arg.iSym.replace(arg.iName, by) + arg.iName = by + let name = n.iName + case name + of "[]": patch(n, "get") + of "[]=": patch(n, "set") + of "+": patch(n, "add") + of "-": patch(n, "sub") + of "*": patch(n, "mul") + of "/": patch(n, "div") + else: + # leave as is + discard + proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = ## Returns the name for the function. Either the symbol name _or_ ## the `{.cudaName.}` pragma argument. @@ -541,6 +563,10 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = else: result = ctx.toGpuAst(n) # if not proc or func + # possibly patch function names, e.g. custom `[]`, `[]=`, `+` etc operators + # (inbuilt won't show up as a function name, but rather as a specific node kind, eg `nnkIndex` + result.maybePatchFnName() + # handle overloads with different signatures if n.strVal in ctx.symChoices: # this is an overload of another function with different signature (not a generic, but From bf27dcbf4376e16187f69c937b01109f516d9481 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:57:45 +0200 Subject: [PATCH 68/87] refactor proc signature parsing & unify adding types to `ctx.types` NOTE: We'll change the `maybeAddType` code in the future to be done in `nimToGpuType` directly. However, at the moment that produces a bit too much required change with having to add `GpuContext` to a whole bunch of functions that all call `nimToGpuType` internally. 
--- .../math_compiler/experimental/nim_to_gpu.nim | 121 ++++++++++-------- 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 346aaedf..b46a17c2 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -584,6 +584,66 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = # ctx.sigTab[sig] = result result.symbolKind = gsProc # make sure it's a proc +proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = + ## Returns the type from a given Nim node `t` representing a type. + ## If that fails due to an identifier in the type, we instead try + ## to look up the type from the associated symbol, `n`. + result = nimToGpuType(t, allowArrayIdent = true) + if result.kind == gtInvalid: + # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means + # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead + result = n.getTypeInst.nimToGpuType() + +proc maybeAddType*(ctx: var GpuContext, typ: GpuType) = + ## Adds the given type to the table of known types, if it is some kind of + ## object type. + ## + ## XXX: What about aliases and distincts? + if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types: + ctx.types[typ] = toTypeDef(typ) + +proc parseProcParameters(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): seq[GpuParam] = + ## Returns all parameters of the given procedure from the `params` node + ## of type `nnkFormalParams`. 
+ doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + for i in 1 ..< params.len: + let param = params[i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) + ctx.maybeAddType(paramType) + for i in 0 ..< numParams: + var p = ctx.toGpuAst(param[i]) + let symKind = if attGlobal in attrs: gsGlobalKernelParam + else: gsDeviceKernelParam + p.iTyp = paramType ## Update the type of the symbol + p.symbolKind = symKind ## and the symbol kind + let param = GpuParam(ident: p, typ: paramType) + result.add(param) + +proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType = + ## Returns the return type of the given procedure from the `params` node + ## of type `nnkFormalParams`. + doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + let retType = params[0] # arg 0 is return type + if retType.kind == nnkEmpty: + result = GpuType(kind: gtVoid) # actual void return + else: + # attempt to get type. If fails, we need to wait for a caller to this function to get types + # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside + # the macro. We then rely on our generics logic to later look this up when called + result = nimToGpuType(retType, allowArrayIdent = true) + if result.kind == gtVoid: # stop parsing this function + result = GpuType(kind: gtInvalid) + ctx.maybeAddType(result) + proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = ## Looks up the implementation of the given function and stores it in our table ## of generic instantiations. 
@@ -608,16 +668,6 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = name.iName = fn.pName.iSym ## update the name of the called function ctx.genericInsts[fn.pName] = fn -proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = - ## Returns the type from a given Nim node `t` representing a type. - ## If that fails due to an identifier in the type, we instead try - ## to look up the type from the associated symbol, `n`. - result = nimToGpuType(t, allowArrayIdent = true) - if result.kind == gtInvalid: - # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means - # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead - result = n.getTypeInst.nimToGpuType() - proc isExpression(n: GpuAst): bool = ## Returns whether the given AST node is an expression case n.kind @@ -750,18 +800,12 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuProc) result.pName = name result.pName.symbolKind = gsProc ## This is a procedure identifier - doAssert node[3].kind == nnkFormalParams - let retType = node[3][0] # arg 0 is return type - if retType.kind == nnkEmpty: - result.pRetType = GpuType(kind: gtVoid) # actual void return - else: - # attempt to get type. If fails, we need to wait for a caller to this function to get types - # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside - # the macro. 
We then rely on our generics logic to later look this up when called - result.pRetType = nimToGpuType(retType, allowArrayIdent = true) - if result.pRetType.kind == gtVoid: # stop parsing this function - ctx.generics.incl name.iName # need to use raw name, *not* symbol - return GpuAst(kind: gpuVoid) + let params = node[3] + doAssert params.kind == nnkFormalParams + result.pRetType = ctx.parseProcReturnType(params) + if result.pRetType.kind == gtInvalid: + ctx.generics.incl name.iName # need to use raw name, *not* symbol + return GpuAst(kind: gpuVoid) # Process pragmas if node.pragma.kind != nnkEmpty: @@ -771,27 +815,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called return GpuAst(kind: gpuVoid) # Process parameters - for i in 1 ..< node[3].len: - let param = node[3][i] - let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: - let typIdx = param.len - 2 # second to last is the type - # IdentDefs - # Ident "x" - # Ident "y" - # Ident "res" - # PtrTy - # Ident "float32" # `param.len - 2` - # Empty # `param.len - 1` - let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) - for i in 0 ..< numParams: - var p = ctx.toGpuAst(param[i]) - let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam - else: gsDeviceKernelParam - p.iTyp = paramType ## Update the type of the symbol - p.symbolKind = symKind ## and the symbol kind - let param = GpuParam(ident: p, typ: paramType) - result.pParams.add(param) - + result.pParams = ctx.parseProcParameters(params, result.pAttributes) result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` result.pBody.maybeInsertResult(result.pRetType, result.pName.ident()) @@ -826,6 +850,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = varNode.vAttributes = collectAttributes(declaration[0][1]) else: raiseAssert 
"Unexpected node kind for variable: " & $declaration.treeRepr varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0]) + ctx.maybeAddType(varNode.vType) varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later # This is a *local* variable (i.e. `function` address space on WGSL) unless it is # annotated with `{.shared.}` (-> `workspace` in WGSL) @@ -960,8 +985,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of ntyTuple: # need to replace `[idx]` by field access let typ = nimToGpuType(node[0].getTypeImpl) - if typ notin ctx.types: - ctx.types[typ] = toTypeDef(typ) + ctx.maybeAddType(typ) #doAssert typ in ctx.types doAssert node[1].kind == nnkIntLit let idx = node[1].intVal @@ -1084,9 +1108,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkObjConstr: ## this should never see `genericParam` I think let typ = nimToGpuType(node) - if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - ctx.types[typ] = toTypeDef(typ) - + ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type let flds = typ.oFields @@ -1112,8 +1134,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = typ: flds[i].typ) of nnkTupleConstr: let typ = nimToGpuType(node) - if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - ctx.types[typ] = toTypeDef(typ) + ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type From 81bd6f6c795287859fb1d0feb45d02f89ea61773 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:59:59 +0200 Subject: [PATCH 69/87] handle recursive calls in GPU code Previously due to our parsing of procs whenever they are called, we would infinitely recurse in the parsing logic. We now record the function signature and function identifier so that we can avoid that. 
We need the function signature to get information about the return type before we actually start parsing a function. Otherwise _inside_ of the recursive function we wouldn't be able to determine the return type at the callsite of the recursive call (the initial parse hasn't been completed at that point yet, which would fill the proc into `allFnTab`) --- .../math_compiler/experimental/gpu_types.nim | 21 ++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 40 +++++++++++++++++-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 2ce2be12..cac35019 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -236,6 +236,10 @@ type params*: seq[string] body*: GpuAst + GpuProcSignature* = object + params*: seq[GpuParam] + retType*: GpuType + GpuContext* = object ## XXX: need table for generic invocations. Then when we encounter a type, need to map to ## the specific version @@ -276,6 +280,12 @@ type ## precise generic instantiations that are called. genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Table of procs and their signature to avoid looping infinitely for recursive procs + ## Arguments are: + ## - Key: ident of the proc + ## - Value: signature of the (possibly generic) instantiation + processedProcs*: OrderedTable[GpuAst, GpuProcSignature] + ## Storse all builtin / nimonly / importc / ... 
functions we encounter so that we can ## check if they return a value when we encounter them in a `gpuCall` builtins*: OrderedTable[GpuAst, GpuAst] @@ -548,6 +558,17 @@ proc `==`*(a, b: GpuAst): bool = else: result = a.iSym == b.iSym and a.iTyp == b.iTyp and a.symbolKind == b.symbolKind +proc `==`*(a, b: GpuProcSignature): bool = + if a.retType != b.retType: result = false + elif a.params.len != b.params.len: + result = false + else: + result = true + for i in 0 ..< a.params.len: + let ap = a.params[i] + let bp = b.params[i] + result = result and (ap == bp) + proc len*(ast: GpuAst): int = case ast.kind of gpuProc: 1 diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index b46a17c2..7be15c56 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -447,7 +447,8 @@ proc isBuiltIn(n: NimNode): bool = return true proc collectProcAttributes(n: NimNode): set[GpuAttribute] = - doAssert n.kind == nnkPragma + doAssert n.kind in [nnkPragma, nnkEmpty] + if n.kind == nnkEmpty: return # no pragmas for pragma in n: doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma @@ -644,6 +645,16 @@ proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType = result = GpuType(kind: gtInvalid) ctx.maybeAddType(result) +proc toGpuProcSignature(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): GpuProcSignature = + ## Creates a `GpuProcSignature` from the given `params` node of type `nnkFormalParams` + + ## + ## NOTE: This procedure is only called from generically instantiated procs. Therefore, + ## we shouldn't need to worry about getting `gtInvalid` return types here. 
+ doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + result = GpuProcSignature(params: ctx.parseProcParameters(params, attrs), + retType: ctx.parseProcReturnType(params)) + proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = ## Looks up the implementation of the given function and stores it in our table ## of generic instantiations. @@ -656,9 +667,30 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = let inst = node[0].getImpl() let sig = node[0].getTypeInst() inst.params = sig.params # copy over the parameters + + # turn the signature into a `GpuProcSignature` + let attrs = collectProcAttributes(inst.pragma) + let procSig = ctx.toGpuProcSignature(sig.params, attrs) + if name in ctx.processedProcs: + return + else: + # Need to add isym here so that if we have recursive calls, we don't end up + # calling `toGpuAst` recursively forever + ctx.processedProcs[name] = procSig + let fn = ctx.toGpuAst(inst) - if fn.kind == gpuVoid: # should be an inbuilt proc, i.e. annotated with `{.builtin.}` - doAssert inst.isBuiltIn() + if fn.kind == gpuVoid: + # Should be an inbuilt proc, i.e. annotated with `{.builtin.}`. However, + # functions that are available otherwise (e.g. in Nim's system like `abs`) + # in Nim _and_ backends will also show up here. Unless we wanted to manually + # wrap all of these, we can just skip the `isBuiltin` check here. + # If the user uses something not available in the backend, they'll get a + # compiler error from that compiler. + # It's mostly a matter of usability: For common procs like `abs` we cannot + # so easily define a custom overload `proc abs(...): ... {.builtin.}`, because + # that would overwrite the Nim version. 
+ # doAssert inst.isBuiltIn() + return else: fn.pAttributes.incl attDevice # make sure this is interpreted as a device function doAssert fn.pName.iSym == name.iSym, "Not matching" @@ -738,6 +770,8 @@ proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool = result = ctx.genericInsts[fn].pRetType.kind != gtVoid elif fn in ctx.builtins: result = ctx.builtins[fn].pRetType.kind != gtVoid + elif fn in ctx.processedProcs: + result = ctx.processedProcs[fn].retType.kind != gtVoid else: raiseAssert "The function: " & $fn & " is not known anywhere." From 6cd1a5ed00201944137e4f64d68cbc3c6c42c5ce Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 29 Aug 2025 12:38:41 +0200 Subject: [PATCH 70/87] fix `modadd` in debug builds on CUDA In a debug build (`-d:debugCuda`) modular addition produced off by one errors. As it turned out the problem was our calculation of `overflowedLimbs` inside of `finalSubMayOverflow`. The carry flag set in the last `add_cio` call in `modadd` does not reliably survive into the function call in a debug build. We compute it directly after computing the last `add_cio` call in `modadd` and simply pass it as an argument. This way arithmetic also works reliably on a debug build. 
--- .../experimental/gpu_field_ops.nim | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 46c458f4..cc96abc3 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -431,7 +431,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts - proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = + proc finalSubMayOverflow(a, M: BigInt, overflowedLimbs: uint32): BigInt {.device.} = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -442,9 +442,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) var scratch: BigInt = BigInt() - # Contains 0x0001 (if overflowed limbs) or 0x0000 - let overflowedLimbs = add_ci(0'u32, 0'u32) - # Now substract the modulus, and test a < M with the last borrow scratch[0] = sub_bo(a[0], M[0]) staticFor i, 1, N: @@ -454,9 +451,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # 2. if a >= M, underflowedModulus >= 0 # if underflowedModulus >= 0: a-M else: a # TODO: predicated mov instead? - ## TODO: Fix this. `slct` needs a negative value for the else branch let underflowedModulus = sub_bi(overflowedLimbs, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -479,9 +474,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = scratch[i] = sub_bio(a[i], M[i]) # If it underflows here, `a` was smaller than the modulus, which is what we want - ## TODO: Fix this. 
`slct` needs a negative value for the else branch let underflowedModulus = sub_bi(0'u32, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -490,18 +483,21 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc modadd(a, b, M: BigInt): BigInt {.device.} = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - # try to add two bigints var t = BigInt() # temporary t[0] = add_co(a[0], b[0]) staticFor i, 1, N: t[i] = add_cio(a[i], b[i]) - # can use `when` of course! - when spareBits() >= 1: # if spareBits() >= 1: # would also work + when spareBits() >= 1: t = finalSubNoOverflow(t, M) else: - t = finalSubMayOverflow(t, M) + # Contains 0x0001 (if overflowed limbs) or 0x0000 + # This _must_ be computed here and not inside of `finalSubMayOverflow`. In a + # debug build on CUDA the carry flag would (potentially) be reset going into + # the function. + let overflowedLimbs = add_ci(0'u32, 0'u32) + t = finalSubMayOverflow(t, M, overflowedLimbs) return t From 8ae1427d9e45dc2827ba6267351cde99bfe81837 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 1 Sep 2025 09:44:00 +0200 Subject: [PATCH 71/87] fix reassignment of `types` in RT codegen for aliases --- constantine/math_compiler/experimental/gpu_compiler.nim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 51207212..01548f7b 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -133,7 +133,12 @@ proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = for fn in gen.procs: # assign generics info to correct table ctx.genericInsts[fn.pName] = fn for typ in gen.types: # assign generics info to correct table - ctx.types[typ.tTyp] = typ + case typ.kind + of gpuTypeDef: 
+ ctx.types[typ.tTyp] = typ + of gpuAlias: + ctx.types[typ.aTyp] = typ + else: raiseAssert "Unexpected node kind assigning to `types`: " & $typ result = ctx.codegen(ast, kernel) From ac91c058e59fb3dfa629ac2c4d20f463ddcca483 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 1 Sep 2025 18:07:37 +0200 Subject: [PATCH 72/87] handle `nnkObjConstr` correctly when encountering gtGenericInst --- constantine/math_compiler/experimental/nim_to_gpu.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7be15c56..725c17cb 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1145,7 +1145,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type - let flds = typ.oFields + let flds = if typ.kind == gtObject: typ.oFields + elif typ.kind == gtGenericInst: typ.gFields + else: raiseAssert "ObjConstr must have an object type: " & $typ # find all fields that have been defined by the user var ocFields: seq[GpuFieldInit] for i in 1 ..< node.len: # all fields to be init'd From 04ddb398001a35f012a20b2cfcfc4fdb05a3bf5d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 12:40:34 +0200 Subject: [PATCH 73/87] do not emit `f` suffix for float literals Nim float literals when converting to strings already come with a `f` suffix. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index d908f22b..b307cdb5 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -25,7 +25,7 @@ proc literalSuffix(t: GpuType): string = of gtUint32: "u" of gtInt32: "" # NOTE: We DON'T give as suffix to `i32` literals so that we can rely on more cases # where WebGPU allows literals to be converted automatically! - of gtFloat32: "f" + of gtFloat32: "" # NOTE: float suffixes _already_ come with an `f` suffix in Nim! else: "" proc toAddressSpace(symKind: GpuSymbolKind): AddressSpace = From 7b51b65b86f89d698282de46a630592684645977 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 12:40:59 +0200 Subject: [PATCH 74/87] [wgsl] append `;` for aliases --- constantine/math_compiler/experimental/backends/wgsl.nim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index b307cdb5..0706dfc3 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -975,7 +975,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add "}" of gpuAlias: - result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) + # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped + # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons + # after alias definitions, but not after `struct`. 
Hence we add `;` manually here + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ";" of gpuObjConstr: result = gpuTypeToString(ast.ocType) & "(" From 2d5b367df2e6ca0ebaba04f089770ade6d5a8760 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 13:00:14 +0200 Subject: [PATCH 75/87] rewrite infix as `gpuCall` if arguments not basic types The idea is that if the arguments are not basic types, we will need a custom function to perform the infix operation. Most backends however do not support custom operators and hence actual `gpuBinOp` are not valid for custom types in general. Hence, we rewrite them as `gpuCall` nodes with non-symbol based naming. NOTE: Currently this does not handle the case where we might use an inbuilt type like `vec3` and its implementation of infix operators that _may_ be defined after all. We need to find an elegant solution for that. Either by checking if argument are of basic types (like in this commit) or if they are a type annotated with `{.builtin.}`. Alternatively, we could force the user to define operators for such inbuilt types (i.e. wrap them) and then if there is a wrapper that is marked `{.builtin.}` we don't replace infix by `gpuCall` either. 
--- .../experimental/backends/cuda.nim | 2 +- .../experimental/backends/wgsl.nim | 10 +-- .../math_compiler/experimental/gpu_types.nim | 10 ++- .../math_compiler/experimental/nim_to_gpu.nim | 63 ++++++++++++++----- 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 92f63efa..6f3f33bb 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -300,7 +300,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let l = ctx.genCuda(ast.bLeft) let r = ctx.genCuda(ast.bRight) result = indentStr & "(" & l & " " & - ast.bOp & " " & + ctx.genCuda(ast.bOp) & " " & r & ")" of gpuIdent: diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 0706dfc3..2e98b72c 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -499,15 +499,17 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = proc rewriteCompoundAssignment(n: GpuAst): GpuAst = doAssert n.kind == gpuBinOp - if n.bOp in ["<=", "==", ">=", "!="]: return n + if n.bOp.ident() in ["<=", "==", ">=", "!="]: return n template genAssign(left, rnode, op: typed): untyped = let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) - let op = n.bOp + let op = n.bOp.ident() if op.len >= 2 and op[^1] == '=': - result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last + var opAst = GpuAst(kind: gpuIdent, iName: op[0 .. 
^2]) + opAst.iSym = opAst.iName + result = genAssign(n.bLeft, n.bRight, opAst) # all but last else: # leave untouched result = n @@ -937,7 +939,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let l = ctx.genWebGpu(ast.bLeft) let r = ctx.genWebGpu(ast.bRight) result = indentStr & "(" & l & " " & - ast.bOp & " " & + ctx.genWebGpu(ast.bOp) & " " & r & ")" of gpuIdent: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index cac35019..dd6e23df 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -127,8 +127,10 @@ type wCond*: GpuAst wBody*: GpuAst of gpuBinOp: - bOp*: string + bOp*: GpuAst # `gpuIdent` of the binary operation bLeft*, bRight*: GpuAst + # types of left and right nodes. Determined from Nim symbol associated with `bOp` + bLeftTyp*, bRightTyp*: GpuType of gpuVar: vName*: GpuAst ## Will be a `GpuIdent` vType*: GpuType @@ -391,9 +393,11 @@ proc clone*(ast: GpuAst): GpuAst = result.wBody = ast.wBody.clone() of gpuBinOp: result = GpuAst(kind: gpuBinOp) - result.bOp = ast.bOp + result.bOp = ast.bOp.clone() result.bLeft = ast.bLeft.clone() result.bRight = ast.bRight.clone() + result.bLeftTyp = ast.bLeftTyp.clone() + result.bRightTyp = ast.bRightTyp.clone() of gpuVar: result = GpuAst(kind: gpuVar) result.vName = ast.vName.clone() @@ -678,7 +682,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.wCond, indent + 2) result.add pretty(n.wBody, indent + 2) of gpuBinOp: - result.add idd("Ident", n.bOp) + result.add pretty(n.bOp, indent + 2) result.add pretty(n.bLeft, indent + 2) result.add pretty(n.bRight, indent + 2) of gpuVar: diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 725c17cb..06588ae0 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ 
b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -990,22 +990,53 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkInfix: result = GpuAst(kind: gpuBinOp) - # if left/right is boolean we need logical AND/OR, otherwise - # bitwise - let isBoolean = node[1].typeKind == ntyBool - result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name - result.bLeft = ctx.toGpuAst(node[1]) - result.bRight = ctx.toGpuAst(node[2]) - - # We patch the types of int / float literals. WGSL does not automatically convert literals - # to the target type. Determining the type here _can_ fail. In that case the - # `lType` field will just be `gtVoid`, like the default. - if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit: - # determine literal type based on `bRight` - result.bLeft.lType = nimToGpuType(node[2], allowToFail = true) - elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit: - # determine literal type based on `bLeft` - result.bRight.lType = nimToGpuType(node[1], allowToFail = true) + # Using `getType` to get the types of the arguuments + let typ = node[0].getTypeImpl() # e.g. + doAssert typ.kind == nnkProcTy, "Infix node is not a proc but: " & $typ.treerepr + # BracketExpr + # Sym "proc" + # Sym "int" <- return type + # Sym "int" <- left op type + # Sym "int" <- right op type + result.bLeftTyp = nimToGpuType(typ[0][1]) + result.bRightTyp = nimToGpuType(typ[0][2]) + # if either is not a base type (`gtBool .. gtSize_t`) we actually deal with a _function call_ + # instead of an binary operation. Will thus rewrite. + proc ofBasicType(t: GpuType, allowPtrLhs: bool): bool = + ## Determines if the given type is a basic POD type *or* a simple pointer to it. + ## This is because some infix nodes, e.g. `x += y` will have LHS arguments that are + ## `var T`, which appear as an implicit pointer here. 
+ ## + ## TODO: Handle the case of backend inbuilt special types (like `vec3`), which may indeed + ## have inbuilt infix operators. Either by checking if the type has a `{.builtin.}` pragma + ## _or_ if there is a wrapped proc for this operator and if so do not rewrite as `gpuCall` + ## if that exists. + result = (t.kind in gtBool .. gtSize_t) + if allowPtrLhs: + result = result or ((t.kind == gtPtr) and t.implicit and t.to.kind in gtBool .. gtSize_t) + + if not result.bLeftTyp.ofBasicType(true) or not result.bRightTyp.ofBasicType(false): + result = GpuAst(kind: gpuCall) + result.cName = ctx.getFnName(node[0]) + result.cArgs = @[ctx.toGpuAst(node[1]), ctx.toGpuAst(node[2])] + else: + # if left/right is boolean we need logical AND/OR, otherwise bitwise + let isBoolean = result.bLeftTyp.kind == gtBool + var op = GpuAst(kind: gpuIdent, iName: assignOp(node[0].repr, isBoolean)) # repr so that open sym choice gets correct name + op.iSym = op.iName + result.bOp = op + result.bLeft = ctx.toGpuAst(node[1]) + result.bRight = ctx.toGpuAst(node[2]) + + # We patch the types of int / float literals. WGSL does not automatically convert literals + # to the target type. Determining the type here _can_ fail. In that case the + # `lType` field will just be `gtVoid`, like the default. 
+ if result.bLeft.kind == gpuLit: # and result.bRight.kind != gpuLit: + # determine literal type based on `bRight` + result.bLeft.lType = result.bLeftTyp # nimToGpuType(node[2], allowToFail = true) + elif result.bRight.kind == gpuLit: # and result.bLeft.kind != gpuLit: + # determine literal type based on `bLeft` + result.bRight.lType = result.bRightTyp #nimToGpuType(node[1], allowToFail = true) of nnkDotExpr: ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT From c779258172c6711903c836bbeaf5b71b595fb000 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 17:00:47 +0200 Subject: [PATCH 76/87] support arbitrary values in array literals --- constantine/math_compiler/experimental/backends/cuda.nim | 2 +- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- constantine/math_compiler/experimental/gpu_types.nim | 7 ++++--- constantine/math_compiler/experimental/nim_to_gpu.nim | 5 ++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 6f3f33bb..34a96445 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -314,7 +314,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuArrayLit: result = "{" for i, el in ast.aValues: - result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el + result.add "(" & gpuTypeToString(ast.aLitType) & ")" & ctx.genCuda(el) if i < ast.aValues.high: result.add ", " result.add "}" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2e98b72c..e9501440 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -959,7 +959,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): 
string = of gpuArrayLit: result = "array(" for i, el in ast.aValues: - result.add gpuTypeToString(ast.aLitType) & "(" & el & ")" + result.add gpuTypeToString(ast.aLitType) & "(" & ctx.genWebGpu(el) & ")" if i < ast.aValues.high: result.add ", " result.add ")" diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index dd6e23df..834d8fa5 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -155,7 +155,7 @@ type cValue*: GpuAst # not just a string to support different types easily cType*: GpuType of gpuArrayLit: - aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array + aValues*: seq[GpuAst] aLitType*: GpuType # type of first element of gpuBlock: isExpr*: bool ## Whether this block represents an expression, i.e. it returns something @@ -429,7 +429,8 @@ proc clone*(ast: GpuAst): GpuAst = result.cType = ast.cType.clone() of gpuArrayLit: result = GpuAst(kind: gpuArrayLit) - result.aValues = ast.aValues + for a in ast.aValues: + result.aValues.add a.clone() result.aLitType = ast.aLitType.clone() of gpuPrefix: result = GpuAst(kind: gpuPrefix) @@ -705,7 +706,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.cValue, indent + 2) of gpuArrayLit: for el in n.aValues: - result.add id(el) + result.add pretty(el, indent + 2) of gpuBlock: if n.blockLabel.len > 0: result.add id("Label", n.blockLabel) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 06588ae0..7176eb0f 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1241,10 +1241,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkBracket: let aLitTyp = nimToGpuType(node[0]) - var aValues = newSeq[string]() + var aValues = newSeq[GpuAst]() for el in node: 
- ## XXX: Support not just int literals - aValues.add $el.intVal + aValues.add ctx.toGpuAst(el) result = GpuAst(kind: gpuArrayLit, aValues: aValues, aLitType: aLitTyp) From e395ecd2e1a094feabd3980407bdab0c791a2531 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 17:04:04 +0200 Subject: [PATCH 77/87] handle generic instantiation with actual initializations I.e. instead of just: ```nim Vec[float32]() ``` being: ``` ObjConstr BracketExpr Sym "Vec3" Sym "float32" ``` Also handle the case of: ``` Vec3[float32](limbs: [1'f32, 1'f32, 1'f32]) ``` being: ``` ObjConstr BracketExpr Sym "Vec3" Sym "float32" ExprColonExpr Sym "limbs" Bracket Float32Lit 1.0 Float32Lit 1.0 Float32Lit 1.0 ``` --- .../math_compiler/experimental/nim_to_gpu.nim | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7176eb0f..59d473a4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -79,20 +79,41 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = raiseAssert "Not supported yet: " & $t proc parseTypeFields(node: NimNode): seq[GpuTypeField] + +proc getGenericTypeName(t: NimNode): string = + ## Returns the base name of the generic type, i.e. for + ## `Foo[Bar, Baz]` returns `Foo`. 
+ case t.kind + of nnkSym: result = t.strVal + of nnkBracketExpr: result = t[0].getGenericTypeName() + else: raiseAssert "Unexpected node kind for generic instantiation type: " & $t.treerepr + +proc parseGenericArgs(t: NimNode): seq[GpuType] = + case t.kind + of nnkSym: return # no generic arguments + of nnkBracketExpr: + for i in 1 ..< t.len: + result.add nimToGpuType(t[i]) + else: + raiseAssert "Unexpected node kind in parseGenericArgs: " & $t.treerepr + proc initGpuGenericInst(t: NimNode): GpuType = doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind case t.kind of nnkBracketExpr: # regular generic instantiation - result = GpuType(kind: gtGenericInst, gName: t[0].repr) - for i in 1 ..< t.len: # grab all generic arguments - let typ = nimToGpuType(t[i]) - result.gArgs.add typ + result = GpuType(kind: gtGenericInst, gName: getGenericTypeName(t)) + result.gArgs = parseGenericArgs(t) # now parse the object fields let impl = t.getTypeImpl() # impl for the `gFields` result.gFields = parseTypeFields(impl) of nnkObjConstr: - doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr - result = initGpuGenericInst(t[0]) + if t.len == 1: # Generic instantiation without arguments + result = initGpuGenericInst(t[0]) + elif t.len == 2: # ...and with arguments + doAssert t[1].kind == nnkExprColonExpr, "ObjConstr does not contain initialization as [1], but: " & $t.treerepr + result = initGpuGenericInst(t[0]) + else: + raiseAssert "Unexpected number of elements in `nnkObjConstr` node for generic instantiation: " & $t.treerepr of nnkSym: let impl = getTypeImpl(t) case impl.kind From ea2b36e29ca96b4dc612cf8b06bfd001b7540e2a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 18:33:08 +0200 Subject: [PATCH 78/87] fix top level type definitions Because we still had the `farmTopLevel` adding a (now empty) element to the `globalBlocks` our array access to 
`ctx.globalBlocks[0]` to generate the types didn't actually emit anything. --- .../math_compiler/experimental/backends/common_utils.nim | 8 ++++---- constantine/math_compiler/experimental/backends/cuda.nim | 4 +--- constantine/math_compiler/experimental/backends/wgsl.nim | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim index 01ee5d33..076625cf 100644 --- a/constantine/math_compiler/experimental/backends/common_utils.nim +++ b/constantine/math_compiler/experimental/backends/common_utils.nim @@ -16,12 +16,12 @@ proc isGlobal*(fn: GpuAst): bool = doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind result = attGlobal in fn.pAttributes -proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = +proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock: var GpuAst) = ## Farms the top level of the code for functions, variable and type definition. ## All functions are added to the `allFnTab`, while only global ones (or even only ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining ## logic. - ## Variables and types are collected in `varBlock` and `typBlock`. + ## Variables are collected in `varBlock`. case ast.kind of gpuProc: ctx.allFnTab[ast.pName] = ast @@ -32,10 +32,10 @@ proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, t of gpuBlock: # could be a type definition or global variable for ch in ast: - ctx.farmTopLevel(ch, kernel, varBlock, typBlock) + ctx.farmTopLevel(ch, kernel, varBlock) of gpuVar, gpuConstexpr: varBlock.statements.add ast of gpuTypeDef, gpuAlias: - typBlock.statements.add ast + raiseAssert "Unexpected type def / alias def found. 
These should be in `ctx.types` now: " & $ast else: discard diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 34a96445..ec80aabf 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -170,10 +170,8 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 2. Fill table with all *global* functions or *only* the specific `kernel` # if any given var varBlock = GpuAst(kind: gpuBlock) - var typBlock = GpuAst(kind: gpuBlock) - ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ctx.globalBlocks.add typBlock ## XXX: `typBlock` should now always be empty, as we pass all ## found types into `ctx.types` diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index e9501440..716f4fb8 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -732,10 +732,8 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 1. 
Fill table with all *global* functions or *only* the specific `kernel` # if any given var varBlock = GpuAst(kind: gpuBlock) - var typBlock = GpuAst(kind: gpuBlock) - ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ctx.globalBlocks.add typBlock ## XXX: `typBlock` should now always be empty, as we pass all ## found types into `ctx.types` @@ -743,8 +741,10 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for k, v in pairs(ctx.genericInsts): ctx.allFnTab[k] = v # And all the known types + var typBlock = GpuAst(kind: gpuBlock) for k, typ in pairs(ctx.types): - ctx.globalBlocks.add typ + typBlock.statements.add typ + ctx.globalBlocks.add typBlock # 2. Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table From 96508f6bd0e30e10c2ae3c8c6eb3e503e0e90f58 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 11:49:20 +0200 Subject: [PATCH 79/87] [cuda] add pass to strip `deref` if found inside `index` for pointers But only strip if not pointer to an array type! --- .../experimental/backends/cuda.nim | 28 +++++++++++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 24 ++-------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index ec80aabf..3e0e523a 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -152,6 +152,30 @@ proc scanFunctions(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.scanFunctions(ch) +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = + ## Addresses other AST patterns that need to be rewritten on CUDA. 
Aspects + ## that are rewritten include: + ## + ## - `Index` of `Deref` of `Ident` needs to be rewritten to `Index` of `Ident` if the + ## ident is a pointer type, because `[]` is syntactic sugar for pointer arithmetic + ## (unless the argument is a pointer to a static array) + case n.kind + of gpuIndex: + ## TODO: Assuming we have a more complicated expression instead of a `gpuIdent` in the deref + ## we won't perform replacement, but likely we should. Might use something like `determineIdent` + ## as used on WGSL in the future. Anyway, worts case this will lead to a NVRTC compile time error. + if n.iArr.kind == gpuDeref and + n.iArr.dOf.kind == gpuIdent and + n.iArr.dOf.iTyp.kind == gtPtr and # identifier is a pointer? + n.iArr.dOf.iTyp.to.kind != gtArray: # but not to an array + n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) @@ -184,6 +208,10 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = let fnOrig = ctx.allFnTab[fnIdent] ctx.scanFunctions(fn) + # 4. Finalize the code by performing some required AST transformations to make the code valid. + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ## The actual CUDA code generator. 
let indentStr = " ".repeat(indent) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 59d473a4..1f63c749 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1286,27 +1286,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # `HiddenAddr` appears for accesses to `var` passed arguments result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0])) - of nnkHiddenDeref: - case node.typeKind - of ntyUncheckedArray: - # `getTypeInst(node)` would yield: - # BracketExpr - # Sym "UncheckedArray" - # Sym "uint32" - # i.e. it is a `ptr UncheckedArray[T]` - # In this case we just ignore the deref, because on the CUDA - # side it is just a plain pointer array we index into using - # `foo[i]`. - result = ctx.toGpuAst(node[0]) - else: - # Otherwise we treat it like a regular deref - # HiddenDeref - # Sym "x" - # With e.g. `getTypeInst(node) = Sym "BigInt"` - # and `node.typeKind = ntyObject` - # due to a `var` parameter - result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) - of nnkDerefExpr: #, nnkHiddenDeref: + of nnkDerefExpr, nnkHiddenDeref: + # treat hidden and regular deref the same nowadays. On some backends may strip derefs, if + # they appear e.g. 
in an `gpuIndex` (CUDA) result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) of nnkConstDef: From c823f9feb159534f774b82b369bc9d16983c5d16 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 11:53:42 +0200 Subject: [PATCH 80/87] remove old comment --- constantine/math_compiler/experimental/backends/cuda.nim | 2 -- 1 file changed, 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 3e0e523a..8544a666 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -196,8 +196,6 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = var varBlock = GpuAst(kind: gpuBlock) ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ## XXX: `typBlock` should now always be empty, as we pass all - ## found types into `ctx.types` # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. From 6527759bb71f0c951790b84ba424e5bf97cc7fa2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 15:23:50 +0200 Subject: [PATCH 81/87] better handle replacement of derefs This allows us to make a better choice about when to replace and when not to replace. --- .../experimental/backends/cuda.nim | 63 ++++++++++++++++--- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 8544a666..37dbfb8f 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -152,6 +152,56 @@ proc scanFunctions(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.scanFunctions(ch) +proc getFieldType(t: GpuType, field: GpuAst): GpuType = + ## Returns the type of the field. 
`t` must be an object or generic instantiation. + ## `field` must be an ident. + doAssert field.kind == gpuIdent, "Field is not an ident: " & $field + doAssert t.kind in [gtObject, gtGenericInst] + let flds = if t.kind == gtObject: t.oFields + else: t.gFields + result = GpuType(kind: gtInvalid) + for f in flds: + if f.name == field.ident(): + return f.typ + +proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType = + ## Tries to determine the underlying type of the AST. + ## + ## If `typeOfIndex` is `true`, we return the type of the index we access. Otherwise + ## we return the type of the array / pointer. + ## + ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types! + template dfl(): untyped = GpuType(kind: gtInvalid) + case arg.kind + of gpuIdent: arg.iTyp + of gpuAddr: GpuType(kind: gtPtr, to: ctx.getType(arg.aOf)) + of gpuDeref: + let argTyp = ctx.getType(arg.dOf) + doAssert argTyp.kind == gtPtr + argTyp.to + of gpuCall: dfl() + of gpuIndex: + let arrType = ctx.getType(arg.iArr) + if typeOfIndex: + case arrType.kind + of gtPtr: arrType.to + of gtUA: arrType.uaTo + of gtArray: arrType.aTyp + else: raiseAssert "`gpuIndex` cannot be of a non pointer / array type: " & $arrType + else: + arrType + of gpuDot: + let parentTyp = ctx.getType(arg.dParent) + parentTyp.getFieldType(arg.dField) + of gpuLit: arg.lType + of gpuBinOp: dfl() ## XXX: store resulting type of `gpuBinOp`! + #of gpuBlock: arg.statements[^1].getType() + of gpuPrefix: ctx.getType(arg.pVal) + of gpuConv: arg.convTo + of gpuCast: arg.cTo # ident of the thing we cast + else: + raiseAssert "Not implemented to determine type from node: " & $arg + proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = ## Addresses other AST patterns that need to be rewritten on CUDA. 
Aspects ## that are rewritten include: @@ -161,14 +211,11 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = ## (unless the argument is a pointer to a static array) case n.kind of gpuIndex: - ## TODO: Assuming we have a more complicated expression instead of a `gpuIdent` in the deref - ## we won't perform replacement, but likely we should. Might use something like `determineIdent` - ## as used on WGSL in the future. Anyway, worts case this will lead to a NVRTC compile time error. - if n.iArr.kind == gpuDeref and - n.iArr.dOf.kind == gpuIdent and - n.iArr.dOf.iTyp.kind == gtPtr and # identifier is a pointer? - n.iArr.dOf.iTyp.to.kind != gtArray: # but not to an array - n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) + if n.iArr.kind == gpuDeref: + # get type of deref'd node, but do not fold `gpuIndex` (i.e. get type of collection) + let typ = ctx.getType(n, typeOfIndex = false) + if typ.kind != gtArray: + n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) else: for ch in mitems(n): ctx.makeCodeValid(ch) From 3095d4fd649241104d9405d6ce5277c05563e128 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 18:08:49 +0200 Subject: [PATCH 82/87] remove Nim gensym'd suffix from `tmpTuple` variables I.e. 
variables that correspond to tuple unpacking in Nim --- constantine/math_compiler/experimental/nim_to_gpu.nim | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 1f63c749..ec23ac1a 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1104,6 +1104,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.iName == "_": result.iName = "tmp_" & $ctx.genSymCount inc ctx.genSymCount + elif result.iName.startsWith("tmpTuple_"): # will have a Nim gensym'd suffix, replace by custom counter + result.iName = "tmpTuple_" & $ctx.genSymCount + result.iSym = result.iName & "_" & node.signatureHash() # and update the iSym to not be based on Nim's value either + inc ctx.genSymCount ctx.sigTab[s] = result else: result = ctx.sigTab[s] From 59c16b676d7de84a0d76381306da3326e5492207 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:40:23 +0200 Subject: [PATCH 83/87] replace single letter strings by characters --- .../experimental/backends/cuda.nim | 82 +++++++++---------- .../experimental/backends/wgsl.nim | 76 ++++++++--------- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 37dbfb8f..4899c3f5 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -70,7 +70,7 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, # so as our ident we pass `theIdent = (*)` and generate the type for the internal # array type, which yields e.g. `BigInt [4]`. 
let ptrStar = gpuTypeToString(t.kind) - result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")") + result = gpuTypeToString(t.to, '(' & ptrStar & ident & ')') skipIdent = true else: let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent) @@ -87,30 +87,30 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, of gtArray: # nested array let typ = getInnerArrayType(t) # get inner most type let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...` - result = typ & " " & ident & lengths + result = typ & ' ' & ident & lengths else: # NOTE: Nested arrays don't have an inner identifier! if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & "[]" else: - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & '[' & $t.aLen & ']' skipIdent = true of gtGenericInst: # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. result = t.gName if t.gArgs.len > 0: - result.add "_" + result.add '_' for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: - result.add "_" + result.add '_' of gtObject: result = t.name of gtUA: result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T? 
else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident - result.add " " & ident + result.add ' ' & ident proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = ## Returns the correct function with its return type @@ -275,30 +275,30 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) # extern "C" is needed to avoid name mangling - result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & + result = indentStr & "extern \"C\" " & attrs.join(" ") & ' ' & fnSig if ast.forwardDeclare: - result.add ";" + result.add ';' else: result.add "{\n" result &= ctx.genCuda(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add ctx.genCuda(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: - let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & " " + let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & ' ' else: "" result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident()) # If there is an initialization, the type might require a memcpy @@ -320,35 +320,35 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # skip semicolon in the condition. 
Otherwise can lead to problematic code ctx.withoutSemicolon: # skip semicolon for if bodies result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n" - result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genCuda(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(int " & ast.fVar.ident() & " = " & ctx.genCuda(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genCuda(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genCuda(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.fBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n" - result &= ctx.genCuda(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField) + result = ctx.genCuda(ast.dParent) & '.' 
& ctx.genCuda(ast.dField) of gpuIndex: - result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]" + result = ctx.genCuda(ast.iArr) & '[' & ctx.genCuda(ast.iIndex) & ']' of gpuCall: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -370,25 +370,25 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ctx.withoutSemicolon: let l = ctx.genCuda(ast.bLeft) let r = ctx.genCuda(ast.bRight) - result = indentStr & "(" & l & " " & - ctx.genCuda(ast.bOp) & " " & - r & ")" + result = indentStr & '(' & l & ' ' & + ctx.genCuda(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder else: result = ast.lValue of gpuArrayLit: result = "{" for i, el in ast.aValues: - result.add "(" & gpuTypeToString(ast.aLitType) & ")" & ctx.genCuda(el) + result.add '(' & gpuTypeToString(ast.aLitType) & ')' & ctx.genCuda(el) if i < ast.aValues.high: result.add ", " - result.add "}" + result.add '}' of gpuReturn: result = indentStr & "return " & ctx.genCuda(ast.rValue) @@ -400,7 +400,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "struct " & gpuTypeToString(ast.tTyp) & "{\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" - result.add "}" + result.add '}' of gpuObjConstr: result = "{" @@ -408,7 +408,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add ctx.genCuda(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add "}" + result.add '}' of gpuInlineAsm: result = indentStr & "asm(" & ast.stmt.strip & ");" @@ -417,15 +417,15 @@ proc 
genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = "(" & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.convExpr) + result = '(' & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.convExpr) of gpuCast: - result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr) + result = '(' & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.cExpr) of gpuAddr: - result = "(&" & ctx.genCuda(ast.aOf) & ")" + result = "(&" & ctx.genCuda(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genCuda(ast.dOf) & ")" + result = "(*" & ctx.genCuda(ast.dOf) & ')' of gpuConstexpr: ## TODO: We need to change the code such that we emit `constexpr` inside of procs and @@ -436,7 +436,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = if ast.cType.kind == gtArray: result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) else: - result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & ' ' & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) else: echo "Unhandled node kind in genCuda: ", ast.kind @@ -454,7 +454,7 @@ proc codegen*(ctx: var GpuContext): string = for (fnIdent, fn) in fns: let fnC = fn.clone() fnC.forwardDeclare = true - result.add ctx.genCuda(fnC) & "\n" + result.add ctx.genCuda(fnC) & '\n' result.add "\n\n" for fnIdent, fn in ctx.fnTab: diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 716f4fb8..35a80f32 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ 
b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -117,13 +117,13 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. result = t.gName if t.gArgs.len > 0: - result.add "_" + result.add '_' for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: - result.add "_" + result.add '_' of gtObject: result = t.name - of gtUA: result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">" + of gtUA: result = gpuTypeToString(t.kind) & '<' & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & '>' else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident @@ -295,7 +295,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string ## the information we precisely want to extract (different symbol kind etc) would make ## it so that we cannot look up elements. doAssert n.kind == gpuCall, "Not a call, but: " & $n.kind - result = n.cName.ident() & "_" + result = n.cName.ident() & '_' for i, arg in n.cArgs: let p = params[i] var s: string @@ -312,7 +312,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string s = shortAddrSpace(addrSpace) & m result.add s if i < n.cArgs.high: - result.add "_" + result.add '_' proc makeFnGeneric(fn: GpuAst, gi: GenericInst): GpuAst = ## Returns a (shallow) copy of the input function (which is a clone of the @@ -823,20 +823,20 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "fn " & fnSig & " {\n" result &= ctx.genWebGpu(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add 
ctx.genWebGpu(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: let letOrVar = if ast.vMutable: "var" else: "let" @@ -872,7 +872,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and # we patched the `bool -> i32` and thus need to convert it. - result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ")" + result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ')' else: result = indentStr & ctx.genWebGpu(ast.aLeft) & " = " & ctx.genWebGpu(ast.aRight) @@ -886,36 +886,36 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "if (false) {\n" else: result = indentStr & "if (" & ctx.genWebGpu(ast.ifCond) & ") {\n" - result &= ctx.genWebGpu(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genWebGpu(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(var " & ast.fVar.ident() & ": i32 = " & ctx.genWebGpu(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genWebGpu(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genWebGpu(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.fBody, indent + 1) & '\n' + result &= indentStr & 
'}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genWebGpu(ast.wCond) & "){\n" - result &= ctx.genWebGpu(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genWebGpu(ast.dParent) & "." & ctx.genWebGpu(ast.dField) + result = ctx.genWebGpu(ast.dParent) & '.' & ctx.genWebGpu(ast.dField) of gpuIndex: - result = ctx.genWebGpu(ast.iArr) & "[" & ctx.genWebGpu(ast.iIndex) & "]" + result = ctx.genWebGpu(ast.iArr) & '[' & ctx.genWebGpu(ast.iIndex) & ']' of gpuCall: ctx.withoutSemicolon: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -938,15 +938,15 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ctx.withoutSemicolon: let l = ctx.genWebGpu(ast.bLeft) let r = ctx.genWebGpu(ast.bRight) - result = indentStr & "(" & l & " " & - ctx.genWebGpu(ast.bOp) & " " & - r & ")" + result = indentStr & '(' & l & ' ' & + ctx.genWebGpu(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": ## TODO: We could "manually" construct a zero version! ## NOTE: There *are* default initializations to zero. 
Just not for fields that @@ -959,10 +959,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuArrayLit: result = "array(" for i, el in ast.aValues: - result.add gpuTypeToString(ast.aLitType) & "(" & ctx.genWebGpu(el) & ")" + result.add gpuTypeToString(ast.aLitType) & '(' & ctx.genWebGpu(el) & ')' if i < ast.aValues.high: result.add ", " - result.add ")" + result.add ')' of gpuReturn: result = indentStr & "return " & ctx.genWebGpu(ast.rValue) @@ -974,16 +974,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "struct " & gpuTypeToString(ast.tTyp) & " {\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" - result.add "}" + result.add '}' of gpuAlias: # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons # after alias definitions, but not after `struct`. Hence we add `;` manually here - result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ";" + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ';' of gpuObjConstr: - result = gpuTypeToString(ast.ocType) & "(" + result = gpuTypeToString(ast.ocType) & '(' for i, el in ast.ocFields: if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": # use type to construct a default value @@ -993,7 +993,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add ctx.genWebGpu(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add ")" + result.add ')' of gpuInlineAsm: raiseAssert "Inline assembly not supported on the WebGPU target." 
@@ -1002,16 +1002,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & "(" & ctx.genWebGpu(ast.convExpr) & ")" + result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & '(' & ctx.genWebGpu(ast.convExpr) & ')' of gpuCast: - result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ")" + result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ')' of gpuAddr: - result = "(&" & ctx.genWebGpu(ast.aOf) & ")" + result = "(&" & ctx.genWebGpu(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genWebGpu(ast.dOf) & ")" + result = "(*" & ctx.genWebGpu(ast.dOf) & ')' of gpuConstexpr: result = indentStr & "const " & ctx.genWebGpu(ast.cIdent) & ": " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " = " & ctx.genWebGpu(ast.cValue) @@ -1050,7 +1050,7 @@ proc codegen*(ctx: var GpuContext): string = # 1. Generate the header for all global variables for id, g in ctx.globals: result.add genGlobal(g) - result.add "\n" + result.add '\n' # 2. 
generate code for the global blocks (types, global vars etc) for blk in ctx.globalBlocks: From 096b5bc7ae3315e028d6c760da7afee56c20820f Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:40:44 +0200 Subject: [PATCH 84/87] extend `maybeAddType` to find types behind Ptr/UA/Array --- constantine/math_compiler/experimental/nim_to_gpu.nim | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ec23ac1a..5373581b 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -616,11 +616,21 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead result = n.getTypeInst.nimToGpuType() +proc stripPtrOrArrayType(t: GpuType): GpuType = + ## Strips any pointer or array type to return any struct / generic instantiation + ## it might contain + case t.kind + of gtPtr: result = stripPtrOrArrayType t.to + of gtUA: result = stripPtrOrArrayType t.uaTo + of gtArray: result = stripPtrOrArrayType t.aTyp + else: result = t + proc maybeAddType*(ctx: var GpuContext, typ: GpuType) = ## Adds the given type to the table of known types, if it is some kind of ## object type. ## ## XXX: What about aliases and distincts? 
+ let typ = typ.stripPtrOrArrayType() # get any underlying type if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types: ctx.types[typ] = toTypeDef(typ) From 1aaf1b8b0f8fd6b0400f03f9b4b7bab290bc858c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:53:50 +0200 Subject: [PATCH 85/87] move all `builtins` into separate files, one for each backend --- .../experimental/builtins/builtins.nim | 19 ++++ .../experimental/builtins/common_builtins.nim | 46 +++++++++ .../experimental/builtins/cuda_builtins.nim | 48 ++++++++++ .../experimental/builtins/wgsl_builtins.nim | 26 ++++++ .../experimental/gpu_compiler.nim | 93 +------------------ 5 files changed, 143 insertions(+), 89 deletions(-) create mode 100644 constantine/math_compiler/experimental/builtins/builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/common_builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/cuda_builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/wgsl_builtins.nim diff --git a/constantine/math_compiler/experimental/builtins/builtins.nim b/constantine/math_compiler/experimental/builtins/builtins.nim new file mode 100644 index 00000000..da079e83 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/builtins.nim @@ -0,0 +1,19 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# NOTE: For the moment we import and export builtins here for all backends. 
+# Once we change the code to make single backends importable on their own, +# this will be changed and these builtins will be imported/exported in the +# corresponding CUDA/WGSL etc module the user needs to import. +import ./common_builtins +import ./cuda_builtins +import ./wgsl_builtins + +export common_builtins +export cuda_builtins +export wgsl_builtins diff --git a/constantine/math_compiler/experimental/builtins/common_builtins.nim b/constantine/math_compiler/experimental/builtins/common_builtins.nim new file mode 100644 index 00000000..b9a144a5 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/common_builtins.nim @@ -0,0 +1,46 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +#import std / [macros, strutils, sequtils, options, sugar, tables, strformat, hashes, sets] +# +#import ./gpu_types +#import ./backends/backends +#import ./nim_to_gpu +# +#export gpu_types + +template nimonly*(): untyped {.pragma.} +template cudaName*(s: string): untyped {.pragma.} + +## Dummy data for the typed nature of the `cuda` macro. These define commonly used +## CUDA specific names so that they produce valid Nim code in the context of a typed macro. +template global*() {.pragma.} +template device*() {.pragma.} +template forceinline*() {.pragma.} + +## If attached to a function, type or variable it will refer to a built in +## in the target backend. This is used for all the functions, types and variables +## defined below to indicate that we do not intend to generate code for them. 
+template builtin*() {.pragma.} +# If attached to a `var` it will be treated as a +# `__constant__`! Only useful if you want to define a +# constant without initializing it (and then use +# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it +# before executing the kernel) +template constant*() {.pragma.} + +## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its +## own `extern` pragma (due to requiring an argument it cannot be reused): +## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma +template cuExtern*(): untyped {.pragma.} +template shared*(): untyped {.pragma.} +template private*(): untyped {.pragma.} +## You would typically use `cuExtern` and `shared` together: +## `var x {.cuExtern, shared.}: array[N, Foo]` +## for example to declare a constant array that is filled by the +## host before kernel execution. diff --git a/constantine/math_compiler/experimental/builtins/cuda_builtins.nim b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim new file mode 100644 index 00000000..5d002f93 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim @@ -0,0 +1,48 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import ./common_builtins + +type + Dim* = cint ## dummy to have access to math + NvBlockIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvBlockDim = object + x*: Dim + y*: Dim + z*: Dim + NvThreadIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvGridDim = object + x*: Dim + y*: Dim + z*: Dim + + +## These are dummy elements to make CUDA block / thread index / dim +## access possible in the *typed* `cuda` macro. It cannot be `const`, +## because then the typed code would evaluate the values before we +## can work with it from the typed macro. +let blockIdx* {.builtin.} = NvBlockIdx() +let blockDim* {.builtin.} = NvBlockDim() +let gridDim* {.builtin.} = NvGridDim() +let threadIdx* {.builtin.} = NvThreadIdx() + +## Similar for procs. They don't need any implementation, as they won't ever be actually called. +proc printf*(fmt: string) {.varargs, builtin.} = discard +proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard + +## While you can use `malloc` on device with small sizes, it is usually not +## recommended to do so. +proc malloc*(size: csize_t): pointer {.builtin.} = discard +proc free*(p: pointer) {.builtin.} = discard +proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard diff --git a/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim new file mode 100644 index 00000000..fe7e11f0 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim @@ -0,0 +1,26 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +import ./common_builtins + +type + DimWgsl = uint32 + WgslGridDim = object + x*: DimWgsl + y*: DimWgsl + z*: DimWgsl + +## WebGPU specific +let global_id* {.builtin.} = WgslGridDim() +let num_workgroups* {.builtin.} = WgslGridDim() + +## WebGPU select +proc select*[T](f, t: T, cond: bool): T {.builtin.} = + # Implementation to run WebGPU code on CPU + if cond: t + else: f diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 01548f7b..58ad4207 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -14,97 +14,12 @@ import ./nim_to_gpu export gpu_types -template nimonly*(): untyped {.pragma.} -template cudaName*(s: string): untyped {.pragma.} - -## Dummy data for the typed nature of the `cuda` macro. These define commonly used -## CUDA specific names so that they produce valid Nim code in the context of a typed macro. -template global*() {.pragma.} -template device*() {.pragma.} -template forceinline*() {.pragma.} - -## If attached to a function, type or variable it will refer to a built in -## in the target backend. This is used for all the functions, types and variables -## defined below to indicate that we do not intend to generate code for them. -template builtin*() {.pragma.} - -# If attached to a `var` it will be treated as a -# `__constant__`! 
Only useful if you want to define a -# constant without initializing it (and then use -# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it -# before executing the kernel) -template constant*() {.pragma.} -type - Dim* = cint ## dummy to have access to math - NvBlockIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvBlockDim = object - x*: Dim - y*: Dim - z*: Dim - NvThreadIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvGridDim = object - x*: Dim - y*: Dim - z*: Dim - - DimWgsl = uint32 - WgslGridDim = object - x*: DimWgsl - y*: DimWgsl - z*: DimWgsl - -## These are dummy elements to make CUDA block / thread index / dim -## access possible in the *typed* `cuda` macro. It cannot be `const`, -## because then the typed code would evaluate the values before we -## can work with it from the typed macro. -let blockIdx* {.builtin.} = NvBlockIdx() -let blockDim* {.builtin.} = NvBlockDim() -let gridDim* {.builtin.} = NvGridDim() -let threadIdx* {.builtin.} = NvThreadIdx() - -## WebGPU specific -let global_id* {.builtin.} = WgslGridDim() -let num_workgroups* {.builtin.} = WgslGridDim() - -## Similar for procs. They don't need any implementation, as they won't ever be actually called. 
-proc printf*(fmt: string) {.varargs, builtin.} = discard -proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard - -## WebGPU select -proc select*[T](f, t: T, cond: bool): T {.builtin.} = - # Implementation to run WebGPU code on CPU - if cond: t - else: f - -## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its -## own `extern` pragma (due to requiring an argument it cannot be reused): -## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma -template cuExtern*(): untyped {.pragma.} -template shared*(): untyped {.pragma.} -template private*(): untyped {.pragma.} -## You would typically use `cuExtern` and `shared` together: -## `var x {.cuExtern, shared.}: array[N, Foo]` -## for example to declare a constant array that is filled by the -## host before kernel execution. - -## While you can use `malloc` on device with small sizes, it is usually not -## recommended to do so. -proc malloc*(size: csize_t): pointer {.builtin.} = discard -proc free*(p: pointer) {.builtin.} = discard -proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard +import builtins/builtins # all the builtins for the backend to make the Nim compiler happy +export builtins macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) + ## Converts the body of this macro into a `GpuAst` from where it can be converted + ## into CUDA or WGSL code at runtime. 
var ctx = GpuContext() let ast = ctx.toGpuAst(body) let genProcs = toSeq(ctx.genericInsts.values) From 71c379f7c84a07973f456538c18c0ba500cb46d3 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:54:28 +0200 Subject: [PATCH 86/87] update doc comment of `cuda` macro --- .../math_compiler/experimental/gpu_compiler.nim | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 58ad4207..0a8b7362 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -28,11 +28,11 @@ macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = newLit((g, ast)) macro cuda*(body: typed): string = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) + ## Converts the body of this macro into a `GpuAst` and from there into a string of + ## CUDA or WGSL code. + ## + ## TODO: make `cuda` choose CUDA backend, `wgsl` WGSL etc. Need to change code + ## that chooses backend etc. 
#echo body.treerepr var ctx = GpuContext() let gpuAst = ctx.toGpuAst(body) From 05321435f3df621f48067a1bb27d8592533751bc Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:56:57 +0200 Subject: [PATCH 87/87] improve explanation of `typeOfIndex` in CUDA's `getType` helper --- .../math_compiler/experimental/backends/cuda.nim | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 4899c3f5..7e8e4a3b 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -167,8 +167,14 @@ proc getFieldType(t: GpuType, field: GpuAst): GpuType = proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType = ## Tries to determine the underlying type of the AST. ## - ## If `typeOfIndex` is `true`, we return the type of the index we access. Otherwise - ## we return the type of the array / pointer. + ## If `typeOfIndex` is `true`, in case of a `gpuIndex` node, we return the type of the + ## element behind the index access `[]`. Otherwise we return the type of the array / pointer. + ## + ## Let `foo` be an array `let foo = [1'f32, 2, 3]`. + ## Let `n` be a `GpuAst` node of kind `gpuIndex` corresponding to `foo[1]`. + ## Then `ctx.getType(n, typeOfIndex = true)` returns `float32` while + ## `ctx.getType(n, typeOfIndex = false)` returns `array[3, float32]` (as + ## a `GpuType`). ## ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types! template dfl(): untyped = GpuType(kind: gtInvalid)