diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim
index f2b71152..95fbda87 100644
--- a/constantine/math_compiler/experimental/backends/backends.nim
+++ b/constantine/math_compiler/experimental/backends/backends.nim
@@ -14,27 +14,11 @@ when defined(cuda):
 else:
   const Backend* = bkWGSL
 
-proc gpuTypeToString*(t: GpuTypeKind): string =
-  case Backend
-  of bkCuda: cuda.gpuTypeToString(t)
-  of bkWGSL: wgsl.gpuTypeToString(t)
-
-proc gpuTypeToString*(t: GpuType, ident = newGpuIdent(), allowArrayToPtr = false,
-                      allowEmptyIdent = false,
-                     ): string =
-  case Backend
-  of bkCuda: cuda.gpuTypeToString(t, ident.ident(), allowArrayToPtr, allowEmptyIdent)
-  of bkWGSL: wgsl.gpuTypeToString(t, ident, allowArrayToPtr, allowEmptyIdent)
-
-proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string =
-  case Backend
-  of bkCuda: cuda.genFunctionType(typ, fn, fnArgs)
-  of bkWGSL: wgsl.genFunctionType(typ, fn, fnArgs)
-
 proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string =
   case Backend
   of bkCuda:
-    result = ctx.genCuda(ast)
+    cuda.preprocess(ctx, ast, kernel)
+    result = cuda.codegen(ctx)
   of bkWGSL:
-    ctx.storagePass(ast, kernel)
+    wgsl.preprocess(ctx, ast, kernel)
     result = wgsl.codegen(ctx)
diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim
index 1def1b1f..076625cf 100644
--- a/constantine/math_compiler/experimental/backends/common_utils.nim
+++ b/constantine/math_compiler/experimental/backends/common_utils.nim
@@ -6,9 +6,36 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
+import std / tables
 import ../gpu_types
-# import ./backends
 
 proc address*(a: string): string = "&" & a
-
 proc size*(a: string): string = "sizeof(" & a & ")"
+
+proc isGlobal*(fn: GpuAst): bool =
+  doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind
+  result = attGlobal in fn.pAttributes
+
+proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock: var GpuAst) =
+  ## Farms the top level of the code for functions, variable and type definition.
+  ## All functions are added to the `allFnTab`, while only global ones (or even only
+  ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining
+  ## logic.
+  ## Variables are collected in `varBlock`.
+  case ast.kind
+  of gpuProc:
+    ctx.allFnTab[ast.pName] = ast
+    if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal():
+      ctx.fnTab[ast.pName] = ast.clone() # store global function extra
+    elif kernel.len == 0 and ast.isGlobal():
+      ctx.fnTab[ast.pName] = ast.clone() # store global function extra
+  of gpuBlock:
+    # could be a type definition or global variable
+    for ch in ast:
+      ctx.farmTopLevel(ch, kernel, varBlock)
+  of gpuVar, gpuConstexpr:
+    varBlock.statements.add ast
+  of gpuTypeDef, gpuAlias:
+    raiseAssert "Unexpected type def / alias def found. These should be in `ctx.types` now: " & $ast
+  else:
+    discard
diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim
index 227bde79..7e8e4a3b 100644
--- a/constantine/math_compiler/experimental/backends/cuda.nim
+++ b/constantine/math_compiler/experimental/backends/cuda.nim
@@ -6,7 +6,7 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import std / [macros, strformat, strutils, sugar, sequtils]
+import std / [macros, strformat, strutils, sugar, sequtils, tables, algorithm]
 
 import ../gpu_types
 import ./common_utils
@@ -43,6 +43,7 @@ proc gpuTypeToString*(t: GpuTypeKind): string =
   of gtVoidPtr: "void*"
   of gtObject: "struct"
   of gtString: "const char*"
+  of gtUA: "" # `UncheckedArray` by itself is nothing in CUDA
   else:
     raiseAssert "Invalid type : " & $t
 
@@ -57,6 +58,10 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false,
   var skipIdent = false
   case t.kind
   of gtPtr:
+    var t = t # if `ptr UncheckedArray`, remove the `gtUA` layer. No meaning on CUDA
+    if t.to.kind == gtUA:
+      t.to = t.to.uaTo
+
     if t.to.kind == gtArray: # ptr to array type
       # need to pass `*` for the pointer into the identifier, i.e.
       # `state: var array[4, BigInt]`
@@ -65,7 +70,7 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false,
       # so as our ident we pass `theIdent = (*<ident>)` and generate the type for the internal
       # array type, which yields e.g. `BigInt <theIdent>[4]`.
       let ptrStar = gpuTypeToString(t.kind)
-      result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")")
+      result = gpuTypeToString(t.to, '(' & ptrStar & ident & ')')
       skipIdent = true
     else:
       let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent)
@@ -82,19 +87,30 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false,
     of gtArray: # nested array
       let typ = getInnerArrayType(t)        # get inner most type
       let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...`
-      result = typ & " " & ident & lengths
+      result = typ & ' ' & ident & lengths
     else:
       # NOTE: Nested arrays don't have an inner identifier!
       if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead
-        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]"
+        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & "[]"
       else:
-        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]"
+        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & '[' & $t.aLen & ']'
     skipIdent = true
+  of gtGenericInst:
+    # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations
+    # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`.
+    result = t.gName
+    if t.gArgs.len > 0:
+      result.add '_'
+    for i, g in t.gArgs:
+      result.add gpuTypeToString(g)
+      if i < t.gArgs.high:
+        result.add '_'
   of gtObject: result = t.name
+  of gtUA:     result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T?
   else:        result = gpuTypeToString(t.kind)
 
   if ident.len > 0 and not skipIdent: # still need to add ident
-    result.add " " & ident
+    result.add ' ' & ident
 
 proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string =
   ## Returns the correct function with its return type
@@ -114,11 +130,139 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string =
 proc genMemcpy(lhs, rhs, size: string): string =
   result = &"memcpy({lhs}, {rhs}, {size})"
 
+proc scanFunctions(ctx: var GpuContext, n: GpuAst) =
+  ## Iterates over the given function and checks for all `gpuCall` nodes. Any function
+  ## called in the scope is added to `fnTab`. This is a form of dead code elimination.
+  case n.kind
+  of gpuCall:
+    let fn = n.cName
+    if fn in ctx.allFnTab:
+      # Check if any of the parameters are pointers (otherwise non generic)
+      if fn notin ctx.fnTab: # function not known, add to `fnTab` (i.e. avoid code elimination)
+        let fnCalled = ctx.allFnTab[fn]
+        ctx.fnTab[fn] = fnCalled
+        # still "scan for functions", i.e. fill `fnTab` from inner calls
+        for ch in fnCalled:
+          ctx.scanFunctions(ch)
+      # else we don't do anything for this function
+    # Harvest functions from arguments to this call!
+    for arg in n.cArgs:
+      ctx.scanFunctions(arg)
+  else:
+    for ch in n:
+      ctx.scanFunctions(ch)
+
+proc getFieldType(t: GpuType, field: GpuAst): GpuType =
+  ## Returns the type of the field. `t` must be an object or generic instantiation.
+  ## `field` must be an ident.
+  doAssert field.kind == gpuIdent, "Field is not an ident: " & $field
+  doAssert t.kind in [gtObject, gtGenericInst]
+  let flds = if t.kind == gtObject: t.oFields
+               else: t.gFields
+  result = GpuType(kind: gtInvalid)
+  for f in flds:
+    if f.name == field.ident():
+      return f.typ
+
+proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType =
+  ## Tries to determine the underlying type of the AST.
+  ##
+  ## If `typeOfIndex` is `true`, in case of a `gpuIndex` node, we return the type of the
+  ## element behind the index access `[]`. Otherwise we return the type of the array / pointer.
+  ##
+  ## Let `foo` be an array `let foo = [1'f32, 2, 3]`.
+  ## Let `n` be a `GpuAst` node of kind `gpuIndex` corresponding to `foo[1]`.
+  ## Then `ctx.getType(n, typeOfIndex = true)` returns `float32` while
+  ## `ctx.getType(n, typeOfIndex = false)` returns `array[3, float32]` (as
+  ## a `GpuType`).
+  ##
+  ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types!
+  template dfl(): untyped = GpuType(kind: gtInvalid)
+  case arg.kind
+  of gpuIdent: arg.iTyp
+  of gpuAddr: GpuType(kind: gtPtr, to: ctx.getType(arg.aOf))
+  of gpuDeref:
+    let argTyp = ctx.getType(arg.dOf)
+    doAssert argTyp.kind == gtPtr
+    argTyp.to
+  of gpuCall: dfl()
+  of gpuIndex:
+    let arrType = ctx.getType(arg.iArr)
+    if typeOfIndex:
+      case arrType.kind
+      of gtPtr:   arrType.to
+      of gtUA:    arrType.uaTo
+      of gtArray: arrType.aTyp
+      else: raiseAssert "`gpuIndex` cannot be of a non pointer / array type: " & $arrType
+    else:
+      arrType
+  of gpuDot:
+    let parentTyp = ctx.getType(arg.dParent)
+    parentTyp.getFieldType(arg.dField)
+  of gpuLit: arg.lType
+  of gpuBinOp: dfl() ## XXX: store resulting type of `gpuBinOp`!
+  #of gpuBlock: arg.statements[^1].getType()
+  of gpuPrefix: ctx.getType(arg.pVal)
+  of gpuConv: arg.convTo
+  of gpuCast: arg.cTo # ident of the thing we cast
+  else:
+    raiseAssert "Not implemented to determine type from node: " & $arg
+
+proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) =
+  ## Addresses other AST patterns that need to be rewritten on CUDA. Aspects
+  ## that are rewritten include:
+  ##
+  ## - `Index` of `Deref` of `Ident` needs to be rewritten to `Index` of `Ident` if the
+  ##   ident is a pointer type, because `[]` is syntactic sugar for pointer arithmetic
+  ##   (unless the argument is a pointer to a static array)
+  case n.kind
+  of gpuIndex:
+    if n.iArr.kind == gpuDeref:
+      # get type of deref'd node, but do not fold `gpuIndex` (i.e. get type of collection)
+      let typ = ctx.getType(n, typeOfIndex = false)
+      if typ.kind != gtArray:
+        n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex)
+    else:
+      for ch in mitems(n):
+        ctx.makeCodeValid(ch)
+  else:
+    for ch in mitems(n):
+      ctx.makeCodeValid(ch)
 
 proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string
 proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a))
 proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a))
 
+proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
+
+  # 1. Add all data from `genericInsts` and `types` tables
+  #    In CUDA the types have to be before any possible global variables using
+  #    them!
+  for k, v in pairs(ctx.genericInsts):
+    ctx.allFnTab[k] = v
+  # And all the known types
+  for k, typ in pairs(ctx.types):
+    ctx.globalBlocks.add typ
+
+  # 2. Fill table with all *global* functions or *only* the specific `kernel`
+  #    if any given
+  var varBlock = GpuAst(kind: gpuBlock)
+  ctx.farmTopLevel(ast, kernel, varBlock)
+  ctx.globalBlocks.add varBlock
+
+  # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect
+  #    the functions called and record them in `fnTab`.
+  let fns = toSeq(ctx.fnTab.pairs)
+  for (fnIdent, fn) in fns: # everything in `fnTab` at this point is a global function
+    # Get the original arguments (before lifting them) of this function. Needed in scan
+    # to check if `gpuCall` argument is a parameter.
+    let fnOrig = ctx.allFnTab[fnIdent]
+    ctx.scanFunctions(fn)
+
+  # 4. Finalize the code by performing some required AST transformations to make the code valid.
+  for (fnIdent, fn) in mpairs(ctx.fnTab):
+    ctx.makeCodeValid(fn)
+
 proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
   ## The actual CUDA code generator.
   let indentStr = "  ".repeat(indent)
@@ -137,27 +281,32 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs)
 
     # extern "C" is needed to avoid name mangling
-    result = indentStr & "extern \"C\" " & attrs.join(" ") & " " &
-             fnSig & "{\n"
-
-    result &= ctx.genCuda(ast.pBody, indent + 1)
-    result &= "\n" & indentStr & "}"
+    result = indentStr & "extern \"C\" " & attrs.join(" ") & ' ' &
+             fnSig
+    if ast.forwardDeclare:
+      result.add ';'
+    else:
+      result.add "{\n"
+      result &= ctx.genCuda(ast.pBody, indent + 1)
+      result &= '\n' & indentStr & '}'
 
   of gpuBlock:
     result = ""
     if ast.blockLabel.len > 0:
-      result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n"
+      result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n'
     for i, el in ast.statements:
       result.add ctx.genCuda(el, indent)
       if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added
-        result.add ";"
+        result.add ';'
       if i < ast.statements.high:
-        result.add "\n"
+        result.add '\n'
     if ast.blockLabel.len > 0:
-      result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n"
+      result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n'
 
   of gpuVar:
-    result = indentStr & ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName.ident())
+    let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & ' '
+                else: ""
+    result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident())
     # If there is an initialization, the type might require a memcpy
     if ast.vInit.kind != gpuVoid and not ast.vRequiresMemcpy:
       result &= " = " & ctx.genCuda(ast.vInit)
@@ -177,35 +326,35 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     # skip semicolon in the condition. Otherwise can lead to problematic code
     ctx.withoutSemicolon: # skip semicolon for if bodies
       result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n"
-    result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genCuda(ast.ifThen, indent + 1) & '\n'
+    result &= indentStr & '}'
     if ast.ifElse.kind != gpuVoid:
       result &= " else {\n"
-      result &= ctx.genCuda(ast.ifElse, indent + 1) & "\n"
-      result &= indentStr & "}"
+      result &= ctx.genCuda(ast.ifElse, indent + 1) & '\n'
+      result &= indentStr & '}'
 
   of gpuFor:
     result = indentStr & "for(int " & ast.fVar.ident() & " = " &
              ctx.genCuda(ast.fStart) & "; " &
              ast.fVar.ident() & " < " & ctx.genCuda(ast.fEnd) & "; " &
              ast.fVar.ident() & "++) {\n"
-    result &= ctx.genCuda(ast.fBody, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genCuda(ast.fBody, indent + 1) & '\n'
+    result &= indentStr & '}'
   of gpuWhile:
     ctx.withoutSemicolon:
       result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n"
-    result &= ctx.genCuda(ast.wBody, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genCuda(ast.wBody, indent + 1) & '\n'
+    result &= indentStr & '}'
 
   of gpuDot:
-    result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField)
+    result = ctx.genCuda(ast.dParent) & '.' & ctx.genCuda(ast.dField)
 
   of gpuIndex:
-    result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]"
+    result = ctx.genCuda(ast.iArr) & '[' & ctx.genCuda(ast.iIndex) & ']'
 
   of gpuCall:
-    result = indentStr & ast.cName.ident() & "(" &
-             ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")"
+    result = indentStr & ast.cName.ident() & '(' &
+             ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ')'
 
   of gpuTemplateCall:
     when nimvm:
@@ -224,25 +373,28 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
       result = ctx.genCuda(expandedBody, indent)
 
   of gpuBinOp:
-    result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " &
-             ast.bOp & " " &
-             ctx.genCuda(ast.bRight) & ")"
+    ctx.withoutSemicolon:
+      let l = ctx.genCuda(ast.bLeft)
+      let r = ctx.genCuda(ast.bRight)
+      result = indentStr & '(' & l & ' ' &
+               ctx.genCuda(ast.bOp) & ' ' &
+               r & ')'
 
   of gpuIdent:
     result = ast.ident()
 
   of gpuLit:
-    if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\""
+    if ast.lType.kind == gtString: result = '"' & ast.lValue & '"'
     elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder
     else: result = ast.lValue
 
   of gpuArrayLit:
     result = "{"
     for i, el in ast.aValues:
-      result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el
+      result.add '(' & gpuTypeToString(ast.aLitType) & ')' & ctx.genCuda(el)
       if i < ast.aValues.high:
         result.add ", "
-    result.add "}"
+    result.add '}'
 
   of gpuReturn:
     result = indentStr & "return " & ctx.genCuda(ast.rValue)
@@ -251,10 +403,10 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     result = ast.pOp & ctx.genCuda(ast.pVal)
 
   of gpuTypeDef:
-    result = "struct " & ast.tName & "{\n"
+    result = "struct " & gpuTypeToString(ast.tTyp) & "{\n"
     for el in ast.tFields:
       result.add "  " & gpuTypeToString(el.typ, el.name) & ";\n"
-    result.add "}"
+    result.add '}'
 
   of gpuObjConstr:
     result = "{"
@@ -262,7 +414,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
       result.add ctx.genCuda(el.value)
       if i < ast.ocFields.len - 1:
         result.add ", "
-    result.add "}"
+    result.add '}'
 
   of gpuInlineAsm:
     result = indentStr & "asm(" & ast.stmt.strip & ");"
@@ -271,23 +423,45 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     result = indentStr & "/* " & ast.comment & " */"
 
   of gpuConv:
-    result = "(" & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.convExpr)
+    result = '(' & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.convExpr)
   of gpuCast:
-    result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr)
+    result = '(' & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.cExpr)
 
   of gpuAddr:
-    result = "(&" & ctx.genCuda(ast.aOf) & ")"
+    result = "(&" & ctx.genCuda(ast.aOf) & ')'
 
   of gpuDeref:
-    result = "(*" & ctx.genCuda(ast.dOf) & ")"
+    result = "(*" & ctx.genCuda(ast.dOf) & ')'
 
   of gpuConstexpr:
+    ## TODO: We need to change the code such that we emit `constexpr` inside of procs and
+    ## `__constant__` outside of procs. The point is we want to support mapping to `__constant__`
+    ## for `const foo = bar` Nim declarations to evaluate values at Nim's compile time.
+    ## Alternatively, make user write `const foo {.constant.} = bar` to produce a global
+    ## `__constant__` value.
     if ast.cType.kind == gtArray:
-      result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue)
+      result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue)
     else:
-      result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue)
+      result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & ' ' & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue)
 
   else:
     echo "Unhandled node kind in genCuda: ", ast.kind
     raiseAssert "Unhandled node kind in genCuda: " & ast.repr
     result = ""
+
+proc codegen*(ctx: var GpuContext): string =
+  ## Generate the actual code for all pieces of the puzzle
+  # 1. generate code for the global blocks (types, global vars etc)
+  for blk in ctx.globalBlocks:
+    result.add ctx.genCuda(blk) & ";\n\n"
+
+  # 2. generate all regular functions
+  let fns = toSeq(ctx.fnTab.pairs)
+  for (fnIdent, fn) in fns:
+    let fnC = fn.clone()
+    fnC.forwardDeclare = true
+    result.add ctx.genCuda(fnC) & '\n'
+  result.add "\n\n"
+
+  for fnIdent, fn in ctx.fnTab:
+    result.add ctx.genCuda(fn) & "\n\n"
diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim
index 0b6552bd..35a80f32 100644
--- a/constantine/math_compiler/experimental/backends/wgsl.nim
+++ b/constantine/math_compiler/experimental/backends/wgsl.nim
@@ -25,7 +25,7 @@ proc literalSuffix(t: GpuType): string =
   of gtUint32: "u"
   of gtInt32: "" # NOTE: We DON'T give as suffix to `i32` literals so that we can rely on more cases
                  # where WebGPU allows literals to be converted automatically!
-  of gtFloat32: "f"
+  of gtFloat32: "" # NOTE: float suffixes _already_ come with an `f` suffix in Nim!
   else: ""
 
 proc toAddressSpace(symKind: GpuSymbolKind): AddressSpace =
@@ -56,7 +56,7 @@ proc fromAddressSpace(addrSpace: AddressSpace): GpuSymbolKind =
 proc constructPtrSignature(addrSpace: AddressSpace, idTyp: GpuType, ptrStr, typStr: string): string =
   ## Constructs the `ptr<addressSpace, typStr, [read / read_write]>` string, which only includes
   ## the RW string if the address space is `storage`
-  let rw = if not idTyp.isNil: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`)
+  let rw = if idTyp.kind != gtVoid: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`)
   let rwStr = if rw: "read_write" else: "read"
   case addrSpace
   of asStorage: result = &"{ptrStr}<{addrSpace}, {typStr}, {rwStr}>"
@@ -112,8 +112,18 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr =
     else:
       result = &"{identPrefix}array<{typ}, {t.aLen}>"
     skipIdent = true
+  of gtGenericInst:
+    # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations
+    # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`.
+    result = t.gName
+    if t.gArgs.len > 0:
+      result.add '_'
+    for i, g in t.gArgs:
+      result.add gpuTypeToString(g)
+      if i < t.gArgs.high:
+        result.add '_'
   of gtObject: result = t.name
-  of gtUA:     result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">"
+  of gtUA:     result = gpuTypeToString(t.kind) & '<' & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & '>'
   else:        result = gpuTypeToString(t.kind)
 
   if ident.len > 0 and not skipIdent: # still need to add ident
@@ -139,34 +149,6 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string =
     if typ.len > 0:
       result.add &" -> {typ}"
 
-proc isGlobal(fn: GpuAst): bool =
-  doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind
-  result = attGlobal in fn.pAttributes
-
-proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) =
-  ## Farms the top level of the code for functions, variable and type definition.
-  ## All functions are added to the `allFnTab`, while only global ones (or even only
-  ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining
-  ## logic.
-  ## Variables and types are collected in `varBlock` and `typBlock`.
-  case ast.kind
-  of gpuProc:
-    ctx.allFnTab[ast.pName] = ast
-    if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal():
-      ctx.fnTab[ast.pName] = ast.clone() # store global function extra
-    elif kernel.len == 0 and ast.isGlobal():
-      ctx.fnTab[ast.pName] = ast.clone() # store global function extra
-  of gpuBlock:
-    # could be a type definition or global variable
-    for ch in ast:
-      ctx.farmTopLevel(ch, kernel, varBlock, typBlock)
-  of gpuVar, gpuConstexpr:
-    varBlock.statements.add ast
-  of gpuTypeDef:
-    typBlock.statements.add ast
-  else:
-    discard
-
 proc patchType(t: GpuType): GpuType =
   ## Applies patches needed for WGSL support. E.g. `bool` cannot be a storage variable.
   result = t
@@ -205,6 +187,7 @@ proc determineSymKind(arg: GpuAst): GpuSymbolKind =
   of gpuBlock: arg.statements[^1].determineSymKind() # look at last element
   of gpuPrefix: gsLocal # equivalent to constructing a local var
   of gpuConv: gsLocal # a converted value will be a local var
+  of gpuCast: arg.cExpr.determineSymKind() # symbol kind of the thing we cast
   else:
     raiseAssert "Not implemented to determine symbol kind from node: " & $arg
 
@@ -225,6 +208,7 @@ proc determineMutability(arg: GpuAst): bool =
   of gpuBlock: arg.statements[^1].determineMutability() # look at last element
   of gpuPrefix: false # equivalent to constructing a local var
   of gpuConv: false # a converted value will be immutable
+  of gpuCast: arg.cExpr.determineMutability() # mutability of the thing we cast
   else:
     raiseAssert "Not implemented to determine mutability from node: " & $arg
 
@@ -250,8 +234,9 @@ proc determineIdent(arg: GpuAst): GpuAst =
   of gpuBlock: arg.statements[^1].determineIdent()
   of gpuPrefix: dfl()
   of gpuConv: dfl()
+  of gpuCast: arg.cExpr.determineIdent() # ident of the thing we cast
   else:
-    raiseAssert "Not implemented to determine mutability from node: " & $arg
+    raiseAssert "Not implemented to determine ident from node: " & $arg
 
 proc getGenericArguments(args: seq[GpuAst], params: seq[GpuParam], callerParams: Table[string, GpuParam]): seq[GenericArg] =
   ## If an argument is not a ptr argument in the original function (`params`) then
@@ -310,7 +295,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string
   ## the information we precisely want to extract (different symbol kind etc) would make
   ## it so that we cannot look up elements.
   doAssert n.kind == gpuCall, "Not a call, but: " & $n.kind
-  result = n.cName.ident() & "_"
+  result = n.cName.ident() & '_'
   for i, arg in n.cArgs:
     let p = params[i]
     var s: string
@@ -327,7 +312,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string
       s = shortAddrSpace(addrSpace) & m
     result.add s
     if i < n.cArgs.high:
-      result.add "_"
+      result.add '_'
 
 proc makeFnGeneric(fn: GpuAst, gi: GenericInst): GpuAst =
   ## Returns a (shallow) copy of the input function (which is a clone of the
@@ -385,6 +370,11 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp
   ## of that function (hence the name `scanGenerics`). The generic instance will be added
   ## to `fnTab` instead. The name of the generic will be derived based on the types
   ## of arguments with respect to mutability and address space.
+  ##
+  ## In addition this function records any time a `struct` is constructed in a `gpuObjConstr`
+  ## node and a pointer field assigned to it. As pointer fields are not valid in WGSL, we
+  ## record them here to replace them by their arguments passed to the constructor later.
+  ## The pointers _must_ be pointers passed into a global kernel (i.e. `storage` address space).
   case n.kind
   of gpuCall:
     let fn = n.cName
@@ -435,6 +425,18 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp
     # Harvest generics from arguments to this call!
     for arg in n.cArgs:
       ctx.scanGenerics(arg, callerParams)
+  of gpuObjConstr:
+    # If pointer argument of `storage`, strip out, if pointer type field
+    # otherwise, raise CT error
+    for f in n.ocFields:
+      if f.typ.kind == gtPtr:
+        doAssert f.value.kind in [gpuAddr, gpuIdent], "Constructing a pointer field " &
+          "from a more complex expression than an ident or an address-of operation " &
+          "is currently not supported."
+        let id = f.value.determineIdent()
+        doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " &
+          "variable (i.e. an argument to a global kernel) is not supported: " & $f
+        ctx.structsWithPtrs[(n.ocType, f.name)] = id
   else:
     for ch in n:
       ctx.scanGenerics(ch, callerParams)
@@ -495,7 +497,234 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) =
     for ch in mitems(n):
       ctx.injectAddressOf(ch)
 
-proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
+proc rewriteCompoundAssignment(n: GpuAst): GpuAst =
+  doAssert n.kind == gpuBinOp
+  if n.bOp.ident() in ["<=", "==", ">=", "!="]: return n
+
+  template genAssign(left, rnode, op: typed): untyped =
+    let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode)
+    GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false)
+
+  let op = n.bOp.ident()
+  if op.len >= 2 and op[^1] == '=':
+    var opAst = GpuAst(kind: gpuIdent, iName: op[0 .. ^2])
+    opAst.iSym = opAst.iName
+    result = genAssign(n.bLeft, n.bRight, opAst) # all but last
+  else:
+    # leave untouched
+    result = n
+
+proc getStructType(n: GpuAst): GpuType =
+  ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type
+  ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct.
+  doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " &
+    $n.kind & " for node: " & $n
+  var p = n
+  if p.kind == gpuDeref:
+    p = n.dOf
+  result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject:
+             p.iTyp.to
+           elif p.iTyp.kind == gtObject:
+             p.iTyp
+           else: GpuType(kind: gtVoid)
+
+proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string
+proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) =
+  ## Addresses other AST patterns that need to be rewritten on WGSL. Aspects
+  ## that are rewritten include:
+  ##
+  ## - (`gpuBinOp`) rewriting compound assignment operators as regular assignments, `x += y` ↦ `x = x + y`
+  ##
+  ## - (`gpuDot`) replace field access of struct pointer fields by the pointers passed into the object
+  ##   constructor (ref `scanGenerics`). `inGlobal` is used to decide what exactly we replace
+  ##   it by. Inside of a global function the variables won't be pointers, hence we insert `&foo`.
+  ##   In device functions, the globals will have been passed into the function as a parameter,
+  ##   `ptr<storage, ...>`. Thus, we replace by `foo`.
+  ##   NOTE: We could consider to move this into `scanGenerics`, but for the moment I prefer to
+  ##   do code transformations here and `scanGenerics` being only about data collection.
+  ##
+  ## - (`gpuAssign`) compile time errors, if a user tries to assign a pointer to a struct pointer field
+  ##   outside the constructor.
+  ##
+  ## - (`gpuCall`) potentially update signatures of our custom generic functions. In `scanGenerics` if we
+  ##   have a call like `foo(bar.ptrField)` we will determine the signature of `foo` to have
+  ##   a `function` pointer, because `bar` will be a local struct instance. However, due to
+  ##   our replacement rules and fact that *only* storage pointers may be assigned to constructors
+  ##   the correct signature would be `storage` for the first argument after replacing `bar.ptrField`
+  ##   by its value in the constructor.
+  ##
+  ## - (`gpuObjConstr`) delete arguments to object constructors, which assign pointer fields.
+  ##
+  ## - (`gpuVar`) update types of new variables based on the RHS. May have changed since Nim -> GpuAst,
+  ##   due to `gpuDot` replacement further up.
+  ##
+  ## NOTE: A few cases already raise compile time errors _here_ and not in `checkCodeValid`,
+  ## as some transformations otherwise break the detection.
+  case n.kind
+  of gpuBinOp:
+    n = rewriteCompoundAssignment(n)
+    for ch in mitems(n): # now go over children
+      ctx.makeCodeValid(ch, inGlobal)
+  of gpuObjConstr: # strip out arguments that are pointer types
+    let t = n.ocType
+    var i = 0
+    while i < n.ocFields.len:
+      let f = n.ocFields[i]
+      if (t, f.name) in ctx.structsWithPtrs:
+        if f.typ.kind == gtPtr:
+          n.ocFields.delete(i)
+        else:
+          inc i
+      else:
+        inc i
+  of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. `foo.bar` -> `&res`
+    var p = n.dParent
+    let id = getStructType(p)
+    doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind
+    let field = n.dField.ident()
+    if id.kind != gtVoid and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer
+      let v = ctx.structsWithPtrs[(id, field)]
+      ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions,
+      ## we will have passed the parameter
+      if inGlobal:
+        n = GpuAst(kind: gpuAddr, aOf: v) # overwrite with the address of value passed in to the object constructor
+      else:
+        n = v
+  of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field
+    if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]:
+      let dot = n.aLeft
+      let id = getStructType(dot.dParent)
+      if id.kind != gtVoid:
+        doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind
+        let field = dot.dField.ident()
+        if (id, field) in ctx.structsWithPtrs:
+          raiseAssert "Assignment of a struct (`" & pretty(id) & "`) field of a pointer type is not supported. " &
+            "Assign pointer fields in the constructor only. In code: " & $ctx.genWebGpu(n)
+    for ch in mitems(n):
+      ctx.makeCodeValid(ch, inGlobal)
+  of gpuCall:
+    # we might need to update the type of generics, if we did the replacement in `gpuDot`, because
+    # a struct ptr field will have had the wrong storage type
+    for ch in mitems(n): # first process children
+      ctx.makeCodeValid(ch, inGlobal)
+    # now check if any argument's type mismatches against the generic we recorded
+    let fnName = n.cName
+    if fnName in ctx.fnTab: # otherwise will not be generated by us, so irrelevan
+      # NOTE: theoretically, if we had struct pointer field replacements with symbols that had
+      # *different* address spaces, we'd need to split one generic into multiple again here.
+      # But that shouldn't be possible, because our entire replacement is currently only
+      # sane if we store a *storage pointer* in a struct. We would have raised in `scanGenerics`
+      # because of invalid pointer assignment in an object constructor.
+      let fn = ctx.fnTab[fnName]
+      let params = fn.pParams
+      for i, arg in n: # walk the parameters again and compare
+        let argId = arg.determineIdent()
+        if argId.kind != gpuVoid and argId.ident().len > 0:
+          var p = params[i]
+          ## XXX: update anything else? We mostly care about the address space here, because
+          ## the rest _should_ be the same anyway.
+          if p.addressSpace != argId.symbolKind.toAddressSpace():
+            p.addressSpace = argId.symbolKind.toAddressSpace()
+            p.ident.symbolKind = argId.symbolKind
+            fn.pParams[i] = p # write back, not a ref type!
+  of gpuVar:
+    # first recurse on the `gpuVar` to get possible replacements
+    for ch in mitems(n):
+      ctx.makeCodeValid(ch, inGlobal)
+    # update LHS with info from RHS by copying over its symbol kind. Different types are
+    # possible after replacements of `gpuDot` nodes above.
+    if n.vType.kind == gtPtr:
+      let rightId = n.vInit.determineIdent()
+      n.vName.symbolKind = rightId.symbolKind
+      n.vType.mutable = rightId.iTyp.mutable
+      n.vName.iTyp.mutable = rightId.iTyp.mutable
+  else:
+    for ch in mitems(n):
+      ctx.makeCodeValid(ch, inGlobal)
+
+proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) =
+  ## Update symbols in global functions to have same mutability and symbolkind as
+  ## parameters
+  case n.kind
+  of gpuIdent:
+    if n.iSym in ctx.globals:
+      n.symbolKind = gsGlobalKernelParam
+      if n.iTyp.kind == gtPtr:
+        let g = ctx.globals[n.iSym]
+        n.iTyp.mutable = g.typ.kind == gtPtr # arguments as pointers == mutable
+  else:
+    for ch in n:
+      ctx.updateSymsInGlobals(ch)
+
+proc checkCodeValid(ctx: var GpuContext, n: GpuAst) =
+  ## Checks if the code is valid according to WGSL spec.
+  ## So far handles:
+  ## - variables (`var`) to pointer types are not allowed
+  ##
+  ## Some code is already rejected in earlier passes, if a compiler pass would transform
+  ## the code in such a way as making a detection of illegal code invalid.
+  case n.kind
+  of gpuVar:
+    if n.vType.kind == gtPtr and n.vMutable: # `vMutable == var` -> not allowed to store pointers
+      let code = ctx.genWebGpu(n)
+      raiseAssert "The node: `" & $code & "` constructs a variable (`var`) to a pointer type. This " &
+        "is invalid in WGSL. Use `let`."
+  else:
+    for ch in n:
+      ctx.checkCodeValid(ch)
+
+proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) =
+  ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to
+  ## the `globals` of the context. Such variables are *not* regular global constants, but rather
+  ## `storage` buffers, which are filled before the kernel is executed.
+  ##
+  ## XXX: Document current not ideal behavior that one needs to be careful to pass data into
+  ## `wgsl.fakeExecute` precisely in the order in which the `var foo {.constant.}` are defined
+  ## *AND* after all kernel parameters!
+  doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind
+  var i = 0
+  while i < blk.len:
+    doAssert blk.kind == gpuBlock
+    let g = blk.statements[i]
+    if g.kind == gpuVar and atvConstant in g.vAttributes:
+      # remove this from `globalBlocks` and add to `globals`
+      doAssert g.vInit.kind == gpuVoid, "A variable annotated with `{.constant.}` must not have an initialization!"
+      # we construct a fake parameter from it
+      ## XXX: `storage` address space is probably what we want, but think more about it
+      let param = GpuParam(ident: g.vName, typ: g.vType, addressSpace: asStorage)
+      ctx.globals[param.ident.iSym] = param
+      blk.statements.delete(i)
+      # no need to increase `i`
+    else:
+      inc i
+
+proc removeStructPointerFields(blk: var GpuAst) =
+  ## Filters out `ptr` fields from all structs.
+  ##
+  ## If a type is used with `storage` pointer arguments, we will later perform replacement of field
+  ## access to the pointer field by the value we assign.
+  ##
+  ## If the user assigns a local (`function`) pointer, we raise a CT error. We _could_ in theory support
+  ## replacement for local pointer types too, but it requires a more careful analysis of which
+  ## local to replace by and the name in other scopes. I.e. passing a local pointer to a constructor
+  ## which has a different name than in the calling scope would require us to traverse the AST up to
+  ## the calling scope.
+  ##
+  ## Given the extreme limitations on `let` variables with pointers anyway, I don't think there is muc
+  ## purpose on supporting such features.
+  doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind
+  for typ in mitems(blk):
+    if typ.kind == gpuAlias: continue # don't need to mutate aliases!
+    doAssert typ.kind == gpuTypeDef
+    var i = 0
+    while i < typ.tFields.len:
+      let f = typ.tFields[i]
+      if f.typ.kind == gtPtr: # delete
+        typ.tFields.delete(i)
+      else:
+        inc i
+
+proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
   ## If `kernel` is a global function, we *only* generate code for that kernel.
   ## This is useful if your GPU code contains multiple kernels with differing
   ## parameters to avoid having to fill dummy buffers for all the unused parameters
@@ -503,9 +732,18 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
   # 1. Fill table with all *global* functions or *only* the specific `kernel`
   #    if any given
   var varBlock = GpuAst(kind: gpuBlock)
-  var typBlock = GpuAst(kind: gpuBlock)
-  ctx.farmTopLevel(ast, kernel, varBlock, typBlock)
+  ctx.farmTopLevel(ast, kernel, varBlock)
   ctx.globalBlocks.add varBlock
+  ## XXX: `typBlock` should now always be empty, as we pass all
+  ## found types into `ctx.types`
+
+  # Now add the generics to the `allFnTab`
+  for k, v in pairs(ctx.genericInsts):
+    ctx.allFnTab[k] = v
+  # And all the known types
+  var typBlock = GpuAst(kind: gpuBlock)
+  for k, typ in pairs(ctx.types):
+    typBlock.statements.add typ
   ctx.globalBlocks.add typBlock
 
   # 2. Remove all arguments from global functions, as none are allowed in WGSL
@@ -515,9 +753,21 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
       for p in fn.pParams:
         ctx.globals[p.ident.iSym] = p # copy all parameters over to globals
       fn.pParams.setLen(0) # delete function's parameters
+      # now update all appearances of the parameters, now globals, such that they reflect
+      # the correct symbol kind and mutability
+      ctx.updateSymsInGlobals(fn)
     else:
       discard
 
+  # 2.b filter out all `var foo {.constant.}: dtype` from the `globalBlocks` and add them to
+  #    the `globals`
+  # `globalBlocks` has two entries:
+  # 0: variables
+  # 1: types
+  ctx.pullConstantPragmaVars(ctx.globalBlocks[0])
+  # 2.c remove all fields of structs, which have pointer type
+  removeStructPointerFields(ctx.globalBlocks[1])
+
   # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect
   #    the functions called and record them in `fnTab`. If they have pointer arguments we
   #    generate a generic instantiation for the exact pointer types used.
@@ -538,8 +788,15 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") =
     if fn.isGlobal(): # non global functions don't need to be mutated
       ctx.injectAddressOf(fn)
 
+  # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. `x += 5` -> `x = x + 5`
+  for (fnIdent, fn) in mpairs(ctx.fnTab):
+    ctx.makeCodeValid(fn, inGlobal = fn.isGlobal())
+
+  # 6. finally raise error if we find anything that is not allowed in WGSL after our transformations
+  for (fnIdent, fn) in pairs(ctx.fnTab):
+    ctx.checkCodeValid(fn)
+
 
-proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string
 proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a))
 proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a))
 
@@ -549,46 +806,37 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
   case ast.kind
   of gpuVoid: return # nothing to emit
   of gpuProc:
-
-    ## XXX: if a {.global.} / attGlobal proc, lift arguments
-    ## Store all arguments in the `GpuContext`
-    ## *AFTER* processing all of the code, generate header and place at beginning
-    ## Most difficult:
-    ## - track identifiers from {.global.} functions into arbitrary layers and remove
-    ## BUT, we can also have a full preprocessing pass.
-
     let attrs = collect:
       for att in ast.pAttributes:
         $att
 
-    # Parameters
     var params: seq[string]
     for p in ast.pParams:
       params.add gpuTypeToString(p.typ, p.ident, allowEmptyIdent = false)
     var fnArgs = params.join(", ")
     if $attGlobal in attrs:
       doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!"
-      ## XXX: clean this up. Add the global id builtin
-      fnArgs = "@builtin(global_invocation_id) global_id: vec3<u32>"
+      ## XXX: make this more flexible. In theory can be any name
+      fnArgs = "@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(num_workgroups) num_workgroups: vec3<u32>"
     let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs)
 
     result = indentStr & "fn " & fnSig & " {\n"
 
     result &= ctx.genWebGpu(ast.pBody, indent + 1)
-    result &= "\n" & indentStr & "}"
+    result &= '\n' & indentStr & '}'
 
   of gpuBlock:
     result = ""
     if ast.blockLabel.len > 0:
-      result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n"
+      result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n'
     for i, el in ast.statements:
       result.add ctx.genWebGpu(el, indent)
       if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added
-        result.add ";"
+        result.add ';'
       if i < ast.statements.high:
-        result.add "\n"
+        result.add '\n'
     if ast.blockLabel.len > 0:
-      result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n"
+      result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n'
 
   of gpuVar:
     let letOrVar = if ast.vMutable: "var" else: "let"
@@ -619,36 +867,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
         result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight),
                                        ctx.size(ast.aLeft))
     else:
-      proc determineIdent(arg: GpuAst): GpuAst =
-        ## Tries to determine the underlying ident that is contained in this node.
-        ## The issue is the argument to a `gpuCall` can be a complicated expression.
-        ## Depending on the node it may be possible to extract a simple identifier,
-        ## e.g. for `addr(foo)` (`gpuAddr` of `gpuIdent` node) we can get the ident.
-        ## If this fails, we return a `gpuVoid` node.
-        ##
-        ## TODO: Think about if it ever makes sense to extract the ident underlying
-        ## e.g. `deref` and use _that_ to determine mutability & address space.
-        template dfl(): untyped = GpuAst(kind: gpuVoid)
-        case arg.kind
-        of gpuIdent: arg
-        of gpuAddr: arg.aOf.determineIdent()
-        of gpuDeref: arg.dOf.determineIdent()
-        of gpuCall: dfl()
-        of gpuIndex: arg.iArr.determineIdent()
-        of gpuDot: arg.dParent.determineIdent()
-        of gpuLit: dfl()
-        of gpuBinOp: dfl()
-        of gpuBlock: arg.statements[^1].determineIdent()
-        of gpuPrefix: dfl()
-        of gpuConv: dfl()
-        else:
-          raiseAssert "Not implemented to determine mutability from node: " & $arg
-      let leftTyp = ast.aLeft.determineIdent().iTyp
-      if leftTyp.kind == gtPtr and leftTyp.to.kind == gtInt32:
+      let leftId = ast.aLeft.determineIdent()
+      if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32:
         # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was
         # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and
         # we patched the `bool -> i32` and thus need to convert it.
-        result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ")"
+        result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ')'
       else:
         result = indentStr & ctx.genWebGpu(ast.aLeft) & " = " & ctx.genWebGpu(ast.aRight)
 
@@ -662,36 +886,36 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
         result = indentStr & "if (false) {\n"
       else:
         result = indentStr & "if (" & ctx.genWebGpu(ast.ifCond) & ") {\n"
-    result &= ctx.genWebGpu(ast.ifThen, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genWebGpu(ast.ifThen, indent + 1) & '\n'
+    result &= indentStr & '}'
     if ast.ifElse.kind != gpuVoid:
       result &= " else {\n"
-      result &= ctx.genWebGpu(ast.ifElse, indent + 1) & "\n"
-      result &= indentStr & "}"
+      result &= ctx.genWebGpu(ast.ifElse, indent + 1) & '\n'
+      result &= indentStr & '}'
 
   of gpuFor:
     result = indentStr & "for(var " & ast.fVar.ident() & ": i32 = " &
              ctx.genWebGpu(ast.fStart) & "; " &
              ast.fVar.ident() & " < " & ctx.genWebGpu(ast.fEnd) & "; " &
              ast.fVar.ident() & "++) {\n"
-    result &= ctx.genWebGpu(ast.fBody, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genWebGpu(ast.fBody, indent + 1) & '\n'
+    result &= indentStr & '}'
   of gpuWhile:
     ctx.withoutSemicolon:
       result = indentStr & "while (" & ctx.genWebGpu(ast.wCond) & "){\n"
-    result &= ctx.genWebGpu(ast.wBody, indent + 1) & "\n"
-    result &= indentStr & "}"
+    result &= ctx.genWebGpu(ast.wBody, indent + 1) & '\n'
+    result &= indentStr & '}'
 
   of gpuDot:
-    result = ctx.genWebGpu(ast.dParent) & "." & ctx.genWebGpu(ast.dField)
+    result = ctx.genWebGpu(ast.dParent) & '.' & ctx.genWebGpu(ast.dField)
 
   of gpuIndex:
-    result = ctx.genWebGpu(ast.iArr) & "[" & ctx.genWebGpu(ast.iIndex) & "]"
+    result = ctx.genWebGpu(ast.iArr) & '[' & ctx.genWebGpu(ast.iIndex) & ']'
 
   of gpuCall:
     ctx.withoutSemicolon:
-      result = indentStr & ast.cName.ident() & "(" &
-               ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ")"
+      result = indentStr & ast.cName.ident() & '(' &
+               ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ')'
 
   of gpuTemplateCall:
     when nimvm:
@@ -711,15 +935,18 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
       result = ctx.genWebGpu(expandedBody, indent)
 
   of gpuBinOp:
-    result = indentStr & "(" & ctx.genWebGpu(ast.bLeft) & " " &
-             ast.bOp & " " &
-             ctx.genWebGpu(ast.bRight) & ")"
+    ctx.withoutSemicolon:
+      let l = ctx.genWebGpu(ast.bLeft)
+      let r = ctx.genWebGpu(ast.bRight)
+      result = indentStr & '(' & l & ' ' &
+               ctx.genWebGpu(ast.bOp) & ' ' &
+               r & ')'
 
   of gpuIdent:
     result = ast.ident()
 
   of gpuLit:
-    if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\""
+    if ast.lType.kind == gtString: result = '"' & ast.lValue & '"'
     elif ast.lValue == "DEFAULT":
       ## TODO: We could "manually" construct a zero version!
       ## NOTE: There *are* default initializations to zero. Just not for fields that
@@ -732,10 +959,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
   of gpuArrayLit:
     result = "array("
     for i, el in ast.aValues:
-      result.add gpuTypeToString(ast.aLitType) & "(" & el & ")"
+      result.add gpuTypeToString(ast.aLitType) & '(' & ctx.genWebGpu(el) & ')'
       if i < ast.aValues.high:
         result.add ", "
-    result.add ")"
+    result.add ')'
 
   of gpuReturn:
     result = indentStr & "return " & ctx.genWebGpu(ast.rValue)
@@ -744,18 +971,29 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     result = ast.pOp & ctx.genWebGpu(ast.pVal)
 
   of gpuTypeDef:
-    result = "struct " & ast.tName & "{\n"
+    result = "struct " & gpuTypeToString(ast.tTyp) & " {\n"
     for el in ast.tFields:
       result.add "  " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n"
-    result.add "}"
+    result.add '}'
+
+  of gpuAlias:
+    # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped
+    # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons
+    # after alias definitions, but not after `struct`. Hence we add `;` manually here
+    result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ';'
 
   of gpuObjConstr:
-    result = ast.ocName & "("
+    result = gpuTypeToString(ast.ocType) & '('
     for i, el in ast.ocFields:
-      result.add ctx.genWebGpu(el.value)
+      if el.value.kind == gpuLit and el.value.lValue == "DEFAULT":
+        # use type to construct a default value
+        let typStr = gpuTypeToString(el.typ, allowEmptyIdent = true)
+        result.add typStr & "()"
+      else:
+        result.add ctx.genWebGpu(el.value)
       if i < ast.ocFields.len - 1:
         result.add ", "
-    result.add ")"
+    result.add ')'
 
   of gpuInlineAsm:
     raiseAssert "Inline assembly not supported on the WebGPU target."
@@ -764,16 +1002,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
     result = indentStr & "/* " & ast.comment & " */"
 
   of gpuConv:
-    result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & "(" & ctx.genWebGpu(ast.convExpr) & ")"
+    result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & '(' & ctx.genWebGpu(ast.convExpr) & ')'
 
   of gpuCast:
-    result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ")"
+    result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ')'
 
   of gpuAddr:
-    result = "(&" & ctx.genWebGpu(ast.aOf) & ")"
+    result = "(&" & ctx.genWebGpu(ast.aOf) & ')'
 
   of gpuDeref:
-    result = "(*" & ctx.genWebGpu(ast.dOf) & ")"
+    result = "(*" & ctx.genWebGpu(ast.dOf) & ')'
 
   of gpuConstexpr:
     result = indentStr & "const " & ctx.genWebGpu(ast.cIdent) & ": " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " = " & ctx.genWebGpu(ast.cValue)
@@ -812,7 +1050,7 @@ proc codegen*(ctx: var GpuContext): string =
   # 1. Generate the header for all global variables
   for id, g in ctx.globals:
     result.add genGlobal(g)
-  result.add "\n"
+  result.add '\n'
 
   # 2. generate code for the global blocks (types, global vars etc)
   for blk in ctx.globalBlocks:
@@ -822,5 +1060,5 @@ proc codegen*(ctx: var GpuContext): string =
   for fnIdent, fn in ctx.fnTab:
     if fn.isGlobal():
       ## XXX: make adjustable!
-      result.add "@compute @workgroup_size(NUM_WORKGROUPS)\n"
+      result.add "@compute @workgroup_size(WORKGROUP_SIZE)\n"
     result.add ctx.genWebGpu(fn) & "\n\n"
diff --git a/constantine/math_compiler/experimental/builtins/builtins.nim b/constantine/math_compiler/experimental/builtins/builtins.nim
new file mode 100644
index 00000000..da079e83
--- /dev/null
+++ b/constantine/math_compiler/experimental/builtins/builtins.nim
@@ -0,0 +1,19 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# NOTE: For the moment we import and export builtins here for all backends.
+# Once we change the code to make single backends importable on their own,
+# this will be changed and these builtins will be imported/exported in the
+# corresponding CUDA/WGSL etc module the user needs to import.
+import ./common_builtins
+import ./cuda_builtins
+import ./wgsl_builtins
+
+export common_builtins
+export cuda_builtins
+export wgsl_builtins
diff --git a/constantine/math_compiler/experimental/builtins/common_builtins.nim b/constantine/math_compiler/experimental/builtins/common_builtins.nim
new file mode 100644
index 00000000..b9a144a5
--- /dev/null
+++ b/constantine/math_compiler/experimental/builtins/common_builtins.nim
@@ -0,0 +1,46 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+#import std / [macros, strutils, sequtils, options, sugar, tables, strformat, hashes, sets]
+#
+#import ./gpu_types
+#import ./backends/backends
+#import ./nim_to_gpu
+#
+#export gpu_types
+
+template nimonly*(): untyped {.pragma.}
+template cudaName*(s: string): untyped {.pragma.}
+
+## Dummy data for the typed nature of the `cuda` macro. These define commonly used
+## CUDA specific names so that they produce valid Nim code in the context of a typed macro.
+template global*() {.pragma.}
+template device*() {.pragma.}
+template forceinline*() {.pragma.}
+
+## If attached to a function, type or variable it will refer to a built in
+## in the target backend. This is used for all the functions, types and variables
+## defined below to indicate that we do not intend to generate code for them.
+template builtin*() {.pragma.}
+# If attached to a `var` it will be treated as a
+# `__constant__`! Only useful if you want to define a
+# constant without initializing it (and then use
+# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it
+# before executing the kernel)
+template constant*() {.pragma.}
+
+## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its
+## own `extern` pragma (due to requiring an argument it cannot be reused):
+## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma
+template cuExtern*(): untyped {.pragma.}
+template shared*(): untyped {.pragma.}
+template private*(): untyped {.pragma.}
+## You would typically use `cuExtern` and `shared` together:
+## `var x {.cuExtern, shared.}: array[N, Foo]`
+## for example to declare a constant array that is filled by the
+## host before kernel execution.
diff --git a/constantine/math_compiler/experimental/builtins/cuda_builtins.nim b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim
new file mode 100644
index 00000000..5d002f93
--- /dev/null
+++ b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim
@@ -0,0 +1,48 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./common_builtins
+
+type
+  Dim* = cint ## dummy to have access to math
+  NvBlockIdx* = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvBlockDim = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvThreadIdx* = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvGridDim = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+
+
+## These are dummy elements to make CUDA block / thread index / dim
+## access possible in the *typed* `cuda` macro. It cannot be `const`,
+## because then the typed code would evaluate the values before we
+## can work with it from the typed macro.
+let blockIdx* {.builtin.} = NvBlockIdx()
+let blockDim* {.builtin.} = NvBlockDim()
+let gridDim* {.builtin.} = NvGridDim()
+let threadIdx* {.builtin.} = NvThreadIdx()
+
+## Similar for procs. They don't need any implementation, as they won't ever be actually called.
+proc printf*(fmt: string) {.varargs, builtin.} = discard
+proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard
+
+## While you can use `malloc` on device with small sizes, it is usually not
+## recommended to do so.
+proc malloc*(size: csize_t): pointer {.builtin.}  = discard
+proc free*(p: pointer) {.builtin.} = discard
+proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard
diff --git a/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim
new file mode 100644
index 00000000..fe7e11f0
--- /dev/null
+++ b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim
@@ -0,0 +1,26 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ./common_builtins
+
+type
+  DimWgsl = uint32
+  WgslGridDim = object
+    x*: DimWgsl
+    y*: DimWgsl
+    z*: DimWgsl
+
+## WebGPU specific
+let global_id* {.builtin.} = WgslGridDim()
+let num_workgroups* {.builtin.} = WgslGridDim()
+
+## WebGPU select
+proc select*[T](f, t: T, cond: bool): T {.builtin.} =
+  # Implementation to run WebGPU code on CPU
+  if cond: t
+  else: f
diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim
index 8f1aa879..ae5732ad 100644
--- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim
+++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim
@@ -41,13 +41,15 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool =
   case n.typeKind
   of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats
     result = false
-  of ntyObject, ntyArray:
+  of ntyObject:
     if passStructByPointer:
       result = false # regular objects can just be copied!
     else:
       result = true # struct passing by pointer forbidden
     ## NOTE: strictly speaking this is not the case of course! If the object
     ## contains refs, it won't hold!
+  of ntyArray: # statically sized arrays are passed by pointer in CUDA / C++ / C!
+    result = true
   of ntyGenericInst:
     if passStructByPointer:
       let impl = n.getTypeImpl()
@@ -60,6 +62,12 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool =
       result = false
     else:
       result = true
+  of ntyAlias:
+    let impl = n.getTypeInst()
+    if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr":
+      result = false
+    else:
+      result = true
   else:
     result = true
 
diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim
index 0c68e88e..0a8b7362 100644
--- a/constantine/math_compiler/experimental/gpu_compiler.nim
+++ b/constantine/math_compiler/experimental/gpu_compiler.nim
@@ -14,103 +14,25 @@ import ./nim_to_gpu
 
 export gpu_types
 
-template nimonly*(): untyped {.pragma.}
-template cudaName*(s: string): untyped {.pragma.}
+import builtins/builtins # all the builtins for the backend to make the Nim compiler happy
+export builtins
 
-## Dummy data for the typed nature of the `cuda` macro. These define commonly used
-## CUDA specific names so that they produce valid Nim code in the context of a typed macro.
-template global*() {.pragma.}
-template device*() {.pragma.}
-template forceinline*() {.pragma.}
-
-# If attached to a `var` it will be treated as a
-# `__constant__`! Only useful if you want to define a
-# constant without initializing it (and then use
-# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it
-# before executing the kernel)
-template constant*() {.pragma.}
-type
-  Dim* = cint ## dummy to have access to math
-  NvBlockIdx* = object
-    x*: Dim
-    y*: Dim
-    z*: Dim
-  NvBlockDim = object
-    x*: Dim
-    y*: Dim
-    z*: Dim
-  NvThreadIdx* = object
-    x*: Dim
-    y*: Dim
-    z*: Dim
-  NvGridDim = object
-    x*: Dim
-    y*: Dim
-    z*: Dim
-
-  DimWgsl = uint32
-  WgslGridDim = object
-    x*: DimWgsl
-    y*: DimWgsl
-    z*: DimWgsl
-
-
-## These are dummy elements to make CUDA block / thread index / dim
-## access possible in the *typed* `cuda` macro. It cannot be `const`,
-## because then the typed code would evaluate the values before we
-## can work with it from the typed macro.
-let blockIdx* = NvBlockIdx()
-let blockDim* = NvBlockDim()
-let gridDim* = NvGridDim()
-let threadIdx* = NvThreadIdx()
-
-## WebGPU specific
-let global_id* = WgslGridDim()
-
-## Similar for procs. They don't need any implementation, as they won't ever be actually called.
-proc printf*(fmt: string) {.varargs.} = discard
-proc memcpy*(dst, src: pointer, size: int) = discard
-
-## WebGPU select
-proc select*[T](f, t: T, cond: bool): T =
-  # Implementation to run WebGPU code on CPU
-  if cond: t
-  else: f
-
-## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its
-## own `extern` pragma (due to requiring an argument it cannot be reused):
-## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma
-template cuExtern*(): untyped {.pragma.}
-template shared*(): untyped {.pragma.}
-template private*(): untyped {.pragma.}
-## You would typically use `cuExtern` and `shared` together:
-## `var x {.cuExtern, shared.}: array[N, Foo]`
-## for example to declare a constant array that is filled by the
-## host before kernel execution.
-
-## While you can use `malloc` on device with small sizes, it is usually not
-## recommended to do so.
-proc malloc*(size: csize_t): pointer  = discard
-proc free*(p: pointer) = discard
-proc syncthreads*() {.cudaName: "__syncthreads".} = discard
-
-macro toGpuAst*(body: typed): GpuAst =
-  ## WARNING: The following are *not* supported:
-  ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between
-  ##         what is a field access and a function call. Hence we assume any `nnkDotExpr`
-  ##         is actually a field access!
-  ## - most regular Nim features :)
-  echo body.treerepr
-  echo body.repr
+macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) =
+  ## Converts the body of this macro into a `GpuAst` from where it can be converted
+  ## into CUDA or WGSL code at runtime.
   var ctx = GpuContext()
-  newLit(ctx.toGpuAst(body))
+  let ast = ctx.toGpuAst(body)
+  let genProcs = toSeq(ctx.genericInsts.values)
+  let genTypes = toSeq(ctx.types.values)
+  let g = GpuGenericsInfo(procs: genProcs, types: genTypes)
+  newLit((g, ast))
 
 macro cuda*(body: typed): string =
-  ## WARNING: The following are *not* supported:
-  ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between
-  ##         what is a field access and a function call. Hence we assume any `nnkDotExpr`
-  ##         is actually a field access!
-  ## - most regular Nim features :)
+  ## Converts the body of this macro into a `GpuAst` and from there into a string of
+  ## CUDA or WGSL code.
+  ##
+  ## TODO: make `cuda` choose CUDA backend, `wgsl` WGSL etc. Need to change code
+  ## that chooses backend etc.
   #echo body.treerepr
   var ctx = GpuContext()
   let gpuAst = ctx.toGpuAst(body)
@@ -119,13 +41,22 @@ macro cuda*(body: typed): string =
   let body = ctx.codegen(gpuAst)
   result = newLit(body)
 
-proc codegen*(ast: GpuAst, kernel: string = ""): string =
+proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string =
   ## Generates the code based on the given AST (optionally at runtime) and restricts
   ## it to a single global kernel (WebGPU) if any given.
-  let ast = ast.clone() ## XXX: remove clone
   var ctx = GpuContext()
+  for fn in gen.procs: # assign generics info to correct table
+    ctx.genericInsts[fn.pName] = fn
+  for typ in gen.types: # assign generics info to correct table
+    case typ.kind
+    of gpuTypeDef:
+      ctx.types[typ.tTyp] = typ
+    of gpuAlias:
+      ctx.types[typ.aTyp] = typ
+    else: raiseAssert "Unexpected node kind assigning to `types`: " & $typ
   result = ctx.codegen(ast, kernel)
 
+
 when isMainModule:
   # Mini example
   let kernel = cuda:
diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim
index c16bc900..cc96abc3 100644
--- a/constantine/math_compiler/experimental/gpu_field_ops.nim
+++ b/constantine/math_compiler/experimental/gpu_field_ops.nim
@@ -189,52 +189,58 @@ template defWGSLHelpers*(): untyped {.dirty.} =
   ## Global variable to simulate carry flag. Private == one for each thread
   var carry_flag {.private.}: uint32 = 0'u32
 
-  # Add with carry out (sets carry flag)
   proc add_co(a: uint32, b: uint32): uint32 {.device.} =
+    # Add with carry out (sets carry flag)
     let result = a + b
     # Check for overflow: carry occurs if result < a (or result < b)
     carry_flag = select(0'u32, 1'u32, result < a)
     return result
 
-  # Add with carry in and carry out
   proc add_cio(a: uint32, b: uint32): uint32 {.device.} =
+    # Add with carry in and carry out
     let temp = a + b
     let result = temp + carry_flag
     # Carry out if: temp overflowed OR (temp + carry overflowed)
     carry_flag = select(0'u32, 1'u32, (temp < a) or (result < temp))
     return result
 
-  # Add with carry in only
   proc add_ci(a: uint32, b: uint32): uint32 {.device.} =
+    # Add with carry in only.
+    # NOTE: `carry_flag` is not reset, because the next call after
+    # an `add_ci` *must* be `add_co` or `sub_bo`, but never
+    # `add/sub_cio/ci`!
     let temp = a + b
     let result = temp + carry_flag
     # Don't update carry flag for this operation
     return result
 
-  # Subtract with borrow out (sets borrow flag)
   proc sub_bo(a: uint32, b: uint32): uint32 {.device.} =
+    # Subtract with borrow out (sets borrow flag)
     let result = a - b
     # Borrow occurs if a < b
     carry_flag = select(0'u32, 1'u32, a < b)
     return result
 
-  # Subtract with borrow in only
-  proc sub_bi(a: uint32, b: uint32): uint32 {.device.} =
+  proc sub_bio(a: uint32, b: uint32): uint32 {.device.} =
+    # Subtract with borrow in and borrow out
+    # NOTE: `carry_flag` is not reset, because the next call after
+    # an `add_ci` *must* be `add_co` or `sub_bo`, but never
+    # `add/sub_cio/ci`!
     let temp = a - b
     let result = temp - carry_flag
-    # Don't update carry flag for this operation
+    # Borrow out if: a < b OR (temp - borrow underflowed)
+    carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag))
     return result
 
-  # Subtract with borrow in and borrow out
-  proc sub_bio(a: uint32, b: uint32): uint32 {.device.} =
+  proc sub_bi(a: uint32, b: uint32): uint32 {.device.} =
+    # Subtract with borrow in only
     let temp = a - b
     let result = temp - carry_flag
-    # Borrow out if: a < b OR (temp - borrow underflowed)
-    carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag))
+    # Don't update carry flag for this operation
     return result
 
-  # Select based on condition (equivalent to PTX slct)
   proc slct(a: uint32, b: uint32, pred: int32): uint32 {.device.} =
+    # Select based on condition (equivalent to PTX slct)
     return select(b, a, pred >= 0)
 
   proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} =
@@ -261,8 +267,8 @@ template defWGSLHelpers*(): untyped {.dirty.} =
 
     return p3 + (p1 shr 16) + (p2 shr 16) + carry
 
-  # r <- a * b + c (multiply-add low)
   proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} =
+    # r <- a * b + c (multiply-add low)
     return mul_lo(a, b) + c
 
   proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} =
@@ -280,8 +286,8 @@ template defWGSLHelpers*(): untyped {.dirty.} =
     let product = mul_lo(a, b)
     return add_cio(product, c)
 
-  # r <- (a * b) >> 32 + c (multiply-add high)
   proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} =
+    # r <- (a * b) >> 32 + c (multiply-add high)
     return mul_hi(a, b) + c
 
   proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} =
@@ -300,6 +306,113 @@ template defWGSLHelpers*(): untyped {.dirty.} =
     return add_cio(hi_product, c)
 
 
+template defBigIntCompare*(): untyped {.dirty.} =
+  ## This template adds a comparison operator for BigInts `<` (which is rewritten to
+  ## a function call `less`) as well as a `toCanonical` function to turn a Montgomery
+  ## representation into a canonical representation.
+  ## It is included in the `defCoreFieldOps` by default, so you need not manually use it.
+
+  proc less(a, b: BigInt): bool {.device.} =
+    ## Returns true if a < b for two big ints in *canonical*
+    ## representation.
+    ##
+    ## NOTE: The inputs are compared *as is*. That means if they are
+    ## in Montgomery representation the result will not reflect the
+    ## ordering relation of their associated canonical values!
+    ## Call `toCanonical` on field elements in Montgomery order before
+    ## comparing them.
+    ##
+    ## Comparison is constant-time
+    var borrow: uint32
+    # calculate sub with borrows for side effect. Use borrow flag
+    # at the end to determine if value was smaller
+    discard sub_bo(a[0], b[0])
+    staticFor i, 1, a.len:
+      discard sub_bio(a[i], b[i])
+    borrow = sub_bi(0'u32, 0'u32)
+    return borrow.bool
+
+  # template to rewrite `<` into a function call. Most backends don't allow custom operators
+  template `<`(b1, b2: BigInt): untyped = less(b1, b2)
+
+  proc muladd1_gpu(hi, lo: var uint32, a, b, c: uint32) {.device, forceinline.} =
+    ## Extended precision multiplication + addition
+    ## (hi, lo) <- a*b + c
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding any c cannot overflow
+    ##
+    ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd1`
+    lo = mulloadd_co(a, b, c)     # low part of a*b + c with carry out
+    hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in
+
+  proc muladd2_gpu(hi, lo: var uint32, a, b, c1, c2: uint32) {.device, forceinline.} =
+    ## Extended precision multiplication + addition + addition
+    ## (hi, lo) <- a*b + c1 + c2
+    ##
+    ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+    ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+    ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+    ##
+    ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd2`
+    lo = mulloadd_co(a, b, c1)    # low part of a*b + c1 with carry out
+    hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in
+    # Add c2 with carry propagation
+    lo = add_co(lo, c2)
+    hi = add_ci(hi, 0'u32)
+
+  proc sub_no_mod(a, b: BigInt): BigInt {.device.} =
+    ## Generate an optimized substraction kernel
+    ## with parameters `a, b, modulus: Limbs -> Limbs`
+    ## I.e. this does _not_ perform modular reduction.
+    var t = BigInt()
+    t[0] = sub_bo(a[0], b[0])
+    staticFor i, 1, a.len:
+      t[i] = sub_bio(a[i], b[i])
+    return t
+
+  proc sub_no_mod(r: var BigInt, a, b: BigInt) {.device.} =
+    ## Subtraction of two finite field elements stored in `a` and `b`
+    ## *without* modular reduction.
+    ## The result is stored in `r`.
+    r = sub_no_mod(a, b)
+
+  proc csub_no_mod(r: var BigInt, a: BigInt, condition: bool) {.device.} =
+    ## Conditionally subtract `a` from `r` in place *without* modular
+    ## reduction.
+    ##
+    ## Note: This is constant-time
+    var t = BigInt()
+    t.sub_no_mod(r, a)
+    r.ccopy(t, condition)
+
+  proc fromMont_CIOS(r: var BigInt, a, M: BigInt, m0ninv: uint32) {.device.} =
+    ## Convert from Montgomery form to canonical BigInt form
+    # for i in 0 .. n-1:
+    #   m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication)
+    #   C, _ = t[0] + m * M[0]
+    #   for j in 1 ..n-1:
+    #     (C, t[j-1]) <- r[j] + m*M[j] + C
+    #   t[n-1] = C
+
+    var t = a # Ensure working in registers
+
+    staticFor i, 0, N:
+      let m = t[0] * m0ninv
+      var C, lo: uint32
+      muladd1_gpu(C, lo, m, M[0], t[0])
+      staticFor j, 1, N:
+        muladd2_gpu(C, t[j-1], m, M[j], C, t[j])
+      t[N-1] = C
+
+    t.csub_no_mod(M, not (t < M))
+    r = t
+
+  proc toCanonical(b: BigInt): BigInt {.device.} =
+    var canon: BigInt
+    canon.fromMont_CIOS(b, M, M0NInv)
+    return canon
+
 template defCoreFieldOps*(T: typed): untyped {.dirty.} =
   # Need to get the limbs & spare bits data in a static context
   template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32
@@ -314,7 +427,11 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
   const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2))
   const M0NInv = getM0ninv().uint32
 
-  proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} =
+  # `ccopy` needed for BigInt comparison logic
+  proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.}
+  defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts
+
+  proc finalSubMayOverflow(a, M: BigInt, overflowedLimbs: uint32): BigInt {.device.} =
     ## If a >= Modulus: r <- a-M
     ## else:            r <- a
     ##
@@ -325,9 +442,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
     var scratch: BigInt = BigInt()
 
-    # Contains 0x0001 (if overflowed limbs) or 0x0000
-    let overflowedLimbs = add_ci(0'u32, 0'u32)
-
     # Now substract the modulus, and test a < M with the last borrow
     scratch[0] = sub_bo(a[0], M[0])
     staticFor i, 1, N:
@@ -337,9 +451,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     # 2. if a >= M, underflowedModulus >= 0
     # if underflowedModulus >= 0: a-M else: a
     # TODO: predicated mov instead?
-    ## TODO: Fix this. `slct` needs a negative value for the else branch
     let underflowedModulus = sub_bi(overflowedLimbs, 0'u32)
-
     var r: BigInt = BigInt()
     staticFor i, 0, N:
       r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus))
@@ -362,9 +474,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
       scratch[i] = sub_bio(a[i], M[i])
 
     # If it underflows here, `a` was smaller than the modulus, which is what we want
-    ## TODO: Fix this. `slct` needs a negative value for the else branch
     let underflowedModulus = sub_bi(0'u32, 0'u32)
-
     var r: BigInt = BigInt()
     staticFor i, 0, N:
       r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus))
@@ -373,18 +483,21 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
   proc modadd(a, b, M: BigInt): BigInt {.device.} =
     ## Generate an optimized modular addition kernel
     ## with parameters `a, b, modulus: Limbs -> Limbs`
-    # try to add two bigints
     var t = BigInt() # temporary
 
     t[0] = add_co(a[0], b[0])
     staticFor i, 1, N:
       t[i] = add_cio(a[i], b[i])
 
-    # can use `when` of course!
-    when spareBits() >= 1: # if spareBits() >= 1: # would also work
+    when spareBits() >= 1:
       t = finalSubNoOverflow(t, M)
     else:
-      t = finalSubMayOverflow(t, M)
+      # Contains 0x0001 (if overflowed limbs) or 0x0000
+      # This _must_ be computed here and not inside of `finalSubMayOverflow`. In a
+      # debug build on CUDA the carry flag would (potentially) be reset going into
+      # the function.
+      let overflowedLimbs = add_ci(0'u32, 0'u32)
+      t = finalSubMayOverflow(t, M, overflowedLimbs)
 
     return t
 
@@ -398,7 +511,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
       t[i] = sub_bio(a[i], b[i])
 
     let underflowMask = sub_bi(0'u32, 0'u32)
-
     # If underflow
     # TODO: predicated mov instead?
     var maskedM: BigInt = BigInt()
@@ -410,7 +522,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
       t[i] = add_cio(t[i], maskedM[i])
     when N > 1:
       t[N-1] = add_ci(t[N-1], maskedM[N-1])
-
     return t
 
   proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} =
@@ -576,13 +687,8 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     ## The result is stored in `r`.
     r = modsub(a, b, M)
 
-  proc mul(r: var BigInt, a, b: BigInt) {.device.} =
-    ## Multiplication of two finite field elements stored in `a` and `b`.
-    ## The result is stored in `r`.
-    r = mtymul_CIOS_sparebit(a, b, M, true)
-
   proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} =
-    ## Conditional copy in CUDA
+    ## Conditional copy.
     ## If condition is true: b is copied into a
     ## If condition is false: a is left unmodified
     ##
@@ -599,7 +705,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
       a[i] = slct(b[i], a[i], cond)
 
   proc csetZero(r: var BigInt, condition: bool) {.device.} =
-    ## Conditionally set `r` to zero in CUDA
+    ## Conditionally set `r` to zero.
     ##
     ## Note: This is constant-time
     var t = BigInt()
@@ -607,14 +713,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r.ccopy(t, condition)
 
   proc csetOne(r: var BigInt, condition: bool) {.device.} =
-    ## Conditionally set `r` to one in CUDA
+    ## Conditionally set `r` to one.
     ##
     ## Note: This is constant-time
     template mOne: untyped = MontyOne
     r.ccopy(mOne, condition)
 
   proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} =
-    ## Conditionally add `a` to `r` in place in CUDA.
+    ## Conditionally add `a` to `r` in place..
     ##
     ## Note: This is constant-time
     var t = BigInt()
@@ -622,7 +728,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r.ccopy(t, condition)
 
   proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} =
-    ## Conditionally subtract `a` from `r` in place in CUDA.
+    ## Conditionally subtract `a` from `r` in place.
     ##
     ## Note: This is constant-time
     var t = BigInt()
@@ -630,14 +736,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r.ccopy(t, condition)
 
   proc doubleElement(r: var BigInt, a: BigInt) {.device.} =
-    ## Double `a` and store it in `r` in CUDA.
+    ## Double `a` and store it in `r`.
     ##
     ## Note: This is constant-time
     r.add(a, a)
 
   proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} =
-    ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`
-    ## in CUDA.
+    ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`.
     ##
     ## Note: This is constant-time
     ##
@@ -649,7 +754,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true)
 
   proc isZero(r: var bool, a: BigInt) {.device.} =
-    ## Checks if `a` is zero in CUDA. Result is written to `r`.
+    ## Checks if `a` is zero. Result is written to `r`.
     ##
     ## Note: This is constant-time
     #r = true
@@ -660,8 +765,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
       isZero = isZero or a[i]
     r = isZero == 0'u32
 
+  proc isZero(a: BigInt): bool {.device, forceinline.} =
+    result.isZero(a)
+  proc isNonZero(a: BigInt): bool {.device, forceinline.} =
+    result = not isZero(a)
+
   proc isOdd(r: var bool, a: BigInt) {.device.} =
-    ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`.
+    ## Checks if the Montgomery value of `a` is odd. Result is written to `r`.
     ##
     ## IMPORTANT: The canonical value may or may not be odd if the Montgomery
     ## representation is odd (and vice versa!).
@@ -671,7 +781,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r = (a[0] and 1'u32).bool
 
   proc neg(r: var BigInt, a: BigInt) {.device.} =
-    ## Computes the negation of `a` and stores it in `r` in CUDA.
+    ## Computes the negation of `a` and stores it in `r`.
     ##
     ## Note: This is constant-time
     # Check if input is zero
@@ -687,14 +797,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
 
   proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} =
     ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise
-    ## copy over `a` into `r` in CUDA.
+    ## copy over `a` into `r`.
     ##
     ## Note: This is constant-time
     r.neg(a)
     r.ccopy(a, not condition)
 
   proc shiftRight(r: var BigInt, k: uint32) {.device.} =
-    ## Shift `r` right by `k` bits in-nplace in CUDA.
+    ## Shift `r` right by `k` bits in-nplace.
     ##
     ## k MUST be less than the base word size (2^31)
     ##
@@ -716,7 +826,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
     r[lastIdx] = r[lastIdx] shr k
 
   proc div2(r: var BigInt) {.device.} =
-    ## Divide `r` by 2 in-place in CUDA.
+    ## Divide `r` by 2 in-place.
     ##
     ## Note: This is constant-time
     # check if the input is odd
@@ -728,3 +838,71 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} =
 
     # if it was odd, add `M+1/2` to go 'half-way around'
     r.cadd(PP1D2, isO)
+
+  proc mul_lohi(hi, lo: var uint32, a, b: uint32) {.device, forceinline.} =
+    lo = mul_lo(a, b)
+    hi = mul_hi(a, b)
+
+  proc mulAcc(t, u, v: var uint32, a, b: uint32) {.device, forceinline.} =
+    ## (t, u, v) <- (t, u, v) + a * b
+    v = mulloadd_co(a, b, v)     # v = (a*b).low + v, with carry out
+    u = mulhiadd_cio(a, b, u)    # u = (a*b).high + u + carry, with carry out
+    t = add_ci(t, 0'u32)         # t = t + carry
+
+  proc mtymul_FIPS(a, b, M: BigInt, lazyReduce: static bool = false): BigInt {.device.} =
+    ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS).
+    ## This implementation can be used for fields that do not have any spare bits.
+    ##
+    ## This maps
+    ## - [0, 2p) -> [0, 2p) with lazyReduce
+    ## - [0, 2p) -> [0, p) without
+    ##
+    ## lazyReduce skips the final substraction step.
+    # - Architectural Enhancements for Montgomery
+    #   Multiplication on Embedded RISC Processors
+    #   Johann Großschädl and Guy-Armand Kamendje, 2003
+    #   https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf
+    #
+    # - New Speed Records for Montgomery Modular
+    #   Multiplication on 8-bit AVR Microcontrollers
+    #   Zhe Liu and Johann Großschädl, 2013
+    #   https://eprint.iacr.org/2013/882.pdf
+    template m0ninv: untyped = M0NInv
+    var z = BigInt() # zero-init, ensure on stack and removes in-place problems in tower fields
+    const L = a.len
+    var t, u, v = 0'u32
+
+    staticFor i, 0, L:
+      staticFor j, 0, i:
+        mulAcc(t, u, v, a[j], b[i-j])
+        mulAcc(t, u, v, z[j], M[i-j])
+      mulAcc(t, u, v, a[i], b[0])
+      z[i] = v * m0ninv
+      mulAcc(t, u, v, z[i], M[0])
+      v = u
+      u = t
+      t = 0'u32
+
+    staticFor i, L, 2*L:
+      staticFor j, i-L+1, L:
+        mulAcc(t, u, v, a[j], b[i-j])
+        mulAcc(t, u, v, z[j], M[i-j])
+      z[i-L] = v
+      v = u
+      u = t
+      t = 0'u32
+
+    when not lazyReduce:
+      let cond = v != 0 or not(z < M)
+      # conditionally subtract using *non modular subtraction*. If `cond == true`,
+      # we are in `M <= z <= 2M` and can safely subtract `M`.
+      z.csub_no_mod(M, cond)
+    return z
+
+  proc mul(r: var BigInt, a, b: BigInt) {.device.} =
+    ## Multiplication of two finite field elements stored in `a` and `b`.
+    ## The result is stored in `r`.
+    when spareBits() >= 1:
+      r = mtymul_CIOS_sparebit(a, b, M, true)
+    else: # e.g. Goldilocks
+      r = mtymul_FIPS(a, b, M, false)
diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim
index 448a783e..834d8fa5 100644
--- a/constantine/math_compiler/experimental/gpu_types.nim
+++ b/constantine/math_compiler/experimental/gpu_types.nim
@@ -33,6 +33,7 @@ type
     gpuDot          # Member access (a.b)
     gpuIndex        # Array indexing (a[b])
     gpuTypeDef      # Type definition
+    gpuAlias        # A type alias
     gpuObjConstr    # Object (struct) constructor
     gpuInlineAsm    # Inline assembly (PTX)
     gpuAddr         # Address of an expression
@@ -45,18 +46,23 @@ type
   GpuTypeKind* = enum
     gtVoid,
     gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics
-    gtArray,     # Static array `array[N, dtype]` -> `dtype[N]`
+    gtArray,       # Static array `array[N, dtype]` -> `dtype[N]`
     gtString,
-    gtObject,    # Struct types
-    gtPtr,       # Pointer type, carries inner type
-    gtUA,        # UncheckedArray (UA) mapped to runtime sized arrays
-    gtVoidPtr    # Opaque void pointer
+    gtObject,      # Struct types
+    gtPtr,         # Pointer type, carries inner type
+    gtUA,          # UncheckedArray (UA) mapped to runtime sized arrays
+    gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!)
+    gtVoidPtr      # Opaque void pointer
+    gtInvalid      # Can be returned to indicate a call to `nimToGpuType` failed to determine a type
+                   ## XXX: make this the default value and replace all `gtVoid` placeholders by it
+
 
   GpuTypeField* = object
     name*: string
     typ*: GpuType
 
   GpuType* = ref object
+    builtin*: bool ## Whether the type refers to a builtin type or not
     case kind*: GpuTypeKind
     of gtPtr:
       to*: GpuType # `ptr T` points to `to`
@@ -74,6 +80,10 @@ type
       aLen*: int     # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time
                     # On both CUDA and WebGPU a length of `0` is also used to generate `int foo[]` (CUDA)
                     # `array<foo>` (WebGPU) (runtime sized arrays), which are generated from `ptr UncheckedArray[float32]` for example.
+    of gtGenericInst:
+      gName*: string # name of the generic type
+      gArgs*: seq[GpuType] # list of the instantiated generic arguments e.g. `vec3<f32>` on WGSL backend
+      gFields*: seq[GpuTypeField] # same as `oFields` for `gtObject`
     else: discard
 
   GpuAttribute* = enum
@@ -97,7 +107,9 @@ type
       pParams*: seq[GpuParam]
       pBody*: GpuAst
       pAttributes*: set[GpuAttribute] # order not important, hence set
+      forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration
     of gpuCall:
+      cIsExpr*: bool ## If the call returns a value
       cName*: GpuAst ## Will be a `GpuIdent`
       cArgs*: seq[GpuAst]
     of gpuTemplateCall:
@@ -115,8 +127,10 @@ type
       wCond*: GpuAst
       wBody*: GpuAst
     of gpuBinOp:
-      bOp*: string
+      bOp*: GpuAst # `gpuIdent` of the binary operation
       bLeft*, bRight*: GpuAst
+      # types of left and right nodes. Determined from Nim symbol associated with `bOp`
+      bLeftTyp*, bRightTyp*: GpuType
     of gpuVar:
       vName*: GpuAst ## Will be a `GpuIdent`
       vType*: GpuType
@@ -141,9 +155,10 @@ type
       cValue*: GpuAst # not just a string to support different types easily
       cType*: GpuType
     of gpuArrayLit:
-      aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array
+      aValues*: seq[GpuAst]
       aLitType*: GpuType # type of first element
     of gpuBlock:
+      isExpr*: bool ## Whether this block represents an expression, i.e. it returns something
       blockLabel*: string # optional name of the block. If any given, will open a `{ }` scope in CUDA
       statements*: seq[GpuAst]
       ## XXX: we could add a `locals` argument here, which would refer to all local variables
@@ -159,10 +174,14 @@ type
       pOp*: string
       pVal*: GpuAst
     of gpuTypeDef:
-      tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols
+      tTyp*: GpuType ## the actual type. Used to generate the name
       tFields*: seq[GpuTypeField]
+    of gpuAlias:
+      aTyp*: GpuType ## Name of the type alias
+      aTo*: GpuAst ## Type the alias maps to
+      aDistinct*: bool ## If the alias is a distinct type in Nim.
     of gpuObjConstr:
-      ocName*: string  # type we construct
+      ocType*: GpuType  # type we construct
       ## XXX: it would be better if we already fill the fields with default values here
       ocFields*: seq[GpuFieldInit] # the fields we initialize
     of gpuInlineAsm:
@@ -212,12 +231,17 @@ type
   GpuFieldInit* = object
     name*: string
     value*: GpuAst
+    typ*: GpuType
 
   ## XXX: UNUSED
   TemplateInfo* = object
     params*: seq[string]
     body*: GpuAst
 
+  GpuProcSignature* = object
+    params*: seq[GpuParam]
+    retType*: GpuType
+
   GpuContext* = object
     ## XXX: need table for generic invocations. Then when we encounter a type, need to map to
     ## the specific version
@@ -245,6 +269,47 @@ type
     #                        ## when we finish, we pop. Before we pop, we assign the variable definitions to the `gpuBlock`
     #                        ## `locals`
     genSymCount*: int ## increases for every generated identifier (currently only underscore `_`), hence the basic solution
+    ## Maps a struct type and field name, which is of pointer type to the value the user assigns
+    ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()`
+    ## constructor (WebGPU only).
+    structsWithPtrs*: Table[(GpuType, string), GpuAst]
+    ## Set of all generic proc names we have encountered in Nim -> GpuAst. When
+    ## we see an `nnkCall` we check if we call a generic function. If so, look up
+    ## the instantiated generic, parse it and store in `genericInsts` below.
+    generics*: HashSet[string]
+
+    ## Stores the unique identifiers (keys) and the implementations of the
+    ## precise generic instantiations that are called.
+    genericInsts*: OrderedTable[GpuAst, GpuAst]
+
+    ## Table of procs and their signature to avoid looping infinitely for recursive procs
+    ## Arguments are:
+    ## - Key: ident of the proc
+    ## - Value: signature of the (possibly generic) instantiation
+    processedProcs*: OrderedTable[GpuAst, GpuProcSignature]
+
+    ## Storse all builtin / nimonly / importc / ... functions we encounter so that we can
+    ## check if they return a value when we encounter them in a `gpuCall`
+    builtins*: OrderedTable[GpuAst, GpuAst]
+
+    ## Table of all known types. Filled during Nim -> GpuAst. Includes generic
+    ## instantiations, but also all other types.
+    ## Key: the raw type. Value: a full `gpuTypeDef`
+    types*: OrderedTable[GpuType, GpuAst]
+
+    ## This is _effectively_ just a set of all already produced function symbols.
+    ## We use it to determine if when encountering another function with the same
+    ## name, but different arguments to instead of using `iName` to use `iSym` as
+    ## the function name. This is to avoid overload issues in backends that don't
+    ## allow overloading by function signatures.
+    symChoices*: HashSet[string]
+
+  ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we
+  ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext`
+  ## causes trouble.
+  GpuGenericsInfo* = object
+    procs*: seq[GpuAst]
+    types*: seq[GpuAst]
 
   GenericArg* = object
     addrSpace*: AddressSpace ## We store the address space, because that's what matters
@@ -274,6 +339,12 @@ proc clone*(typ: GpuType): GpuType =
   of gtArray:
     result.aTyp = typ.aTyp.clone()
     result.aLen = typ.aLen
+  of gtGenericInst:
+    result.gName = typ.gName
+    for g in typ.gArgs:
+      result.gArgs.add g.clone()
+    for f in typ.gFields:
+      result.gFields.add GpuTypeField(name: f.name, typ: f.typ.clone())
   else: discard
 
 proc clone*(ast: GpuAst): GpuAst =
@@ -293,8 +364,10 @@ proc clone*(ast: GpuAst): GpuAst =
       result.pParams.add(clonedParam)
     result.pBody = ast.pBody.clone()
     result.pAttributes = ast.pAttributes
+    result.forwardDeclare = result.forwardDeclare
   of gpuCall:
     result = GpuAst(kind: gpuCall)
+    result.cIsExpr = ast.cIsExpr
     result.cName = ast.cName.clone()
     for arg in ast.cArgs:
       result.cArgs.add(arg.clone())
@@ -320,9 +393,11 @@ proc clone*(ast: GpuAst): GpuAst =
     result.wBody = ast.wBody.clone()
   of gpuBinOp:
     result = GpuAst(kind: gpuBinOp)
-    result.bOp = ast.bOp
+    result.bOp = ast.bOp.clone()
     result.bLeft = ast.bLeft.clone()
     result.bRight = ast.bRight.clone()
+    result.bLeftTyp = ast.bLeftTyp.clone()
+    result.bRightTyp = ast.bRightTyp.clone()
   of gpuVar:
     result = GpuAst(kind: gpuVar)
     result.vName = ast.vName.clone()
@@ -354,7 +429,8 @@ proc clone*(ast: GpuAst): GpuAst =
     result.cType = ast.cType.clone()
   of gpuArrayLit:
     result = GpuAst(kind: gpuArrayLit)
-    result.aValues = ast.aValues
+    for a in ast.aValues:
+      result.aValues.add a.clone()
     result.aLitType = ast.aLitType.clone()
   of gpuPrefix:
     result = GpuAst(kind: gpuPrefix)
@@ -362,6 +438,7 @@ proc clone*(ast: GpuAst): GpuAst =
     result.pVal = ast.pVal.clone()
   of gpuBlock:
     result = GpuAst(kind: gpuBlock)
+    result.isExpr = ast.isExpr
     result.blockLabel = ast.blockLabel
     for stmt in ast.statements:
       result.statements.add(stmt.clone())
@@ -378,14 +455,24 @@ proc clone*(ast: GpuAst): GpuAst =
     result.iIndex = ast.iIndex.clone()
   of gpuTypeDef:
     result = GpuAst(kind: gpuTypeDef)
-    result.tName = ast.tName
+    result.tTyp = ast.tTyp.clone()
     for f in ast.tFields:
       result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone()))
+  of gpuAlias:
+    result = GpuAst(kind: gpuAlias)
+    result.aTyp = ast.aTyp.clone()
+    result.aTo = ast.aTo.clone()
   of gpuObjConstr:
     result = GpuAst(kind: gpuObjConstr)
-    result.ocName = ast.ocName
+    result.ocType = ast.ocType.clone()
     for f in ast.ocFields:
-      result.ocFields.add(GpuFieldInit(name: f.name, value: f.value.clone()))
+      result.ocFields.add(
+        GpuFieldInit(
+          name: f.name,
+          value: f.value.clone(),
+          typ: f.typ.clone()
+        )
+      )
   of gpuInlineAsm:
     result = GpuAst(kind: gpuInlineAsm)
     result.stmt = ast.stmt
@@ -423,6 +510,12 @@ proc hash*(t: GpuType): Hash =
   of gtArray:
     h = h !& hash(t.aTyp)
     h = h !& hash(t.aLen)
+  of gtGenericInst:
+    h = h !& hash(t.gName)
+    for g in t.gArgs:
+      h = h !& hash(g)
+    for f in t.gFields:
+      h = h !& hash(f)
   else: discard
   result = !$ h
 
@@ -451,6 +544,15 @@ proc `==`*(a, b: GpuType): bool =
       else:
         for i in 0 ..< a.oFields.len:
           result = result and (a.oFields[i] == b.oFields[i])
+    of gtGenericInst:
+      result = a.gName == b.gName
+      if a.gArgs.len != b.gArgs.len: result = false
+      elif a.gFields.len != b.gFields.len: result = false
+      else:
+        for i in 0 ..< a.gArgs.len:
+          result = result and (a.gArgs[i] == b.gArgs[i])
+        for i in 0 ..< a.gFields.len:
+          result = result and (a.gFields[i] == b.gFields[i])
     of gtArray: result = a.aTyp == b.aTyp and a.aLen == b.aLen
     else: discard
 
@@ -461,6 +563,17 @@ proc `==`*(a, b: GpuAst): bool =
   else:
     result = a.iSym == b.iSym and a.iTyp == b.iTyp and a.symbolKind == b.symbolKind
 
+proc `==`*(a, b: GpuProcSignature): bool =
+  if a.retType != b.retType: result = false
+  elif a.params.len != b.params.len:
+    result = false
+  else:
+    result = true
+    for i in 0 ..< a.params.len:
+      let ap = a.params[i]
+      let bp = b.params[i]
+      result = result and (ap == bp)
+
 proc len*(ast: GpuAst): int =
   case ast.kind
   of gpuProc:      1
@@ -496,6 +609,31 @@ proc removePrefix(s, p: string): string =
   result = s
   result.removePrefix(p)
 
+proc pretty*(t: GpuType): string =
+  ## returns a flat (but lossy) string representation of the type
+  if t == nil:
+    result = "GpuType(nil)"
+  else:
+    case t.kind
+    of gtPtr:
+      result = if t.implicit: "var " else: "ptr "
+      result.add pretty(t.to)
+    of gtUA:
+      result = "UncheckedArray[" & t.uaTo.pretty() & "]"
+    of gtObject:
+      result = t.name # just the name
+    of gtArray:
+      result = "array[" & $t.aLen & ", " & t.aTyp.pretty() & "]"
+    of gtGenericInst:
+      result = t.gName & "["
+      for i, g in t.gArgs:
+        result.add pretty(g)
+        if i < t.gArgs.high:
+          result.add ", "
+      result.add "]"
+    else:
+      result = ($t.kind).removePrefix("gt")
+
 proc pretty*(n: GpuAst, indent: int = 0): string =
   template id(): untyped = repeat(" ", indent)
   template idn(x): untyped = repeat(" ", indent) & $x
@@ -545,17 +683,17 @@ proc pretty*(n: GpuAst, indent: int = 0): string =
     result.add pretty(n.wCond, indent + 2)
     result.add pretty(n.wBody, indent + 2)
   of gpuBinOp:
-    result.add idd("Ident", n.bOp)
+    result.add pretty(n.bOp, indent + 2)
     result.add pretty(n.bLeft, indent + 2)
     result.add pretty(n.bRight, indent + 2)
   of gpuVar:
     result.add pretty(n.vName, indent + 2)
     result.add pretty(n.vInit, indent + 2)
     if n.vAttributes.len > 0:
-      result.add id("Attributes")
+      result.add idd("Attributes")
       for attr in n.vAttributes:
         let indent = indent + 2
-        result.add id(attr)
+        result.add idd(attr)
   of gpuAssign:
     result.add pretty(n.aLeft, indent + 2)
     result.add pretty(n.aRight, indent + 2)
@@ -568,7 +706,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string =
     result.add pretty(n.cValue, indent + 2)
   of gpuArrayLit:
     for el in n.aValues:
-      result.add id(el)
+      result.add pretty(el, indent + 2)
   of gpuBlock:
     if n.blockLabel.len > 0:
       result.add id("Label", n.blockLabel)
@@ -586,17 +724,22 @@ proc pretty*(n: GpuAst, indent: int = 0): string =
     result.add id("Op", n.pOp)
     result.add pretty(n.pVal, indent + 2)
   of gpuTypeDef:
-    result.add id("Type", n.tName)
+    result.add id("Type", pretty(n.tTyp))
     result.add id("Fields")
     for t in n.tFields:
       let indent = indent + 2
       result.add id(t.name)
+  of gpuAlias:
+    result.add id("Alias", pretty(n.aTyp))
+    result.add pretty(n.aTo, indent + 2)
   of gpuObjConstr:
-    result.add id("Ident", n.ocName)
-    result.add id("Fields")
+    result.add idd("Ident", pretty(n.ocType))
+    result.add idd("Fields")
     for f in n.ocFields:
-      let indent = indent + 2
-      result.add id("Name", f.name)
+      var indent = indent + 2
+      result.add idd("Field")
+      indent = indent + 2
+      result.add idd("Name", f.name)
       result.add pretty(f.value, indent + 2)
   of gpuInlineAsm:
     result.add id(n.stmt)
@@ -694,6 +837,19 @@ iterator mitems*(ast: var GpuAst): var GpuAst =
 iterator items*(ast: GpuAst): GpuAst =
   iterImpl(ast, mutable = false)
 
+iterator mpairs*(ast: var GpuAst): (int, var GpuAst) =
+  ## Iterate over all child nodes of the given AST and the index
+  var i = 0
+  for el in mitems(ast):
+    yield (i, el)
+    inc i
+
+iterator pairs*(ast: GpuAst): (int, GpuAst) =
+  var i = 0
+  for el in items(ast):
+    yield (i, el)
+    inc i
+
 
 ## General utility helpers
 
@@ -704,9 +860,12 @@ proc ident*(n: GpuAst): string =
   result = n.iName
 
 template withoutSemicolon*(ctx: var GpuContext, body: untyped): untyped =
-  ctx.skipSemicolon = true
-  body
-  ctx.skipSemicolon = false
+  if not ctx.skipSemicolon: # if we are already skipping, leave true
+    ctx.skipSemicolon = true
+    body
+    ctx.skipSemicolon = false
+  else:
+    body
 
 proc getInnerArrayLengths*(t: GpuType): string =
   ## Returns the lengths of the inner array types for a nested array.
diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim
index 45af3782..5373581b 100644
--- a/constantine/math_compiler/experimental/nim_to_gpu.nim
+++ b/constantine/math_compiler/experimental/nim_to_gpu.nim
@@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has
 import ./gpu_types
 import ./backends/backends
 
-proc nimToGpuType(n: NimNode): GpuType
+proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType
 
 proc initGpuType(kind: GpuTypeKind): GpuType =
   ## If `kind` is `gtPtr` `to` must be the type we point to
@@ -20,11 +20,17 @@ proc initGpuType(kind: GpuTypeKind): GpuType =
 
 proc initGpuPtrType(to: GpuType, implicitPtr: bool): GpuType =
   ## If `kind` is `gtPtr` `to` must be the type we point to
-  result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr)
+  if to.kind == gtInvalid: # this is not a valid type
+    result = GpuType(kind: gtInvalid)
+  else:
+    result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr)
 
 proc initGpuUAType(to: GpuType): GpuType =
   ## Initializes a GPU type for an unchecked array (ptr wraps this)
-  result = GpuType(kind: gtUA, uaTo: to)
+  if to.kind == gtInvalid: # this is not a valid type
+    result = GpuType(kind: gtInvalid)
+  else:
+    result = GpuType(kind: gtUA, uaTo: to)
 
 proc initGpuVoidPtr(): GpuType =
   result = GpuType(kind: gtVoidPtr)
@@ -37,6 +43,17 @@ proc initGpuArrayType(aTyp: NimNode, len: int): GpuType =
   ## Construct an statically sized array type
   result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len)
 
+proc toTypeDef(typ: GpuType): GpuAst =
+  ## Converts a given object or generic instantiation type into an AST of a
+  ## corresponding type def.
+  # store the type instantiation
+  result = GpuAst(kind: gpuTypeDef, tTyp: typ)
+  case typ.kind
+  of gtObject:      result.tFields = typ.oFields
+  of gtGenericInst: result.tFields = typ.gFields
+  else:
+    raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation."
+
 proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind =
   case t
   #of ntyBool, ntyChar:
@@ -57,9 +74,62 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind =
   of ntyUInt16: gtUint16
   of ntyUInt32: gtUint32
   of ntyUInt64: gtUint64
+  of ntyString: gtString
   else:
     raiseAssert "Not supported yet: " & $t
 
+proc parseTypeFields(node: NimNode): seq[GpuTypeField]
+
+proc getGenericTypeName(t: NimNode): string =
+  ## Returns the base name of the generic type, i.e. for
+  ## `Foo[Bar, Baz]` returns `Foo`.
+  case t.kind
+  of nnkSym: result = t.strVal
+  of nnkBracketExpr: result = t[0].getGenericTypeName()
+  else: raiseAssert "Unexpected node kind for generic instantiation type: " & $t.treerepr
+
+proc parseGenericArgs(t: NimNode): seq[GpuType] =
+  case t.kind
+  of nnkSym: return # no generic arguments
+  of nnkBracketExpr:
+    for i in 1 ..< t.len:
+      result.add nimToGpuType(t[i])
+  else:
+    raiseAssert "Unexpected node kind in parseGenericArgs: " & $t.treerepr
+
+proc initGpuGenericInst(t: NimNode): GpuType =
+  doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind
+  case t.kind
+  of nnkBracketExpr: # regular generic instantiation
+    result = GpuType(kind: gtGenericInst, gName: getGenericTypeName(t))
+    result.gArgs = parseGenericArgs(t)
+    # now parse the object fields
+    let impl = t.getTypeImpl() # impl for the `gFields`
+    result.gFields = parseTypeFields(impl)
+  of nnkObjConstr:
+    if t.len == 1:   # Generic instantiation without arguments
+      result = initGpuGenericInst(t[0])
+    elif t.len == 2: # ...and with arguments
+      doAssert t[1].kind == nnkExprColonExpr, "ObjConstr does not contain initialization as [1], but: " & $t.treerepr
+      result = initGpuGenericInst(t[0])
+    else:
+      raiseAssert "Unexpected number of elements in `nnkObjConstr` node for generic instantiation: " & $t.treerepr
+  of nnkSym:
+    let impl = getTypeImpl(t)
+    case impl.kind
+    of nnkDistinctTy:
+      ## XXX: assumes distinct of inbuilt type, not object!
+      result = nimToGpuType(impl[0])
+    of nnkObjectTy:
+      doAssert impl.kind == nnkObjectTy, "Unexpected node kind for generic inst: " & $impl.treerepr
+      ## XXX: use signature hash for type name? Otherwise will produce duplicates
+      result = GpuType(kind: gtGenericInst, gName: t.repr)
+      result.gFields = parseTypeFields(impl)
+    else:
+      raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr
+  else:
+    raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr
+
 proc unpackGenericInst(t: NimNode): NimNode =
   let tKind = t.typeKind
   if tKind == ntyGenericInst:
@@ -75,96 +145,201 @@ proc unpackGenericInst(t: NimNode): NimNode =
 proc toGpuTypeKind(t: NimNode): GpuTypeKind =
   result = t.unpackGenericInst().typeKind.toGpuTypeKind()
 
-proc getInnerPointerType(n: NimNode): GpuType =
+proc getInnerPointerType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType =
   doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind
   if n.typeKind in {ntyPointer, ntyUncheckedArray}:
     let typ = n.getTypeInst()
     doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr
     doAssert typ[0].kind in {nnkIdent, nnkSym}
     doAssert typ[0].strVal in ["ptr", "UncheckedArray"]
-    result = nimToGpuType(typ[1])
+    result = nimToGpuType(typ[1], allowToFail, allowArrayIdent)
   elif n.kind == nnkPtrTy:
-    result = nimToGpuType(n[0])
+    result = nimToGpuType(n[0], allowToFail, allowArrayIdent)
   elif n.kind == nnkAddr:
     let typ = n.getTypeInst()
-    result = getInnerPointerType(typ)
+    result = getInnerPointerType(typ, allowToFail, allowArrayIdent)
   elif n.kind == nnkVarTy:
     # VarTy
     #   Sym "BigInt"
-    result = nimToGpuType(n[0])
+    result = nimToGpuType(n[0], allowToFail, allowArrayIdent)
+  elif n.kind == nnkSym: # symbol of e.g. `ntyVar`
+    result = nimToGpuType(n.getTypeInst(), allowToFail, allowArrayIdent)
   else:
     raiseAssert "Found what: " & $n.treerepr
 
-proc determineArrayLength(n: NimNode): int =
+proc determineArrayLength(n: NimNode, allowArrayIdent: bool): int =
+  ## If `allowArrayIdent` is true, we do not emit the error message when
+  ## encountering an ident. This is the case for procs taking arrays
+  ## with a static array where the constant comes from outside the
+  ## macro. In that case we return `-1` indicating
+  ##  `proc mdsRowShfNaive(r: int, v: array[SPONGE_WIDTH, BigInt]): BigInt {.device.} =`
   case n[1].kind
   of nnkSym:
     # likely a constant, try to get its value
     result = n[1].getImpl.intVal
   of nnkIdent:
-    let msg = """Found array with length given by identifier: $#!
+    if not allowArrayIdent:
+      let msg = """Found array with length given by identifier: $#!
 You might want to create a typed template taking a typed parameter for this
-constant to force the Nim compiler to bind the symbol.
+constant to force the Nim compiler to bind the symbol. In theory though this
+error should not appear anymore though, as we don't try to parse generic
+functions.
 """ % n[1].strVal
-    raiseAssert msg
+      raiseAssert msg
+    else:
+      result = -1 # return -1 to indicate caller should look at symbol
   else:
     case n[1].kind
     of nnkIntLit: result = n[1].intVal
     else:
+      # E.g.
+      # BracketExpr
+      #   Sym "array"
+      #   Infix
+      #     Ident ".."
+      #     IntLit 0
+      #     IntLit 11
+      #   Sym "BigInt"
       #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr
       doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr
       doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr
       doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr
       result = n[1][2].intVal + 1
 
-proc getTypeName(n: NimNode): string =
+proc getTypeName(n: NimNode, recursedSym: bool = false): string
+proc constructTupleTypeName(n: NimNode): string =
+  ## XXX: overthink if this should really be here and not somewhere else
+  ##
+  ## Given a tuple, generate a name from the field names and types, e.g.
+  ## `Tuple_lo_BaseType_hi_BaseType`
+  ##
+  ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying
+  ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types.
+  ## UPDATE: I changed the implementation to recurse into `getTypeName`
+  ## TODO: verify that this did not break the tuple test & specifically check for aliases
+  result = "Tuple_"
+  doAssert n.kind in [nnkTupleTy, nnkTupleConstr]
+  for i, ch in n:
+    case ch.kind
+    of nnkIdentDefs:
+      let typName = ch[ch.len - 2].getTypeName() # second to last is type name of field(s)
+      for j in 0 ..< ch.len - 2:
+        # Example:
+        # IdentDefs
+        #   Ident "hi"
+        #   Ident "lo"      `..< ch.len - 2 `
+        #   Sym "BaseType"  `..< ch.len - 1`
+        #   Empty           `..< ch.len`
+        result.add ch[j].strVal & "_" & typName
+        if j < ch.len - 3:
+          result.add "_"
+      if i < n.len - 1:
+        result.add "_"
+    of nnkExprColonExpr:
+      # ExprColonExpr
+      #   Sym "hi"
+      #   Infix
+      #     Sym "shr"
+      #     Sym "n"
+      #     IntLit 16
+      # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)`
+      #    give them a slightly different name
+      let typName = ch[0].getTypeName() ## XXX
+      doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr
+      result.add ch[0].strVal & "_" & typName
+      if i < n.len - 1:
+        result.add "_"
+    of nnkSym:
+      # TupleConstr
+      #   Sym "BaseType" <-- e.g. here
+      #   Sym "BaseType"
+      let typName = ch.getTypeName()
+      result.add "Field" & $i & "_" & typName
+      if i < n.len - 1:
+        result.add "_"
+    else:
+      # TupleConstr      e.g. a tuple constr like this
+      #   Infix
+      #     Sym "shr"
+      #     Sym "n"
+      #     IntLit 16
+      #   Infix
+      #     Sym "and"
+      #     Sym "n"
+      #     UInt32Lit 65535
+      # -> Try again with type impl
+      return constructTupleTypeName(getTypeImpl(n))
+
+proc getTypeName(n: NimNode, recursedSym: bool = false): string =
   ## Returns the name of the type
   case n.kind
-  of nnkIdent, nnkSym: result = n.strVal
+  of nnkIdent: result = n.strVal
+  of nnkSym:
+    if recursedSym:
+      result = n.strVal
+    else:
+      result = n.getTypeInst.getTypeName(true)
   of nnkObjConstr:
     if n[0].kind == nnkEmpty:
       result = n.getTypeInst.strVal
     else:
       result = n[0].strVal # type is the first node
+  of nnkTupleTy, nnkTupleConstr:
+    result = constructTupleTypeName(n)
+  of nnkBracketExpr:
+    # construct a type name `Foo_Bar_Baz`
+    for i, ch in n:
+      result.add ch.getTypeName()
+      if i < n.len - 1:
+        result.add "_"
   else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr
 
-proc parseTypeFields(node: NimNode): seq[GpuTypeField]
-proc nimToGpuType(n: NimNode): GpuType =
+proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType =
   ## Maps a Nim type to a type on the GPU
+  ##
+  ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases
+  ## where we would otherwise raise. This is so that in some cases where
+  ## we only _attempt_ to determine a type, we can do so safely.
   case n.kind
   of nnkIdentDefs: # extract type for let / var based on explicit or implicit type
     if n[n.len - 2].kind != nnkEmpty: # explicit type
-      result = nimToGpuType(n[n.len - 2])
+      result = nimToGpuType(n[n.len - 2], allowToFail, allowArrayIdent)
     else: # take from last element
-      result = nimToGpuType(n[n.len - 1].getTypeInst())
+      result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail, allowArrayIdent)
   of nnkConstDef:
     if n[1].kind != nnkEmpty: # has an explicit type
-      result = nimToGpuType(n[1])
+      result = nimToGpuType(n[1], allowToFail, allowArrayIdent)
     else:
-      result = nimToGpuType(n[2]) # derive from the RHS literal
+      result = nimToGpuType(n[2], allowToFail, allowArrayIdent) # derive from the RHS literal
   else:
     if n.kind == nnkEmpty: return initGpuType(gtVoid)
     case n.typeKind
     of ntyBool, ntyInt .. ntyUint64: # includes all float types
       result = initGpuType(toGpuTypeKind n.typeKind)
+    of ntyString: # only supported on some backends!
+      result = initGpuType(toGpuTypeKind n.typeKind)
     of ntyPtr:
-      result = initGpuPtrType(getInnerPointerType(n), implicitPtr = false)
+      result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false)
     of ntyVar:
-      result = initGpuPtrType(getInnerPointerType(n), implicitPtr = true)
+      result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = true)
     of ntyPointer:
       result = initGpuVoidPtr()
     of ntyUncheckedArray:
       ## Note: this is just the internal type of the array. It is only a pointer due to
       ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part.
-      result = initGpuUAType(getInnerPointerType(n))
-    of ntyObject:
-      let impl = n.getTypeImpl
+      result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent))
+    of ntyObject, ntyAlias, ntyTuple:
+      # for aliases, treat them identical to regular object types, but
+      # `getTypeName` returns the alias!
+      let impl = if n.kind == nnkTupleConstr: n # might actually _lose_ information if used getTypeImpl
+                 else: n.getTypeImpl
       let flds = impl.parseTypeFields()
       let typName = getTypeName(n) # might be an object construction
       result = initGpuObjectType(typName, flds)
     of ntyArray:
       # For a generic, static array type, e.g.:
       if n.kind == nnkSym:
-        return nimToGpuType(getTypeImpl(n))
+        return nimToGpuType(getTypeImpl(n), allowToFail, allowArrayIdent)
       if n.len == 3:
         # BracketExpr
         #   Sym "array"
@@ -172,8 +347,16 @@ proc nimToGpuType(n: NimNode): GpuType =
         #   Sym "uint32"
         doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr
         doAssert n[0].strVal == "array"
-        let len = determineArrayLength(n)
-        result = initGpuArrayType(n[2], len)
+        let len = determineArrayLength(n, allowArrayIdent)
+        if len < 0:
+          # indicates we found an array with an ident, e.g.
+          # BracketExpr
+          #   Sym "array"
+          #   Ident "SPONGE_WIDTH"
+          #   Sym "BigInt"
+          return GpuType(kind: gtInvalid)
+        else:
+          result = initGpuArrayType(n[2], len)
       else:
         # just an array literal
         # Bracket
@@ -184,11 +367,27 @@ proc nimToGpuType(n: NimNode): GpuType =
     #  echo n.getTypeImpl.treerepr
     #  error("o")
     of ntyGenericInvocation:
-      result = initGpuType(gtVoid)
-      error("Generics are not supported in the CUDA DSL so far.")
+      result = initGpuType(gtInvalid)
+      error("Generics are not supported in the CUDA DSL so far.") # Note: this should not appear nowadays
     of ntyGenericInst:
-      result = n.unpackGenericInst().nimToGpuType()
-    else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr
+      result = initGpuGenericInst(n)
+    of ntyTypeDesc:
+      # `getType` returns a `BracketExpr` of eg:
+      # BracketExpr
+      #   Sym "typeDesc"
+      #   Sym "float32"
+      result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it
+    of ntyUnused2:
+      # BracketExpr
+      #   Sym "lent"
+      #   Sym "BigInt"
+      doAssert n.kind == nnkBracketExpr and n[0].strVal == "lent", "ntyUnused2: " & $n.treerepr
+      result = initGpuPtrType(nimToGpuType(n[1]), implicitPtr = false)
+    else:
+      if allowToFail:
+        result = GpuType(kind: gtVoid)
+      else:
+        raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr
 
 proc assignOp(op: string, isBoolean: bool): string =
   ## Returns the correct CUDA operation given the Nim operator.
@@ -210,12 +409,34 @@ proc assignPrefixOp(op: string): string =
   else: result = op
 
 proc parseTypeFields(node: NimNode): seq[GpuTypeField] =
-  doAssert node.kind == nnkObjectTy
-  doAssert node[2].kind == nnkRecList
-  for ch in node[2]:
-    doAssert ch.kind == nnkIdentDefs and ch.len == 3
-    result.add GpuTypeField(name: ch[0].strVal,
-                            typ: nimToGpuType(ch[1]))
+  case node.kind
+  of nnkObjectTy:
+    doAssert node[2].kind == nnkRecList
+    for ch in node[2]:
+      doAssert ch.kind == nnkIdentDefs and ch.len == 3
+      result.add GpuTypeField(name: ch[0].strVal,
+                              typ: nimToGpuType(ch[1]))
+  of nnkTupleTy:
+    for ch in node:
+      doAssert ch.kind == nnkIdentDefs and ch.len == 3
+      result.add GpuTypeField(name: ch[0].strVal,
+                              typ: nimToGpuType(ch[1]))
+  of nnkTupleConstr:
+    # TupleConstr
+    #   Sym "BaseType"
+    #   Sym "BaseType"
+    for i, ch in node:
+      case ch.kind
+      of nnkSym:
+        result.add GpuTypeField(name: "Field" & $i,
+                                typ: nimToGpuType(ch))
+      of nnkExprColonExpr:
+        result.add GpuTypeField(name: ch[0].strVal,
+                                typ: nimToGpuType(ch[1]))
+      else:
+        return parseTypeFields(node.getTypeImpl) # will likely fall back to constr with `nnkSym`
+  else:
+    raiseAssert "Unsupported type to parse fields from: " & $node.kind
 
 template findIdx(col, el): untyped =
   var res = -1
@@ -235,17 +456,37 @@ proc requiresMemcpy(n: NimNode): bool =
   ## At the moment we only emit a `memcpy` statement for array types
   result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy
 
+proc isBuiltIn(n: NimNode): bool =
+  ## Checks if the given proc is a `{.builtin.}` (or if it is a Nim "built in"
+  ## proc that uses `importc`, as we cannot emit those; they _need_ to have a
+  ## WGSL / CUDA equivalent built in)
+  doAssert n.kind in [nnkProcDef, nnkFuncDef], "Argument is not a proc: " & $n.treerepr
+  for pragma in n.pragma:
+    doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr
+    let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma
+    if pragma.strVal in ["builtin", "importc"]:
+      return true
+
 proc collectProcAttributes(n: NimNode): set[GpuAttribute] =
-  doAssert n.kind == nnkPragma
+  doAssert n.kind in [nnkPragma, nnkEmpty]
+  if n.kind == nnkEmpty: return # no pragmas
   for pragma in n:
-    doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr
+    doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr
+    let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma
     case pragma.strVal
     of "device": result.incl attDevice
     of "global": result.incl attGlobal
-    of "forceinline": result.incl attForceInline
-    of "nimonly":
+    of "inline", "forceinline": result.incl attForceInline
+    of "nimonly", "builtin":
       # used to fully ignore functions!
       return
+    of "importc": # encountered if we analyze a proc from outside `cuda` scope
+      return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil`
+    of "varargs": # attached to some builtins, e.g. `printf` on CUDA backend
+      continue
+    of "magic":
+      return
+    of "raises": discard # result.incl attDevice #discard # XXX
     else:
       raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr
 
@@ -270,17 +511,40 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] =
     # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes
     # do not match the CUDA string we need to emit, which is what the string value of
     # the `GpuVarAttribute` enum stores
-    case pragma.strVal
-    of "cuExtern", "extern": result.add atvExtern
+    case pragma.strVal.normalize
+    of "cuextern", "extern": result.add atvExtern
     of "shared": result.add atvShared
     of "private": result.add atvPrivate
     of "volatile": result.add atvVolatile
     of "constant": result.add atvConstant
+    of "noinit": discard # XXX: ignore for now
     else:
       raiseAssert "Unexpected pragma: " & $pragma.treerepr
 
 proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst
 
+proc maybePatchFnName(n: var GpuAst) =
+  ## Patches the function name for names that are not allowed on most backends, but appear
+  ## commonly in Nim (custom operators).
+  ##
+  ## NOTE: I think that the binary operators don't actually appear as a `gpuCall`, but still
+  ## as an infix node, even after sem checking by the Nim compiler.
+  doAssert n.kind == gpuIdent
+  template patch(arg, by: untyped): untyped =
+    arg.iSym = arg.iSym.replace(arg.iName, by)
+    arg.iName = by
+  let name = n.iName
+  case name
+  of "[]":  patch(n, "get")
+  of "[]=": patch(n, "set")
+  of "+":   patch(n, "add")
+  of "-":   patch(n, "sub")
+  of "*":   patch(n, "mul")
+  of "/":   patch(n, "div")
+  else:
+    # leave as is
+    discard
+
 proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst =
   ## Returns the name for the function. Either the symbol name _or_
   ## the `{.cudaName.}` pragma argument.
@@ -320,6 +584,20 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst =
           result = ctx.toGpuAst(n) # if _no_ pragma
       else:
         result = ctx.toGpuAst(n) # if not proc or func
+
+      # possibly patch function names, e.g. custom `[]`, `[]=`, `+` etc operators
+      # (inbuilt won't show up as a function name, but rather as a specific node kind, eg `nnkIndex`
+      result.maybePatchFnName()
+
+      # handle overloads with different signatures
+      if n.strVal in ctx.symChoices:
+        # this is an overload of another function with different signature (not a generic, but
+        # overloads are not allowed in CUDA/WGSL/...). Update `sigTab` entry by using `iSym`
+        # for `iName` field for unique name
+        let id = ctx.sigTab[sig]
+        id.iName = id.iSym
+      else:
+        ctx.symChoices.incl result.iName # store this name in `symChoices`
   else:
     # else we use the str representation (repr for open / closed sym choice nodes)
     result = toAst n.repr
@@ -328,6 +606,206 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst =
     # ctx.sigTab[sig] = result
   result.symbolKind = gsProc # make sure it's a proc
 
+proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType =
+  ## Returns the type from a given Nim node `t` representing a type.
+  ## If that fails due to an identifier in the type, we instead try
+  ## to look up the type from the associated symbol, `n`.
+  result = nimToGpuType(t, allowArrayIdent = true)
+  if result.kind == gtInvalid:
+    # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means
+    # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead
+    result = n.getTypeInst.nimToGpuType()
+
+proc stripPtrOrArrayType(t: GpuType): GpuType =
+  ## Strips any pointer or array type to return any struct / generic instantiation
+  ## it might contain
+  case t.kind
+  of gtPtr:    result = stripPtrOrArrayType t.to
+  of gtUA:     result = stripPtrOrArrayType t.uaTo
+  of gtArray:  result = stripPtrOrArrayType t.aTyp
+  else:        result = t
+
+proc maybeAddType*(ctx: var GpuContext, typ: GpuType) =
+  ## Adds the given type to the table of known types, if it is some kind of
+  ## object type.
+  ##
+  ## XXX: What about aliases and distincts?
+  let typ = typ.stripPtrOrArrayType() # get any underlying type
+  if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types:
+    ctx.types[typ] = toTypeDef(typ)
+
+proc parseProcParameters(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): seq[GpuParam] =
+  ## Returns all parameters of the given procedure from the `params` node
+  ## of type `nnkFormalParams`.
+  doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr
+  for i in 1 ..< params.len:
+    let param = params[i]
+    let numParams = param.len - 2 # 3 if one param, one more for each of same type, example:
+    let typIdx = param.len - 2 # second to last is the type
+    # IdentDefs
+    #   Ident "x"
+    #   Ident "y"
+    #   Ident "res"
+    #   PtrTy
+    #     Ident "float32"   # `param.len - 2`
+    #   Empty               # `param.len - 1`
+    let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1])
+    ctx.maybeAddType(paramType)
+    for i in 0 ..< numParams:
+      var p = ctx.toGpuAst(param[i])
+      let symKind = if attGlobal in attrs: gsGlobalKernelParam
+                    else: gsDeviceKernelParam
+      p.iTyp = paramType     ## Update the type of the symbol
+      p.symbolKind = symKind ## and the symbol kind
+      let param = GpuParam(ident: p, typ: paramType)
+      result.add(param)
+
+proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType =
+  ## Returns the return type of the given procedure from the `params` node
+  ## of type `nnkFormalParams`.
+  doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr
+  let retType = params[0] # arg 0 is return type
+  if retType.kind == nnkEmpty:
+    result = GpuType(kind: gtVoid) # actual void return
+  else:
+    # attempt to get type. If fails, we need to wait for a caller to this function to get types
+    # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside
+    # the macro. We then rely on our generics logic to later look this up when called
+    result = nimToGpuType(retType, allowArrayIdent = true)
+    if result.kind == gtVoid: # stop parsing this function
+      result = GpuType(kind: gtInvalid)
+  ctx.maybeAddType(result)
+
+proc toGpuProcSignature(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): GpuProcSignature =
+  ## Creates a `GpuProcSignature` from the given `params` node of type `nnkFormalParams`
+
+  ##
+  ## NOTE: This procedure is only called from generically instantiated procs. Therefore,
+  ## we shouldn't need to worry about getting `gtInvalid` return types here.
+  doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr
+  result = GpuProcSignature(params: ctx.parseProcParameters(params, attrs),
+                            retType: ctx.parseProcReturnType(params))
+
+proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) =
+  ## Looks up the implementation of the given function and stores it in our table
+  ## of generic instantiations.
+  ##
+  ## For any looked up procedure, we attach the `{.device.}` pragma.
+  ##
+  ## Mutates the `name` of the given function to match its generic name.
+  # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature
+  # Only the latter contains e.g. correct instantiation of static array sizes
+  let inst = node[0].getImpl()
+  let sig = node[0].getTypeInst()
+  inst.params = sig.params # copy over the parameters
+
+  # turn the signature into a `GpuProcSignature`
+  let attrs = collectProcAttributes(inst.pragma)
+  let procSig = ctx.toGpuProcSignature(sig.params, attrs)
+  if name in ctx.processedProcs:
+    return
+  else:
+    # Need to add isym here so that if we have recursive calls, we don't end up
+    # calling `toGpuAst` recursively forever
+    ctx.processedProcs[name] = procSig
+
+  let fn = ctx.toGpuAst(inst)
+  if fn.kind == gpuVoid:
+    # Should be an inbuilt proc, i.e. annotated with `{.builtin.}`. However,
+    # functions that are available otherwise (e.g. in Nim's system like `abs`)
+    # in Nim _and_ backends will also show up here. Unless we wanted to manually
+    # wrap all of these, we can just skip the `isBuiltin` check here.
+    # If the user uses something not available in the backend, they'll get a
+    # compiler error from that compiler.
+    # It's mostly a matter of usability: For common procs like `abs` we cannot
+    # so easily define a custom overload `proc abs(...): ... {.builtin.}`, because
+    # that would overwrite the Nim version.
+    # doAssert inst.isBuiltIn()
+    return
+  else:
+    fn.pAttributes.incl attDevice # make sure this is interpreted as a device function
+    doAssert fn.pName.iSym == name.iSym, "Not matching"
+    # now overwrite the identifier's `iName` field by its `iSym` so that different
+    # generic insts have different
+    fn.pName.iName = fn.pName.iSym
+    name.iName = fn.pName.iSym ## update the name of the called function
+    ctx.genericInsts[fn.pName] = fn
+
+proc isExpression(n: GpuAst): bool =
+  ## Returns whether the given AST node is an expression
+  case n.kind
+  of gpuCall: # only if it returns something!
+    result = n.cIsExpr
+  of gpuBinOp, gpuIdent, gpuLit, gpuArrayLit, gpuPrefix, gpuDot, gpuIndex, gpuObjConstr,
+     gpuAddr, gpuDeref, gpuConv, gpuCast, gpuConstExpr:
+    result = true
+  else:
+    result = false
+
+proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) =
+  ## Will insert a `gpuVar` for the implicit `result` variable, unless there
+  ## is a user defined `var result` that shadows it at the top level of the proc
+  ## body.
+  ##
+  ## Finally adds a `return result` statement if
+  ## - we add a `result` variable
+  ## - there is no `return` statement as the _last_ statement in the proc
+  if retType.kind == gtVoid: return # nothing to do if the proc returns nothing
+
+  proc hasCustomResult(n: GpuAst): bool =
+    doAssert n.kind == gpuBlock
+    for ch in n: # iterate all top level statements in the proc body
+      case ch.kind
+      of gpuVar:
+        if ch.vName.ident() == "result":
+          ## XXX: could maybe consider to emit a CT warning that `result` shadows the implicit
+          ## result variable
+          echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`."
+          return true
+      of gpuBlock: # need to look at `gpuBlock` from top level, because variables are defined in a block
+        result = result or hasCustomResult(ch)
+      else:
+        discard
+
+  proc lastIsReturn(n: GpuAst): bool =
+    doAssert n.kind == gpuBlock
+    if n.statements[^1].kind == gpuReturn: return true
+
+  if not hasCustomResult(ast) and not lastIsReturn(ast):
+    # insert `gpuVar` as the *first* statement
+    let resId = GpuAst(kind: gpuIdent, iName: "result",
+                       iSym: "result",
+                       iTyp: retType,
+                       symbolKind: gsLocal)
+    let res = GpuAst(kind: gpuVar, vName: resId,
+                     vType: retType,
+                     vInit: GpuAst(kind: gpuVoid), # no initialization
+                     vRequiresMemcpy: false,
+                     vMutable: true)
+    ast.statements.insert(res, 0)
+    # NOTE: The compiler rewrites expressions at the end of a `proc` into
+    # an assignment to `block: result = <expression>` for us.
+    if not lastIsReturn(ast):
+      # insert `return result`
+      ast.statements.add GpuAst(kind: gpuReturn, rValue: resId)
+
+proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool =
+  ## Returns true if the given `fn` (gpuIdent) returns a value.
+  ## The function can either be:
+  ## - an inbuilt function
+  ## - a generic instantiation
+  ## - contained in `allFnTab`
+  if fn in ctx.allFnTab:
+    result = ctx.allFnTab[fn].pRetType.kind != gtVoid
+  elif fn in ctx.genericInsts:
+    result = ctx.genericInsts[fn].pRetType.kind != gtVoid
+  elif fn in ctx.builtins:
+    result = ctx.builtins[fn].pRetType.kind != gtVoid
+  elif fn in ctx.processedProcs:
+    result = ctx.processedProcs[fn].retType.kind != gtVoid
+  else:
+    raiseAssert "The function: " & $fn & " is not known anywhere."
+
 proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
   ## XXX: things still left to do:
   ## - support `result` variable? Currently not supported. Maybe we will won't
@@ -353,9 +831,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
                     blockLabel: blockLabel)
     for i in 1 ..< node.len: # index 0 is the block label
       result.statements.add ctx.toGpuAst(node[i])
+  of nnkBlockExpr:
+    ## XXX: For CUDA just a block?
+    let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal
+                     elif node[0].kind == nnkEmpty: ""
+                     else: raiseAssert "Unexpected node in block label field: " & $node.treerepr
+    result = GpuAst(kind: gpuBlock, blockLabel: blockLabel, isExpr: true)
+    for el in node:
+      if el.kind != nnkEmpty:
+        result.statements.add ctx.toGpuAst(el)
   of nnkStmtListExpr: # for statements that return a value.
     ## XXX: For CUDA just a block?
-    result = GpuAst(kind: gpuBlock)
+    result = GpuAst(kind: gpuBlock, isExpr: true)
     for el in node:
       if el.kind != nnkEmpty:
         result.statements.add ctx.toGpuAst(el)
@@ -364,42 +851,43 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
     result = ctx.toGpuAst(node[0])
 
   of nnkProcDef, nnkFuncDef:
-    result = GpuAst(kind: gpuProc)
-    result.pName = ctx.toGpuAst(node.name)
-    result.pName.symbolKind = gsProc ## This is a procedure identifier
-    doAssert node[3].kind == nnkFormalParams
-    result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type
-    # Process pragmas
-    if node.pragma.kind != nnkEmpty:
-      doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?"
-      result.pAttributes = collectProcAttributes(node.pragma)
-      if result.pAttributes.len == 0: # means `nimonly` was applied
+    # if it is a _generic_ function, we don't actually process it here. instead we add it to
+    # the `generics` set. When we encounter a `gpuCall` we will then check if the function
+    # being called is part of the generic set and look up its _instantiated_ implementation
+    # to parse it. The parsed generics are stored in the `genericInsts` table.
+    let name = ctx.getFnName(node.name)
+    if node[2].kind == nnkGenericParams: # is a generic
+      ctx.generics.incl name.iName # need to use raw name, *not* symbol
+      result = GpuAst(kind: gpuVoid)
+    elif node.body.kind == nnkEmpty: # just a forward declaration
+      result = GpuAst(kind: gpuVoid)
+    else:
+      result = GpuAst(kind: gpuProc)
+      result.pName = name
+      result.pName.symbolKind = gsProc ## This is a procedure identifier
+      let params = node[3]
+      doAssert params.kind == nnkFormalParams
+      result.pRetType = ctx.parseProcReturnType(params)
+      if result.pRetType.kind == gtInvalid:
+        ctx.generics.incl name.iName # need to use raw name, *not* symbol
         return GpuAst(kind: gpuVoid)
-    # Process parameters
-    for i in 1 ..< node[3].len:
-      let param = node[3][i]
-      let numParams = param.len - 2 # 3 if one param, one more for each of same type, example:
-      let typIdx = param.len - 2 # second to last is the type
-      # IdentDefs
-      #   Ident "x"
-      #   Ident "y"
-      #   Ident "res"
-      #   PtrTy
-      #     Ident "float32"   # `param.len - 2`
-      #   Empty               # `param.len - 1`
-      let paramType = nimToGpuType(param[typIdx])
-      #echo "Argument: ", param.treerepr, " has tpye: ", paramType
-      for i in 0 ..< numParams:
-        var p = ctx.toGpuAst(param[i])
-        let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam
-                      else: gsDeviceKernelParam
-        p.iTyp = paramType     ## Update the type of the symbol
-        p.symbolKind = symKind ## and the symbol kind
-        let param = GpuParam(ident: p, typ: paramType)
-        result.pParams.add(param)
-
-    result.pBody = ctx.toGpuAst(node.body)
-      .ensureBlock() # single line procs should be a block to generate `;`
+
+      # Process pragmas
+      if node.pragma.kind != nnkEmpty:
+        doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?"
+        result.pAttributes = collectProcAttributes(node.pragma)
+        if result.pAttributes.len == 0: # means `nimonly` was applied / is a `builtin`
+          ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called
+          return GpuAst(kind: gpuVoid)
+      # Process parameters
+      result.pParams = ctx.parseProcParameters(params, result.pAttributes)
+      result.pBody = ctx.toGpuAst(node.body)
+        .ensureBlock() # single line procs should be a block to generate `;`
+      result.pBody.maybeInsertResult(result.pRetType, result.pName.ident())
+
+      # Add to table of known functions
+      if result.pName notin ctx.allFnTab:
+        ctx.allFnTab[result.pName] = result
 
   of nnkLetSection, nnkVarSection:
     # For a section with multiple declarations, create a block
@@ -426,7 +914,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
         doAssert declaration[0][1].kind == nnkPragma
         varNode.vAttributes = collectAttributes(declaration[0][1])
       else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr
-      varNode.vType = nimToGpuType(declaration)
+      varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0])
+      ctx.maybeAddType(varNode.vType)
       varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later
       # This is a *local* variable (i.e. `function` address space on WGSL) unless it is
       # annotated with `{.shared.}` (-> `workspace` in WGSL)
@@ -489,6 +978,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
     ## NOTE: Currently we process templates, but we expect them to be already
     ## expanded by the Nim compiler. Thus we could in theory expand them manually
     ## but fortunately we don't need to.
+    return GpuAst(kind: gpuVoid)
     let tName = node[0].strVal
 
     # Extract parameters
@@ -508,8 +998,14 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
     result = GpuAst(kind: gpuVoid)
 
   of nnkCall, nnkCommand:
-    # Check if this is a template call
+    # `name` below is name + signature hash. Check if this is a generic based on node repr
     let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice
+    if node[0].repr in ctx.generics or name notin ctx.allFnTab:
+      # process the generic instantiaton and store *or* pull in a proc defined outside
+      # the `cuda` macro by its implementation.
+      ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`!
+      ctx.addProcToGenericInsts(node, name)
+
     let args = node[1..^1].mapIt(ctx.toGpuAst(it))
     # Producing a template call something like this (but problematic due to overloads etc)
     # we could then perform manual replacement of the template in the CUDA generation pass.
@@ -518,26 +1014,60 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
       result.tcName = name
       result.tcArgs = args
     else:
-      result = GpuAst(kind: gpuCall)
+      let fnIsExpr = ctx.fnReturnsValue(name)
+      result = GpuAst(kind: gpuCall, cIsExpr: fnIsExpr)
       result.cName = name
       result.cArgs = args
 
   of nnkInfix:
     result = GpuAst(kind: gpuBinOp)
-    # if left/right is boolean we need logical AND/OR, otherwise
-    # bitwise
-    let isBoolean = node[1].typeKind == ntyBool
-    result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name
-    result.bLeft = ctx.toGpuAst(node[1])
-    result.bRight = ctx.toGpuAst(node[2])
-    # We patch the types of int / float literals. WGSL does not automatically convert literals
-    # to the target type.
-    if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit:
-      # determine literal type based on `bRight`
-      result.bLeft.lType = nimToGpuType(node[2])
-    elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit:
-      # determine literal type based on `bLeft`
-      result.bRight.lType = nimToGpuType(node[1])
+    # Using `getType` to get the types of the arguuments
+    let typ = node[0].getTypeImpl() # e.g.
+    doAssert typ.kind == nnkProcTy, "Infix node is not a proc but: " & $typ.treerepr
+    # BracketExpr
+    #   Sym "proc"
+    #   Sym "int"  <- return type
+    #   Sym "int"  <- left op type
+    #   Sym "int"  <- right op type
+    result.bLeftTyp = nimToGpuType(typ[0][1])
+    result.bRightTyp = nimToGpuType(typ[0][2])
+    # if either is not a base type (`gtBool .. gtSize_t`) we actually deal with a _function call_
+    # instead of an binary operation. Will thus rewrite.
+    proc ofBasicType(t: GpuType, allowPtrLhs: bool): bool =
+      ## Determines if the given type is a basic POD type *or* a simple pointer to it.
+      ## This is because some infix nodes, e.g. `x += y` will have LHS arguments that are
+      ## `var T`, which appear as an implicit pointer here.
+      ##
+      ## TODO: Handle the case of backend inbuilt special types (like `vec3`), which may indeed
+      ## have inbuilt infix operators. Either by checking if the type has a `{.builtin.}` pragma
+      ## _or_ if there is a wrapped proc for this operator and if so do not rewrite as `gpuCall`
+      ## if that exists.
+      result = (t.kind in gtBool .. gtSize_t)
+      if allowPtrLhs:
+        result = result or ((t.kind == gtPtr) and t.implicit and t.to.kind in gtBool .. gtSize_t)
+
+    if not result.bLeftTyp.ofBasicType(true) or not result.bRightTyp.ofBasicType(false):
+      result = GpuAst(kind: gpuCall)
+      result.cName = ctx.getFnName(node[0])
+      result.cArgs = @[ctx.toGpuAst(node[1]), ctx.toGpuAst(node[2])]
+    else:
+      # if left/right is boolean we need logical AND/OR, otherwise bitwise
+      let isBoolean = result.bLeftTyp.kind == gtBool
+      var op = GpuAst(kind: gpuIdent, iName: assignOp(node[0].repr, isBoolean)) # repr so that open sym choice gets correct name
+      op.iSym = op.iName
+      result.bOp = op
+      result.bLeft = ctx.toGpuAst(node[1])
+      result.bRight = ctx.toGpuAst(node[2])
+
+      # We patch the types of int / float literals. WGSL does not automatically convert literals
+      # to the target type. Determining the type here _can_ fail. In that case the
+      # `lType` field will just be `gtVoid`, like the default.
+      if result.bLeft.kind == gpuLit: # and result.bRight.kind != gpuLit:
+        # determine literal type based on `bRight`
+        result.bLeft.lType = result.bLeftTyp # nimToGpuType(node[2], allowToFail = true)
+      elif result.bRight.kind == gpuLit: # and result.bLeft.kind != gpuLit:
+        # determine literal type based on `bLeft`
+        result.bRight.lType = result.bRightTyp #nimToGpuType(node[1], allowToFail = true)
 
   of nnkDotExpr:
     ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT
@@ -547,9 +1077,22 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
     result.dField = ctx.toGpuAst(node[1])
 
   of nnkBracketExpr:
-    result = GpuAst(kind: gpuIndex)
-    result.iArr = ctx.toGpuAst(node[0])
-    result.iIndex = ctx.toGpuAst(node[1])
+    case node[0].typeKind
+    of ntyTuple:
+      # need to replace `[idx]` by field access
+      let typ = nimToGpuType(node[0].getTypeImpl)
+      ctx.maybeAddType(typ)
+      #doAssert typ in ctx.types
+      doAssert node[1].kind == nnkIntLit
+      let idx = node[1].intVal
+      let field = typ.oFields[idx].name
+      result = GpuAst(kind: gpuDot,
+                      dParent: ctx.toGpuAst(node[0]),
+                      dField: ctx.toGpuAst(ident(field)))
+    else:
+      result = GpuAst(kind: gpuIndex)
+      result.iArr = ctx.toGpuAst(node[0])
+      result.iIndex = ctx.toGpuAst(node[1])
 
   of nnkIdent, nnkOpenSymChoice:
     result = newGpuIdent()
@@ -560,7 +1103,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
   of nnkSym:
     let s = node.repr & "_" & node.signatureHash()
     # NOTE: The reason we have a tab of known symbols is not to keep the same _reference_ to each
-    # symbol, but rather to allow having the same symbol kind (set in the caller of this call).
+    # symbol, but rather to allow having the same symbol kind and appropriate type for each
+    # symbol `GpuAst` (of kind `gpuIdent`), which is set in the caller of this call.
     # For example in `nnkCall` nodes returning the value from the table automatically means the
     # `symbolKind` is local / function argument etc.
     if s notin ctx.sigTab:
@@ -570,7 +1114,11 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
       if result.iName == "_":
         result.iName = "tmp_" & $ctx.genSymCount
         inc ctx.genSymCount
-        #ctx.sigTab[s] = result
+      elif result.iName.startsWith("tmpTuple_"): # will have a Nim gensym'd suffix, replace by custom counter
+        result.iName = "tmpTuple_" & $ctx.genSymCount
+        result.iSym = result.iName & "_" & node.signatureHash() # and update the iSym to not be based on Nim's value either
+        inc ctx.genSymCount
+      ctx.sigTab[s] = result
     else:
       result = ctx.sigTab[s]
 
@@ -635,28 +1183,90 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
       doAssert el.kind == nnkTypeDef
       result.statements.add ctx.toGpuAst(el)
   of nnkTypeDef:
-    result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal)
-    result.tFields = parseTypeFields(node[2])
+    doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len
+    let name = ctx.toGpuAst(node[0])
+    if node[1].kind == nnkGenericParams: # if this is a generic, only store existence of it
+                                         # will store the instantiatons in `nnkObjConstr`
+      result = GpuAst(kind: gpuVoid)
+    else:
+      let typ = nimToGpuType(node[0])
+      case node[2].kind
+      of nnkObjectTy: # regular `type foo = object`
+        result = GpuAst(kind: gpuTypeDef, tTyp: typ)
+        result.tFields = parseTypeFields(node[2])
+      of nnkSym:      # a type alias `type foo = bar`
+        result = GpuAst(kind: gpuAlias, aTyp: typ,
+                        aTo: ctx.toGpuAst(node[2]))
+      else:
+        raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind
+
+      # include this the set of known types to not generate duplicates
+      ctx.types[typ] = result
+      # Reset the type we return to void. We now generate _all_ types from the
+      # `types`.
+      result = GpuAst(kind: gpuVoid)
   of nnkObjConstr:
-    let typName = getTypeName(node)
-    result = GpuAst(kind: gpuObjConstr, ocName: typName)
+    ## this should never see `genericParam` I think
+    let typ = nimToGpuType(node)
+    ctx.maybeAddType(typ)
+    result = GpuAst(kind: gpuObjConstr, ocType: typ)
     # get all fields of the type
-    let flds = node[0].getTypeImpl.parseTypeFields() # sym
+    let flds = if typ.kind == gtObject: typ.oFields
+               elif typ.kind == gtGenericInst: typ.gFields
+               else: raiseAssert "ObjConstr must have an object type: " & $typ
     # find all fields that have been defined by the user
     var ocFields: seq[GpuFieldInit]
     for i in 1 ..< node.len: # all fields to be init'd
       doAssert node[i].kind == nnkExprColonExpr
       ocFields.add GpuFieldInit(name: node[i][0].strVal,
-                                value: ctx.toGpuAst(node[i][1]))
+                                value: ctx.toGpuAst(node[i][1]),
+                                typ: GpuType(kind: gtVoid))
+
     # now add fields in order of the type declaration
     for i in 0 ..< flds.len:
       let idx = findIdx(ocFields, flds[i].name)
       if idx >= 0:
-        result.ocFields.add ocFields[idx]
+        var f = ocFields[idx]
+        f.typ = flds[i].typ
+        result.ocFields.add f
       else:
         let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid))
         result.ocFields.add GpuFieldInit(name: flds[i].name,
-                                         value: dfl)
+                                         value: dfl,
+                                         typ: flds[i].typ)
+  of nnkTupleConstr:
+    let typ = nimToGpuType(node)
+    ctx.maybeAddType(typ)
+
+    result = GpuAst(kind: gpuObjConstr, ocType: typ)
+    # get all fields of the type
+    let flds = typ.oFields
+    # find all fields that have been defined by the user
+    var ocFields: seq[GpuFieldInit]
+    for i in 0 ..< node.len: # all fields to be init'd
+      case node[i].kind
+      of nnkExprColonExpr:
+        ocFields.add GpuFieldInit(name: node[i][0].strVal,
+                                  value: ctx.toGpuAst(node[i][1]),
+                                  typ: GpuType(kind: gtVoid))
+      else:
+        ocFields.add GpuFieldInit(name: "Field" & $i,
+                                  value: ctx.toGpuAst(node[i]),
+                                  typ: GpuType(kind: gtVoid))
+
+    # now add fields in order of the type declaration
+    for i in 0 ..< flds.len:
+      let idx = findIdx(ocFields, flds[i].name)
+      if idx >= 0:
+        var f = ocFields[idx]
+        f.typ = flds[i].typ
+        result.ocFields.add f
+      else:
+        let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid))
+        result.ocFields.add GpuFieldInit(name: flds[i].name,
+                                         value: dfl,
+                                         typ: flds[i].typ)
+
 
   of nnkAsmStmt:
     doAssert node.len == 2
@@ -666,10 +1276,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
 
   of nnkBracket:
     let aLitTyp = nimToGpuType(node[0])
-    var aValues = newSeq[string]()
+    var aValues = newSeq[GpuAst]()
     for el in node:
-      ## XXX: Support not just int literals
-      aValues.add $el.intVal
+      aValues.add ctx.toGpuAst(el)
     result = GpuAst(kind: gpuArrayLit,
                     aValues: aValues,
                     aLitType: aLitTyp)
@@ -691,27 +1300,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
     # `HiddenAddr` appears for accesses to `var` passed arguments
     result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0]))
 
-  of nnkHiddenDeref:
-    case node.typeKind
-    of ntyUncheckedArray:
-      # `getTypeInst(node)` would yield:
-      # BracketExpr
-      #   Sym "UncheckedArray"
-      #   Sym "uint32"
-      # i.e. it is a `ptr UncheckedArray[T]`
-      # In this case we just ignore the deref, because on the CUDA
-      # side it is just a plain pointer array we index into using
-      # `foo[i]`.
-      result = ctx.toGpuAst(node[0])
-    else:
-      # Otherwise we treat it like a regular deref
-      # HiddenDeref
-      #   Sym "x"
-      # With e.g. `getTypeInst(node) = Sym "BigInt"`
-      # and `node.typeKind = ntyObject`
-      # due to a `var` parameter
-      result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0]))
-  of nnkDerefExpr: #, nnkHiddenDeref:
+  of nnkDerefExpr, nnkHiddenDeref:
+    # treat hidden and regular deref the same nowadays. On some backends may strip derefs, if
+    # they appear e.g. in an `gpuIndex` (CUDA)
     result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0]))
 
   of nnkConstDef:
@@ -730,6 +1321,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst =
       doAssert el.kind == nnkConstDef
       result.statements.add ctx.toGpuAst(el)
 
+  of nnkWhenStmt:
+    raiseAssert "We shouldn't be seeing a `when` statement after sem check of the Nim code."
   else:
     echo "Unhandled node kind in toGpuAst: ", node.kind
     raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr
diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim
index dcfde9c1..95928e2b 100644
--- a/constantine/math_compiler/experimental/runtime_compile.nim
+++ b/constantine/math_compiler/experimental/runtime_compile.nim
@@ -86,15 +86,17 @@ proc log*(nvrtc: var NVRTC) =
 proc compile*(nvrtc: var NVRTC) =
   # Compile the program with fmad disabled.
   # Note: Can specify GPU target architecture explicitly with '-arch' flag.
-  const
-    Options = [
-      cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is
-      # "--fmad=false", # and whatever other options for example
-    ]
-
-    NumberOfOptions = cint Options.len
-  let compileResult =  nvrtcCompileProgram(nvrtc.prog, NumberOfOptions,
-                                           cast[cstringArray](addr Options[0]))
+  var options = @[
+    cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is
+    # "--fmad=false", # and whatever other options for example
+  ]
+  when defined(debugCuda):
+    options.add cstring "--device-debug"       # Equivalent to -g
+    options.add cstring "--generate-line-info" # Equivalent to -lineinfo
+
+  let numberOfOptions = cint options.len
+  let compileResult =  nvrtcCompileProgram(nvrtc.prog, numberOfOptions,
+                                           cast[cstringArray](addr options[0]))
 
   nvrtc.log()
   ## XXX: only in `DebugCuda`?
diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim
index 8e22f6ef..cfc8d2c9 100644
--- a/constantine/platforms/abis/nvidia_abi.nim
+++ b/constantine/platforms/abis/nvidia_abi.nim
@@ -855,11 +855,11 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi
 {.pop.}
 
 proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult {.v2.}
+proc cuCtxDestroy*(ctx: CUcontext): CUresult {.v2.}
 proc cuCtxSynchronize*(ctx: CUcontext): CUresult {.v2.}
 
 {.push noconv, importc, dynlib: libCuda.}
 
-proc cuCtxDestroy*(ctx: CUcontext): CUresult
 proc cuCtxSynchronize*(): CUresult
 proc cuCtxGetCurrent*(ctx: var CUcontext): CUresult
 proc cuCtxSetCurrent*(ctx: CUcontext): CUresult
@@ -875,7 +875,6 @@ proc cuModuleUnload*(module: CUmodule): CUresult
 proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.}
 proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult
 proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult
-proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult
 
 proc cuLaunchKernel*(
        kernel: CUfunction,
@@ -889,6 +888,8 @@ proc cuLaunchKernel*(
 
 {.pop.} # {.push noconv, importc, dynlib: "libcuda.so"..}
 
+proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult {.v2.}
+
 proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult {.v2.}
 proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult {.v1.}
 proc cuMemFree*(devptr: CUdeviceptr): CUresult {.v2.}