From ca7c920d162fe85d0785cb74e86d5d45f51dd4f6 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:52:22 +0200 Subject: [PATCH 01/87] handle `nnkSym` in `getInnerPointerType` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 45af3782..5157fec7 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -92,6 +92,8 @@ proc getInnerPointerType(n: NimNode): GpuType = # VarTy # Sym "BigInt" result = nimToGpuType(n[0]) + elif n.kind == nnkSym: # symbol of e.g. `ntyVar` + result = nimToGpuType(n.getTypeInst()) else: raiseAssert "Found what: " & $n.treerepr From 561328eb469f68d8fcbc6ecc3880b4d24814a016 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:52:33 +0200 Subject: [PATCH 02/87] fix `constructPtrSignature` after change from nil -> gtVoid for iTyp The type of the identifier is now always `gtVoid`, so the previous check `not idTyp.isNil` does not work anylonger. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 0b6552bd..2ff1e336 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -56,7 +56,7 @@ proc fromAddressSpace(addrSpace: AddressSpace): GpuSymbolKind = proc constructPtrSignature(addrSpace: AddressSpace, idTyp: GpuType, ptrStr, typStr: string): string = ## Constructs the `ptr` string, which only includes ## the RW string if the address space is `storage` - let rw = if not idTyp.isNil: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) + let rw = if idTyp.kind != gtVoid: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) let rwStr = if rw: "read_write" else: "read" case addrSpace of asStorage: result = &"{ptrStr}<{addrSpace}, {typStr}, {rwStr}>" From d7de955815f05832b9146a1b6777505a765e7314 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 12:53:34 +0200 Subject: [PATCH 03/87] handle `gpuCast` in determineSymKind/Mutability/Ident --- constantine/math_compiler/experimental/backends/wgsl.nim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2ff1e336..26edc9c1 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -205,6 +205,7 @@ proc determineSymKind(arg: GpuAst): GpuSymbolKind = of gpuBlock: arg.statements[^1].determineSymKind() # look at last element of gpuPrefix: gsLocal # equivalent to constructing a local var of gpuConv: gsLocal # a converted value will be a local var + of gpuCast: arg.cExpr.determineSymKind() # symbol kind of the 
thing we cast else: raiseAssert "Not implemented to determine symbol kind from node: " & $arg @@ -225,6 +226,7 @@ proc determineMutability(arg: GpuAst): bool = of gpuBlock: arg.statements[^1].determineMutability() # look at last element of gpuPrefix: false # equivalent to constructing a local var of gpuConv: false # a converted value will be immutable + of gpuCast: arg.cExpr.determineMutability() # mutability of the thing we cast else: raiseAssert "Not implemented to determine mutability from node: " & $arg @@ -250,8 +252,9 @@ proc determineIdent(arg: GpuAst): GpuAst = of gpuBlock: arg.statements[^1].determineIdent() of gpuPrefix: dfl() of gpuConv: dfl() + of gpuCast: arg.cExpr.determineIdent() # ident of the thing we cast else: - raiseAssert "Not implemented to determine mutability from node: " & $arg + raiseAssert "Not implemented to determine ident from node: " & $arg proc getGenericArguments(args: seq[GpuAst], params: seq[GpuParam], callerParams: Table[string, GpuParam]): seq[GenericArg] = ## If an argument is not a ptr argument in the original function (`params`) then From 4277fcf9bc999e0902aa516354b3aeae8f94e365 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 14:16:03 +0200 Subject: [PATCH 04/87] allow determination of GPU type to fail in `nimToGpuType` This is only for the optional helper used on the WebGPU backend, where we try to determine the type in an infix expression. However, for some arguments to the infix this is not uniquely possible. I.e. we might encounter `SomeInteger`, which is not a unique type. In this case we just fall back to not assigning a known type. 
--- .../math_compiler/experimental/nim_to_gpu.nim | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 5157fec7..2e8a42d3 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has import ./gpu_types import ./backends/backends -proc nimToGpuType(n: NimNode): GpuType +proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType proc initGpuType(kind: GpuTypeKind): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to @@ -75,25 +75,25 @@ proc unpackGenericInst(t: NimNode): NimNode = proc toGpuTypeKind(t: NimNode): GpuTypeKind = result = t.unpackGenericInst().typeKind.toGpuTypeKind() -proc getInnerPointerType(n: NimNode): GpuType = +proc getInnerPointerType(n: NimNode, allowToFail: bool = false): GpuType = doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind if n.typeKind in {ntyPointer, ntyUncheckedArray}: let typ = n.getTypeInst() doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr doAssert typ[0].kind in {nnkIdent, nnkSym} doAssert typ[0].strVal in ["ptr", "UncheckedArray"] - result = nimToGpuType(typ[1]) + result = nimToGpuType(typ[1], allowToFail) elif n.kind == nnkPtrTy: - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail) elif n.kind == nnkAddr: let typ = n.getTypeInst() - result = getInnerPointerType(typ) + result = getInnerPointerType(typ, allowToFail) elif n.kind == nnkVarTy: # VarTy # Sym "BigInt" - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail) elif n.kind == nnkSym: # symbol of e.g. 
`ntyVar` - result = nimToGpuType(n.getTypeInst()) + result = nimToGpuType(n.getTypeInst(), allowToFail) else: raiseAssert "Found what: " & $n.treerepr @@ -130,34 +130,38 @@ proc getTypeName(n: NimNode): string = else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc parseTypeFields(node: NimNode): seq[GpuTypeField] -proc nimToGpuType(n: NimNode): GpuType = +proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Maps a Nim type to a type on the GPU + ## + ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases + ## where we would otherwise raise. This is so that in some cases where + ## we only _attempt_ to determine a type, we can do so safely. case n.kind of nnkIdentDefs: # extract type for let / var based on explicit or implicit type if n[n.len - 2].kind != nnkEmpty: # explicit type - result = nimToGpuType(n[n.len - 2]) + result = nimToGpuType(n[n.len - 2], allowToFail) else: # take from last element - result = nimToGpuType(n[n.len - 1].getTypeInst()) + result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail) of nnkConstDef: if n[1].kind != nnkEmpty: # has an explicit type - result = nimToGpuType(n[1]) + result = nimToGpuType(n[1], allowToFail) else: - result = nimToGpuType(n[2]) # derive from the RHS literal + result = nimToGpuType(n[2], allowToFail) # derive from the RHS literal else: if n.kind == nnkEmpty: return initGpuType(gtVoid) case n.typeKind of ntyBool, ntyInt .. ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = false) + result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = false) of ntyVar: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = true) + result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = true) of ntyPointer: result = initGpuVoidPtr() of ntyUncheckedArray: ## Note: this is just the internal type of the array. 
It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - result = initGpuUAType(getInnerPointerType(n)) + result = initGpuUAType(getInnerPointerType(n, allowToFail)) of ntyObject: let impl = n.getTypeImpl let flds = impl.parseTypeFields() @@ -166,7 +170,7 @@ proc nimToGpuType(n: NimNode): GpuType = of ntyArray: # For a generic, static array type, e.g.: if n.kind == nnkSym: - return nimToGpuType(getTypeImpl(n)) + return nimToGpuType(getTypeImpl(n), allowToFail) if n.len == 3: # BracketExpr # Sym "array" @@ -189,8 +193,12 @@ proc nimToGpuType(n: NimNode): GpuType = result = initGpuType(gtVoid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: - result = n.unpackGenericInst().nimToGpuType() - else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr + result = n.unpackGenericInst().nimToGpuType(allowToFail) + else: + if allowToFail: + result = GpuType(kind: gtVoid) + else: + raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr proc assignOp(op: string, isBoolean: bool): string = ## Returns the correct CUDA operation given the Nim operator. @@ -533,13 +541,14 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.bLeft = ctx.toGpuAst(node[1]) result.bRight = ctx.toGpuAst(node[2]) # We patch the types of int / float literals. WGSL does not automatically convert literals - # to the target type. + # to the target type. Determining the type here _can_ fail. In that case the + # `lType` field will just be `gtVoid`, like the default. 
if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit: # determine literal type based on `bRight` - result.bLeft.lType = nimToGpuType(node[2]) + result.bLeft.lType = nimToGpuType(node[2], allowToFail = true) elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit: # determine literal type based on `bLeft` - result.bRight.lType = nimToGpuType(node[1]) + result.bRight.lType = nimToGpuType(node[1], allowToFail = true) of nnkDotExpr: ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT From ba89dcbc983ed906ab0da958f2f3b714dc21b000 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 14:17:19 +0200 Subject: [PATCH 05/87] handle `UncheckedArray` in `gpuTypeToString` on CUDA backend --- constantine/math_compiler/experimental/backends/cuda.nim | 1 + 1 file changed, 1 insertion(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 227bde79..1077b92c 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -43,6 +43,7 @@ proc gpuTypeToString*(t: GpuTypeKind): string = of gtVoidPtr: "void*" of gtObject: "struct" of gtString: "const char*" + of gtUA: "" # `UncheckedArray` by itself is nothing in CUDA else: raiseAssert "Invalid type : " & $t From a2bbf982e5df16dc47578f97444b645ad673cae9 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 15:17:27 +0200 Subject: [PATCH 06/87] rename constant to set workgroup size --- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 26edc9c1..dc41cb12 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -825,5 +825,5 @@ proc 
codegen*(ctx: var GpuContext): string = for fnIdent, fn in ctx.fnTab: if fn.isGlobal(): ## XXX: make adjustable! - result.add "@compute @workgroup_size(NUM_WORKGROUPS)\n" + result.add "@compute @workgroup_size(WORKGROUP_SIZE)\n" result.add ctx.genWebGpu(fn) & "\n\n" From 1c5ff0c0925f660c2108e2c3e6a42986d77e9a88 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 15:19:51 +0200 Subject: [PATCH 07/87] minor cleanup --- .../math_compiler/experimental/backends/wgsl.nim | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index dc41cb12..a6ecf6d4 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -552,26 +552,17 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = case ast.kind of gpuVoid: return # nothing to emit of gpuProc: - - ## XXX: if a {.global.} / attGlobal proc, lift arguments - ## Store all arguments in the `GpuContext` - ## *AFTER* processing all of the code, generate header and place at beginning - ## Most difficult: - ## - track identifiers from {.global.} functions into arbitrary layers and remove - ## BUT, we can also have a full preprocessing pass. - let attrs = collect: for att in ast.pAttributes: $att - # Parameters var params: seq[string] for p in ast.pParams: params.add gpuTypeToString(p.typ, p.ident, allowEmptyIdent = false) var fnArgs = params.join(", ") if $attGlobal in attrs: doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!" - ## XXX: clean this up. Add the global id builtin + ## XXX: make this more flexible. 
In theory can be any name fnArgs = "@builtin(global_invocation_id) global_id: vec3" let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) From b98f06985ccfc7d8ce3b15b4b3a4ef0b81586579 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 11 Aug 2025 18:49:15 +0200 Subject: [PATCH 08/87] correctly handle `gtUA` (UncheckedArray) on CUDA backend The `gtUA` type enum element is new and was not correctly handled yet on the CUDA backend. --- constantine/math_compiler/experimental/backends/cuda.nim | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 1077b92c..9ee253ca 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -58,6 +58,10 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, var skipIdent = false case t.kind of gtPtr: + var t = t # if `ptr UncheckedArray`, remove the `gtUA` layer. No meaning on CUDA + if t.to.kind == gtUA: + t.to = t.to.uaTo + if t.to.kind == gtArray: # ptr to array type # need to pass `*` for the pointer into the identifier, i.e. 
# `state: var array[4, BigInt]` From c0e1eb88fa111405e5e2b8ad26037acfa2201ab3 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 12 Aug 2025 18:02:45 +0200 Subject: [PATCH 09/87] fix access of type for left ident in assignment --- constantine/math_compiler/experimental/backends/wgsl.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index a6ecf6d4..865996ba 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -637,8 +637,8 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuConv: dfl() else: raiseAssert "Not implemented to determine mutability from node: " & $arg - let leftTyp = ast.aLeft.determineIdent().iTyp - if leftTyp.kind == gtPtr and leftTyp.to.kind == gtInt32: + let leftId = ast.aLeft.determineIdent() + if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32: # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and # we patched the `bool -> i32` and thus need to convert it. From 24afcfe883ae3f2dff11c9aebc0c370bcbd5c425 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 12 Aug 2025 18:03:17 +0200 Subject: [PATCH 10/87] rewrite compound assignment operators in all functions i.e. 
x += 5 becomes x = x + 5 etc for any prefix `foo=` in x foo= y we generate x = x foo y --- .../experimental/backends/wgsl.nim | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 865996ba..1fc86f0d 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -542,6 +542,31 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.injectAddressOf(fn) + proc rewriteCompoundAssignment(n: GpuAst): GpuAst = + doAssert n.kind == gpuBinOp + + template genAssign(left, rnode, op: typed): untyped = + let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) + GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) + + let op = n.bOp + if op.len >= 2 and op[^1] == '=': + result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last + else: + # leave untouched + result = n + + proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = + case n.kind + of gpuBinOp: n = rewriteCompoundAssignment(n) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. `x += 5` -> `x = x + 5` + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn) + + proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From 18d582f271675da3363b6fc18457ee52d28884cc Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 16:57:53 +0200 Subject: [PATCH 11/87] extend `GpuFieldInit` by a type field We need that type to determine information about what type we actually assign in a `gpuObjConstr`. 
--- constantine/math_compiler/experimental/gpu_types.nim | 9 ++++++++- constantine/math_compiler/experimental/nim_to_gpu.nim | 11 ++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 448a783e..569682ff 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -212,6 +212,7 @@ type GpuFieldInit* = object name*: string value*: GpuAst + typ*: GpuType ## XXX: UNUSED TemplateInfo* = object @@ -385,7 +386,13 @@ proc clone*(ast: GpuAst): GpuAst = result = GpuAst(kind: gpuObjConstr) result.ocName = ast.ocName for f in ast.ocFields: - result.ocFields.add(GpuFieldInit(name: f.name, value: f.value.clone())) + result.ocFields.add( + GpuFieldInit( + name: f.name, + value: f.value.clone(), + typ: f.typ.clone() + ) + ) of gpuInlineAsm: result = GpuAst(kind: gpuInlineAsm) result.stmt = ast.stmt diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 2e8a42d3..ce460e1a 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -658,16 +658,21 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = for i in 1 ..< node.len: # all fields to be init'd doAssert node[i].kind == nnkExprColonExpr ocFields.add GpuFieldInit(name: node[i][0].strVal, - value: ctx.toGpuAst(node[i][1])) + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + # now add fields in order of the type declaration for i in 0 ..< flds.len: let idx = findIdx(ocFields, flds[i].name) if idx >= 0: - result.ocFields.add ocFields[idx] + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f else: let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) result.ocFields.add GpuFieldInit(name: flds[i].name, - value: dfl) + value: 
dfl, + typ: flds[i].typ) of nnkAsmStmt: doAssert node.len == 2 From fbfeab6cc97afe52897aa2e4ac7f19e3d7156f11 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 16:59:45 +0200 Subject: [PATCH 12/87] make sure to update symbols in global functions too Otherwise we don't have up to date type / symbol kind information in globals. --- .../experimental/backends/wgsl.nim | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 1fc86f0d..78436f9b 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -498,6 +498,20 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = for ch in mitems(n): ctx.injectAddressOf(ch) +proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = + ## Update symbols in global functions to have same mutability and symbolkind as + ## parameters + case n.kind + of gpuIdent: + if n.iSym in ctx.globals: + n.symbolKind = gsGlobalKernelParam + if n.iTyp.kind == gtPtr: + let g = ctx.globals[n.iSym] + n.iTyp.mutable = g.typ.kind == gtPtr # arguments as pointers == mutable + else: + for ch in n: + ctx.updateSymsInGlobals(ch) + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -518,6 +532,9 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for p in fn.pParams: ctx.globals[p.ident.iSym] = p # copy all parameters over to globals fn.pParams.setLen(0) # delete function's parameters + # now update all appearances of the parameters, now globals, such that they reflect + # the correct symbol kind and mutability + ctx.updateSymsInGlobals(fn) else: discard From 1f04a82aee993af78ed66860d6044826d8783c61 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:00:49 +0200 Subject: [PATCH 13/87] remove local `determineIdent` from `genWebGpu` --- .../experimental/backends/wgsl.nim | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 78436f9b..66bda610 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -655,30 +655,6 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight), ctx.size(ast.aLeft)) else: - proc determineIdent(arg: GpuAst): GpuAst = - ## Tries to determine the underlying ident that is contained in this node. - ## The issue is the argument to a `gpuCall` can be a complicated expression. - ## Depending on the node it may be possible to extract a simple identifier, - ## e.g. for `addr(foo)` (`gpuAddr` of `gpuIdent` node) we can get the ident. - ## If this fails, we return a `gpuVoid` node. - ## - ## TODO: Think about if it ever makes sense to extract the ident underlying - ## e.g. `deref` and use _that_ to determine mutability & address space. 
- template dfl(): untyped = GpuAst(kind: gpuVoid) - case arg.kind - of gpuIdent: arg - of gpuAddr: arg.aOf.determineIdent() - of gpuDeref: arg.dOf.determineIdent() - of gpuCall: dfl() - of gpuIndex: arg.iArr.determineIdent() - of gpuDot: arg.dParent.determineIdent() - of gpuLit: dfl() - of gpuBinOp: dfl() - of gpuBlock: arg.statements[^1].determineIdent() - of gpuPrefix: dfl() - of gpuConv: dfl() - else: - raiseAssert "Not implemented to determine mutability from node: " & $arg let leftId = ast.aLeft.determineIdent() if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32: # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was From ebcff83187377238fce18fb57145a3dea0e56a0b Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:01:17 +0200 Subject: [PATCH 14/87] fix tree representation of GpuAst --- .../math_compiler/experimental/gpu_types.nim | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 569682ff..52b4b668 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -559,10 +559,10 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.vName, indent + 2) result.add pretty(n.vInit, indent + 2) if n.vAttributes.len > 0: - result.add id("Attributes") + result.add idd("Attributes") for attr in n.vAttributes: let indent = indent + 2 - result.add id(attr) + result.add idd(attr) of gpuAssign: result.add pretty(n.aLeft, indent + 2) result.add pretty(n.aRight, indent + 2) @@ -599,11 +599,13 @@ proc pretty*(n: GpuAst, indent: int = 0): string = let indent = indent + 2 result.add id(t.name) of gpuObjConstr: - result.add id("Ident", n.ocName) - result.add id("Fields") + result.add idd("Ident", n.ocName) + result.add idd("Fields") for f in n.ocFields: - let indent = 
indent + 2 - result.add id("Name", f.name) + var indent = indent + 2 + result.add idd("Field") + indent = indent + 2 + result.add idd("Name", f.name) result.add pretty(f.value, indent + 2) of gpuInlineAsm: result.add id(n.stmt) From 3a32d2b9a8e93b1e6de6d1bf0dda999d3bc250c0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:01:25 +0200 Subject: [PATCH 15/87] add `mpairs`, `pairs` iterator for `GpuAst` --- .../math_compiler/experimental/gpu_types.nim | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 52b4b668..0b6b560d 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -703,6 +703,19 @@ iterator mitems*(ast: var GpuAst): var GpuAst = iterator items*(ast: GpuAst): GpuAst = iterImpl(ast, mutable = false) +iterator mpairs*(ast: var GpuAst): (int, var GpuAst) = + ## Iterate over all child nodes of the given AST and the index + var i = 0 + for el in mitems(ast): + yield (i, el) + inc i + +iterator pairs*(ast: GpuAst): (int, GpuAst) = + var i = 0 + for el in items(ast): + yield (i, el) + inc i + ## General utility helpers From a6186ae5174f051f340c4c152f5cfc479ccb44a5 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:04:38 +0200 Subject: [PATCH 16/87] support default initialization in obj constr fields As structs only support 'constructible types' in their fields anyway, we can just default initialize all fields the user leaves out in an object constructor. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 66bda610..27036005 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -764,7 +764,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuObjConstr: result = ast.ocName & "(" for i, el in ast.ocFields: - result.add ctx.genWebGpu(el.value) + if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": + # use type to construct a default value + let typStr = gpuTypeToString(el.typ, allowEmptyIdent = true) + result.add typStr & "()" + else: + result.add ctx.genWebGpu(el.value) if i < ast.ocFields.len - 1: result.add ", " result.add ")" From f485deb1c32cd0097ecf1efb5eb6882cd4e78fea Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:06:11 +0200 Subject: [PATCH 17/87] =?UTF-8?q?correctly=20handle=20`var=20foo=20{.const?= =?UTF-8?q?ant.}`=20variables=20by=20=E2=87=92=20globals?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those are intended for runtime constants, i.e. further storage buffers in the context of WGSL. In CUDA we'd use `copyToSymbol` to copy the data to them before execution. 
--- .../experimental/backends/wgsl.nim | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 27036005..96c5fcba 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -512,6 +512,31 @@ proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.updateSymsInGlobals(ch) +proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = + ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to + ## the `globals` of the context. Such variables are *not* regular global constants, but rather + ## `storage` buffers, which are filled before the kernel is executed. + ## + ## XXX: Document current not ideal behavior that one needs to be careful to pass data into + ## `wgsl.fakeExecute` precisely in the order in which the `var foo {.constant.}` are defined + ## *AND* after all kernel parameters! + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + var i = 0 + while i < blk.len: + doAssert blk.kind == gpuBlock + let g = blk.statements[i] + if g.kind == gpuVar and atvConstant in g.vAttributes: + # remove this from `globalBlocks` and add to `globals` + doAssert g.vInit.kind == gpuVoid, "A variable annotated with `{.constant.}` must not have an initialization!" + # we construct a fake parameter from it + ## XXX: `storage` address space is probably what we want, but think more about it + let param = GpuParam(ident: g.vName, typ: g.vType, addressSpace: asStorage) + ctx.globals[param.ident.iSym] = param + blk.statements.delete(i) + # no need to increase `i` + else: + inc i + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -538,6 +563,12 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = else: discard + # 2.b filter out all `var foo {.constant.}: dtype` from the `globalBlocks` and add them to + # the `globals` + # `globalBlocks` has two entries: + # 0: variables + # 1: types + ctx.pullConstantPragmaVars(ctx.globalBlocks[0]) # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. If they have pointer arguments we # generate a generic instantiation for the exact pointer types used. From 08e43c5fd9b51304375b3c5220b28adca275b86c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:13:52 +0200 Subject: [PATCH 18/87] move rewriting of compound assignment out of `storagePass` --- .../experimental/backends/wgsl.nim | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 96c5fcba..7e41aa2a 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -498,6 +498,32 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = for ch in mitems(n): ctx.injectAddressOf(ch) +proc rewriteCompoundAssignment(n: GpuAst): GpuAst = + doAssert n.kind == gpuBinOp + if n.bOp in ["<=", "==", ">=", "!="]: return n + + template genAssign(left, rnode, op: typed): untyped = + let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) + GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) + + let op = n.bOp + if op.len >= 2 and op[^1] == '=': + result = genAssign(n.bLeft, n.bRight, op[0 .. 
^2]) # all but last + else: + # leave untouched + result = n + +proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = + case n.kind + of gpuBinOp: + n = rewriteCompoundAssignment(n) + for ch in mitems(n): # now go over children + ctx.makeCodeValid(ch, inGlobal) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = ## Update symbols in global functions to have same mutability and symbolkind as ## parameters @@ -589,33 +615,11 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = if fn.isGlobal(): # non global functions don't need to be mutated ctx.injectAddressOf(fn) - - proc rewriteCompoundAssignment(n: GpuAst): GpuAst = - doAssert n.kind == gpuBinOp - - template genAssign(left, rnode, op: typed): untyped = - let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) - GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) - - let op = n.bOp - if op.len >= 2 and op[^1] == '=': - result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last - else: - # leave untouched - result = n - - proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = - case n.kind - of gpuBinOp: n = rewriteCompoundAssignment(n) - else: - for ch in mitems(n): - ctx.makeCodeValid(ch) # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. 
`x += 5` -> `x = x + 5` for (fnIdent, fn) in mpairs(ctx.fnTab): - ctx.makeCodeValid(fn) + ctx.makeCodeValid(fn, inGlobal = fn.isGlobal()) -proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From 00c4b16d0c2dd5cb022fd6c6e272643bdfee008d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:18:38 +0200 Subject: [PATCH 19/87] implement lifting of struct pointer fields This gives us the convenience of passing around a struct with a pointer field, while preserving the restrictions of WebGPU. Because of the fact that storage buffers are global anyway, we can "pretend" the pointers are part of an object and simply replace them by the global when the fields are used. We throw a CT error if one tries to assign a non storage buffer pointer to a field. --- .../experimental/backends/wgsl.nim | 161 ++++++++++++++++++ .../math_compiler/experimental/gpu_types.nim | 4 + 2 files changed, 165 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 7e41aa2a..542e060f 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -388,6 +388,11 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp ## of that function (hence the name `scanGenerics`). The generic instance will be added ## to `fnTab` instead. The name of the generic will be derived based on the types ## of arguments with respect to mutability and address space. + ## + ## In addition this function records any time a `struct` is constructed in a `gpuObjConstr` + ## node and a pointer field assigned to it. As pointer fields are not valid in WGSL, we + ## record them here to replace them by their arguments passed to the constructor later. 
+ ## The pointers _must_ be pointers passed into a global kernel (i.e. `storage` address space). case n.kind of gpuCall: let fn = n.cName @@ -438,6 +443,18 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp # Harvest generics from arguments to this call! for arg in n.cArgs: ctx.scanGenerics(arg, callerParams) + of gpuObjConstr: + # If pointer argument of `storage`, strip out, if pointer type field + # otherwise, raise CT error + for f in n.ocFields: + if f.typ.kind == gtPtr: + doAssert f.value.kind in [gpuAddr, gpuIdent], "Constructing a pointer field " & + "from a more complex expression than an ident or an address-of operation " & + "is currently not supported." + let id = f.value.determineIdent() + doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " & + "variable (i.e. an argument to a global kernel) is not supported: " & $f + ctx.structsWithPtrs[(n.ocName, f.name)] = id else: for ch in n: ctx.scanGenerics(ch, callerParams) @@ -513,13 +530,129 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = # leave untouched result = n +proc getStructName(n: GpuAst): string = + ## Given an identifier `gpuIdent` (or `Deref` of one), return the name of the struct type + ## the ident is of or an empty string if it is not (pointing to) a struct. + doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind + var p = n + if p.kind == gpuDeref: + p = n.dOf + result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject: + p.iTyp.to.name + elif p.iTyp.kind == gtObject: + p.iTyp.name + else: "" + proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = + ## Addresses other AST patterns that need to be rewritten on WGSL. 
Aspects + ## that are rewritten include: + ## + ## - (`gpuBinOp`) rewriting compound assignment operators as regular assignments, `x += y` ↦ `x = x + y` + ## + ## - (`gpuDot`) replace field access of struct pointer fields by the pointers passed into the object + ## constructor (ref `scanGenerics`). `inGlobal` is used to decide what exactly we replace + ## it by. Inside of a global function the variables won't be pointers, hence we insert `&foo`. + ## In device functions, the globals will have been passed into the function as a parameter, + ## `ptr`. Thus, we replace by `foo`. + ## NOTE: We could consider to move this into `scanGenerics`, but for the moment I prefer to + ## do code transformations here and `scanGenerics` being only about data collection. + ## + ## - (`gpuAssign`) compile time errors, if a user tries to assign a pointer to a struct pointer field + ## outside the constructor. + ## + ## - (`gpuCall`) potentially update signatures of our custom generic functions. In `scanGenerics` if we + ## have a call like `foo(bar.ptrField)` we will determine the signature of `foo` to have + ## a `function` pointer, because `bar` will be a local struct instance. However, due to + ## our replacement rules and fact that *only* storage pointers may be assigned to constructors + ## the correct signature would be `storage` for the first argument after replacing `bar.ptrField` + ## by its value in the constructor. + ## + ## - (`gpuObjConstr`) delete arguments to object constructors, which assign pointer fields. + ## + ## - (`gpuVar`) update types of new variables based on the RHS. May have changed since Nim -> GpuAst, + ## due to `gpuDot` replacement further up. + ## + ## NOTE: A few cases already raise compile time errors _here_ and not in `checkCodeValid`, + ## as some transformations otherwise break the detection. 
case n.kind of gpuBinOp: n = rewriteCompoundAssignment(n) for ch in mitems(n): # now go over children ctx.makeCodeValid(ch, inGlobal) + of gpuObjConstr: # strip out arguments that are pointer types + let t = n.ocName + var i = 0 + while i < n.ocFields.len: + let f = n.ocFields[i] + if (t, f.name) in ctx.structsWithPtrs: + if f.typ.kind == gtPtr: + n.ocFields.delete(i) + else: + inc i + else: + inc i + of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. `foo.bar` -> `&res` + var p = n.dParent + let id = getStructName(p) + doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind + let field = n.dField.ident() + if id.len > 0 and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer + let v = ctx.structsWithPtrs[(id, field)] + ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions, + ## we will have passed the parameter + if inGlobal: + n = GpuAst(kind: gpuAddr, aOf: v) # overwrite with the address of value passed in to the object constructor + else: + n = v + of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field + if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]: + let dot = n.aLeft + let id = getStructName(dot.dParent) + if id.len > 0: + doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind + let field = dot.dField.ident() + if (id, field) in ctx.structsWithPtrs: + raiseAssert "Assignment of a struct (`" & id & "`) field of a pointer type is not supported. " & + "Assign pointer fields in the constructor only. 
In code: " & $ctx.genWebGpu(n) + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + of gpuCall: + # we might need to update the type of generics, if we did the replacement in `gpuDot`, because + # a struct ptr field will have had the wrong storage type + for ch in mitems(n): # first process children + ctx.makeCodeValid(ch, inGlobal) + # now check if any argument's type mismatches against the generic we recorded + let fnName = n.cName + if fnName in ctx.fnTab: # otherwise will not be generated by us, so irrelevant + # NOTE: theoretically, if we had struct pointer field replacements with symbols that had + # *different* address spaces, we'd need to split one generic into multiple again here. + # But that shouldn't be possible, because our entire replacement is currently only + # sane if we store a *storage pointer* in a struct. We would have raised in `scanGenerics` + # because of invalid pointer assignment in an object constructor. + let fn = ctx.fnTab[fnName] + let params = fn.pParams + for i, arg in n: # walk the parameters again and compare + let argId = arg.determineIdent() + if argId.kind != gpuVoid and argId.ident().len > 0: + var p = params[i] + ## XXX: update anything else? We mostly care about the address space here, because + ## the rest _should_ be the same anyway. + if p.addressSpace != argId.symbolKind.toAddressSpace(): + p.addressSpace = argId.symbolKind.toAddressSpace() + p.ident.symbolKind = argId.symbolKind + fn.pParams[i] = p # write back, not a ref type! + of gpuVar: + # first recurse on the `gpuVar` to get possible replacements + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + # update LHS with info from RHS by copying over its symbol kind. Different types are + # possible after replacements of `gpuDot` nodes above. 
+ if n.vType.kind == gtPtr: + let rightId = n.vInit.determineIdent() + n.vName.symbolKind = rightId.symbolKind + n.vType.mutable = rightId.iTyp.mutable + n.vName.iTyp.mutable = rightId.iTyp.mutable else: for ch in mitems(n): ctx.makeCodeValid(ch, inGlobal) @@ -563,6 +696,31 @@ proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = else: inc i +proc removeStructPointerFields(blk: var GpuAst) = + ## Filters out `ptr` fields from all structs. + ## + ## If a type is used with `storage` pointer arguments, we will later perform replacement of field + ## access to the pointer field by the value we assign. + ## + ## If the user assigns a local (`function`) pointer, we raise a CT error. We _could_ in theory support + ## replacement for local pointer types too, but it requires a more careful analysis of which + ## local to replace by and the name in other scopes. I.e. passing a local pointer to a constructor + ## which has a different name than in the calling scope would require us to traverse the AST up to + ## the calling scope. + ## + ## Given the extreme limitations on `let` variables with pointers anyway, I don't think there is much + ## purpose in supporting such features. + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + for typ in mitems(blk): + doAssert typ.kind == gpuTypeDef + var i = 0 + while i < typ.tFields.len: + let f = typ.tFields[i] + if f.typ.kind == gtPtr: # delete + typ.tFields.delete(i) + else: + inc i + proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. 
## This is useful if your GPU code contains multiple kernels with differing @@ -595,6 +753,9 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 0: variables # 1: types ctx.pullConstantPragmaVars(ctx.globalBlocks[0]) + # 2.c remove all fields of structs, which have pointer type + removeStructPointerFields(ctx.globalBlocks[1]) + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. If they have pointer arguments we # generate a generic instantiation for the exact pointer types used. diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 0b6b560d..ff402991 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -246,6 +246,10 @@ type # ## when we finish, we pop. Before we pop, we assign the variable definitions to the `gpuBlock` # ## `locals` genSymCount*: int ## increases for every generated identifier (currently only underscore `_`), hence the basic solution + ## Maps a struct type and field name, which is of pointer type to the value the user assigns + ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` + ## constructor (WebGPU only). + structsWithPtrs*: Table[(string, string), GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters From 4046396c4f5d1b16067b33832a699de979cf10dd Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:20:18 +0200 Subject: [PATCH 20/87] add helper to check code for validity Currently only checks if we assign a pointer to a `var` variable (which is not allowed in WGSL). 
--- .../experimental/backends/wgsl.nim | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 542e060f..336d1797 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -671,6 +671,23 @@ proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.updateSymsInGlobals(ch) +proc checkCodeValid(ctx: var GpuContext, n: GpuAst) = + ## Checks if the code is valid according to WGSL spec. + ## So far handles: + ## - variables (`var`) to pointer types are not allowed + ## + ## Some code is already rejected in earlier passes, if a compiler pass would transform + ## the code in such a way as making a detection of illegal code invalid. + case n.kind + of gpuVar: + if n.vType.kind == gtPtr and n.vMutable: # `vMutable == var` -> not allowed to store pointers + let code = ctx.genWebGpu(n) + raiseAssert "The node: `" & $code & "` constructs a variable (`var`) to a pointer type. This " & + "is invalid in WGSL. Use `let`." + else: + for ch in n: + ctx.checkCodeValid(ch) + proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to ## the `globals` of the context. Such variables are *not* regular global constants, but rather @@ -780,6 +797,10 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for (fnIdent, fn) in mpairs(ctx.fnTab): ctx.makeCodeValid(fn, inGlobal = fn.isGlobal()) + # 6. 
finally raise error if we find anything that is not allowed in WGSL after our transformations + for (fnIdent, fn) in pairs(ctx.fnTab): + ctx.checkCodeValid(fn) + proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) From f27b0a1aeecf6195907ae41997440db3de13e027 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 14 Aug 2025 17:21:12 +0200 Subject: [PATCH 21/87] add support for full Nim generics in the context of GPU code This means we can finally write something like: ``` proc foo[N: static int](ar: array[N, BigInt], ...) ``` We generate generic instantiations for every instantiation the Nim compiler produces (plus potentially additional ones for every pointer type argument the user passes into such a function). --- .../experimental/backends/wgsl.nim | 4 + .../experimental/gpu_compiler.nim | 15 ++- .../math_compiler/experimental/gpu_types.nim | 14 +++ .../math_compiler/experimental/nim_to_gpu.nim | 115 +++++++++++------- 4 files changed, 101 insertions(+), 47 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 336d1797..728ab57c 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -751,6 +751,10 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.globalBlocks.add varBlock ctx.globalBlocks.add typBlock + # Now add the generics to the `allFnTab` + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # 2. 
Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table if (fn.isGlobal() and kernel.len > 0 and fn.pName.ident() == kernel) or diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 0c68e88e..d71c4298 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -94,16 +94,17 @@ proc malloc*(size: csize_t): pointer = discard proc free*(p: pointer) = discard proc syncthreads*() {.cudaName: "__syncthreads".} = discard -macro toGpuAst*(body: typed): GpuAst = +macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## WARNING: The following are *not* supported: ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between ## what is a field access and a function call. Hence we assume any `nnkDotExpr` ## is actually a field access! ## - most regular Nim features :) - echo body.treerepr - echo body.repr var ctx = GpuContext() - newLit(ctx.toGpuAst(body)) + let ast = ctx.toGpuAst(body) + let gen = toSeq(ctx.genericInsts.values) + let g = GpuGenericsInfo(data: gen) + newLit((g, ast)) macro cuda*(body: typed): string = ## WARNING: The following are *not* supported: @@ -119,13 +120,15 @@ macro cuda*(body: typed): string = let body = ctx.codegen(gpuAst) result = newLit(body) -proc codegen*(ast: GpuAst, kernel: string = ""): string = +proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = ## Generates the code based on the given AST (optionally at runtime) and restricts ## it to a single global kernel (WebGPU) if any given. 
- let ast = ast.clone() ## XXX: remove clone var ctx = GpuContext() + for fn in gen.data: # assign generics info to correct table + ctx.genericInsts[fn.pName] = fn result = ctx.codegen(ast, kernel) + when isMainModule: # Mini example let kernel = cuda: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index ff402991..87b1a22a 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -250,6 +250,20 @@ type ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` ## constructor (WebGPU only). structsWithPtrs*: Table[(string, string), GpuAst] + ## Set of all generic proc names we have encountered in Nim -> GpuAst. When + ## we see an `nnkCall` we check if we call a generic function. If so, look up + ## the instantiated generic, parse it and store in `genericInsts` below. + generics*: HashSet[string] + + ## Stores the unique identifiers (keys) and the implementations of the + ## precise generic instantiations that are called. + genericInsts*: OrderedTable[GpuAst, GpuAst] + + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we + ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` + ## causes trouble. + GpuGenericsInfo* = object + data*: seq[GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ce460e1a..fe814ffa 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -105,7 +105,9 @@ proc determineArrayLength(n: NimNode): int = of nnkIdent: let msg = """Found array with length given by identifier: $#! 
You might want to create a typed template taking a typed parameter for this -constant to force the Nim compiler to bind the symbol. +constant to force the Nim compiler to bind the symbol. In theory though this +error should not appear anymore though, as we don't try to parse generic +functions. """ % n[1].strVal raiseAssert msg else: @@ -248,7 +250,8 @@ proc requiresMemcpy(n: NimNode): bool = proc collectProcAttributes(n: NimNode): set[GpuAttribute] = doAssert n.kind == nnkPragma for pragma in n: - doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind == nnkCall: pragma[0] else: pragma case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal @@ -374,42 +377,55 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = ctx.toGpuAst(node[0]) of nnkProcDef, nnkFuncDef: - result = GpuAst(kind: gpuProc) - result.pName = ctx.toGpuAst(node.name) - result.pName.symbolKind = gsProc ## This is a procedure identifier - doAssert node[3].kind == nnkFormalParams - result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type - # Process pragmas - if node.pragma.kind != nnkEmpty: - doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
- result.pAttributes = collectProcAttributes(node.pragma) - if result.pAttributes.len == 0: # means `nimonly` was applied - return GpuAst(kind: gpuVoid) - # Process parameters - for i in 1 ..< node[3].len: - let param = node[3][i] - let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: - let typIdx = param.len - 2 # second to last is the type - # IdentDefs - # Ident "x" - # Ident "y" - # Ident "res" - # PtrTy - # Ident "float32" # `param.len - 2` - # Empty # `param.len - 1` - let paramType = nimToGpuType(param[typIdx]) - #echo "Argument: ", param.treerepr, " has tpye: ", paramType - for i in 0 ..< numParams: - var p = ctx.toGpuAst(param[i]) - let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam - else: gsDeviceKernelParam - p.iTyp = paramType ## Update the type of the symbol - p.symbolKind = symKind ## and the symbol kind - let param = GpuParam(ident: p, typ: paramType) - result.pParams.add(param) - - result.pBody = ctx.toGpuAst(node.body) - .ensureBlock() # single line procs should be a block to generate `;` + # if it is a _generic_ function, we don't actually process it here. instead we add it to + # the `generics` set. When we encounter a `gpuCall` we will then check if the function + # being called is part of the generic set and look up its _instantiated_ implementation + # to parse it. The parsed generics are stored in the `genericInsts` table. + let name = ctx.toGpuAst(node.name) + if node[2].kind == nnkGenericParams: # is a generic + ctx.generics.incl name.iName # need to use raw name, *not* symbol + result = GpuAst(kind: gpuVoid) + else: + result = GpuAst(kind: gpuProc) + result.pName = name + result.pName.symbolKind = gsProc ## This is a procedure identifier + doAssert node[3].kind == nnkFormalParams + result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type + # Process pragmas + if node.pragma.kind != nnkEmpty: + doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
+ result.pAttributes = collectProcAttributes(node.pragma) + if result.pAttributes.len == 0: # means `nimonly` was applied + return GpuAst(kind: gpuVoid) + # Process parameters + echo "Node: ", node.treerepr + if node[2].kind == nnkGenericParams: + echo node[2][0].getImpl().treerepr + echo node[2][0].treerepr + for i in 1 ..< node[3].len: + let param = node[3][i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = nimToGpuType(param[typIdx]) + #echo "Argument: ", param.treerepr, " has tpye: ", paramType + for i in 0 ..< numParams: + var p = ctx.toGpuAst(param[i]) + let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam + else: gsDeviceKernelParam + p.iTyp = paramType ## Update the type of the symbol + p.symbolKind = symKind ## and the symbol kind + let param = GpuParam(ident: p, typ: paramType) + result.pParams.add(param) + + result.pBody = ctx.toGpuAst(node.body) + .ensureBlock() # single line procs should be a block to generate `;` of nnkLetSection, nnkVarSection: # For a section with multiple declarations, create a block @@ -518,8 +534,22 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuVoid) of nnkCall, nnkCommand: - # Check if this is a template call + # `name` below is name + signature hash. Check if this is a generic based on node repr let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice + if node[0].repr in ctx.generics: # process the generic instantiaton and store + # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature + # Only the latter contains e.g. 
correct instantiation of static array sizes + let inst = node[0].getImpl() + let sig = node[0].getTypeInst() + inst.params = sig.params # copy over the parameters + let fn = ctx.toGpuAst(inst) + doAssert fn.pName.iSym == name.iSym, "Not matching" + # now overwrite the identifier's `iName` field by its `iSym` so that different + # generic insts have different + fn.pName.iName = fn.pName.iSym + name.iName = fn.pName.iSym + ctx.genericInsts[fn.pName] = fn + let args = node[1..^1].mapIt(ctx.toGpuAst(it)) # Producing a template call something like this (but problematic due to overloads etc) # we could then perform manual replacement of the template in the CUDA generation pass. @@ -571,7 +601,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkSym: let s = node.repr & "_" & node.signatureHash() # NOTE: The reason we have a tab of known symbols is not to keep the same _reference_ to each - # symbol, but rather to allow having the same symbol kind (set in the caller of this call). + # symbol, but rather to allow having the same symbol kind and appropriate type for each + # symbol `GpuAst` (of kind `gpuIdent`), which is set in the caller of this call. # For example in `nnkCall` nodes returning the value from the table automatically means the # `symbolKind` is local / function argument etc. if s notin ctx.sigTab: @@ -581,7 +612,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.iName == "_": result.iName = "tmp_" & $ctx.genSymCount inc ctx.genSymCount - #ctx.sigTab[s] = result + ctx.sigTab[s] = result else: result = ctx.sigTab[s] @@ -746,6 +777,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkConstDef result.statements.add ctx.toGpuAst(el) + of nnkWhenStmt: + raiseAssert "We shouldn't be seeing a `when` statement after sem check of the Nim code." 
else: echo "Unhandled node kind in toGpuAst: ", node.kind raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr From d3e0c456d03dd50f9424fd427185618c6bb39207 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Sun, 17 Aug 2025 18:06:34 +0200 Subject: [PATCH 22/87] implement support for type aliases So far only the code generation for WGSL is done, but the CUDA code gen is simple. --- .../math_compiler/experimental/backends/wgsl.nim | 6 +++++- .../math_compiler/experimental/gpu_types.nim | 12 ++++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 16 +++++++++++++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 728ab57c..c7ce0107 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -162,7 +162,7 @@ proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, ty ctx.farmTopLevel(ch, kernel, varBlock, typBlock) of gpuVar, gpuConstexpr: varBlock.statements.add ast - of gpuTypeDef: + of gpuTypeDef, gpuAlias: typBlock.statements.add ast else: discard @@ -729,6 +729,7 @@ proc removeStructPointerFields(blk: var GpuAst) = ## purpose on supporting such features. doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind for typ in mitems(blk): + if typ.kind == gpuAlias: continue # don't need to mutate aliases! 
doAssert typ.kind == gpuTypeDef var i = 0 while i < typ.tFields.len: @@ -982,6 +983,9 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" result.add "}" + of gpuAlias: + result = "alias " & ast.aName & " = " & ctx.genWebGpu(ast.aTo) + of gpuObjConstr: result = ast.ocName & "(" for i, el in ast.ocFields: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 87b1a22a..4adf8941 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -33,6 +33,7 @@ type gpuDot # Member access (a.b) gpuIndex # Array indexing (a[b]) gpuTypeDef # Type definition + gpuAlias # A type alias gpuObjConstr # Object (struct) constructor gpuInlineAsm # Inline assembly (PTX) gpuAddr # Address of an expression @@ -161,6 +162,10 @@ type of gpuTypeDef: tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols tFields*: seq[GpuTypeField] + of gpuAlias: + aName*: string ## Name of the type alias + aTo*: GpuAst ## Type the alias maps to + aDistinct*: bool ## If the alias is a distinct type in Nim. 
of gpuObjConstr: ocName*: string # type we construct ## XXX: it would be better if we already fill the fields with default values here @@ -400,6 +405,10 @@ proc clone*(ast: GpuAst): GpuAst = result.tName = ast.tName for f in ast.tFields: result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone())) + of gpuAlias: + result = GpuAst(kind: gpuAlias) + result.aName = ast.aName + result.aTo = ast.aTo.clone() of gpuObjConstr: result = GpuAst(kind: gpuObjConstr) result.ocName = ast.ocName @@ -616,6 +625,9 @@ proc pretty*(n: GpuAst, indent: int = 0): string = for t in n.tFields: let indent = indent + 2 result.add id(t.name) + of gpuAlias: + result.add id("Alias", n.aName) + result.add pretty(n.aTo, indent + 2) of gpuObjConstr: result.add idd("Ident", n.ocName) result.add idd("Fields") diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index fe814ffa..9c1df7af 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -164,7 +164,9 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. result = initGpuUAType(getInnerPointerType(n, allowToFail)) - of ntyObject: + of ntyObject, ntyAlias: + # for aliases, treat them identical to regular object types, but + # `getTypeName` returns the alias! 
let impl = n.getTypeImpl let flds = impl.parseTypeFields() let typName = getTypeName(n) # might be an object construction @@ -677,8 +679,16 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkTypeDef result.statements.add ctx.toGpuAst(el) of nnkTypeDef: - result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) - result.tFields = parseTypeFields(node[2]) + doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len + case node[2].kind + of nnkObjectTy: # regular `type foo = object` + result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) + result.tFields = parseTypeFields(node[2]) + of nnkSym: # a type alias `type foo = bar` + result = GpuAst(kind: gpuAlias, aName: node[0].strVal, + aTo: ctx.toGpuAst(node[2])) + else: + raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind of nnkObjConstr: let typName = getTypeName(node) result = GpuAst(kind: gpuObjConstr, ocName: typName) From 0d111035b7404ef001d2768a5637e0d3af969296 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Sun, 17 Aug 2025 18:07:02 +0200 Subject: [PATCH 23/87] allow "pulling in" procs from outside `cuda` scope The generic logic is essentially what we need to pull in a proc defined outside the scope of the `cuda` macro into the code. Essentially when we encounter a function that is not known to us yet, we simply look up its implementation from the symbol. This is the first step towards untying the current `cuda` macro code from relying on having _everything_ be defined under that macro. In the future the idea is that essentially only the `{.global.}` procs strictly need to be defined in the macro. These then pull in everything they need (i.e. everything that is used and has been checked by the compiler to be used). 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 9c1df7af..8316b411 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -429,6 +429,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` + # Add to table of known functions + if result.pName notin ctx.allFnTab: + ctx.allFnTab[result.pName] = result + of nnkLetSection, nnkVarSection: # For a section with multiple declarations, create a block result = GpuAst(kind: gpuBlock) @@ -538,7 +542,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkCall, nnkCommand: # `name` below is name + signature hash. Check if this is a generic based on node repr let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice - if node[0].repr in ctx.generics: # process the generic instantiaton and store + if node[0].repr in ctx.generics or name notin ctx.allFnTab: + # process the generic instantiaton and store *or* pull in a proc defined outside + # the `cuda` macro by its implementation. + ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`! # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature # Only the latter contains e.g. correct instantiation of static array sizes let inst = node[0].getImpl() From 2c3f8f62b23d5ac658c6f572cd13afe37b673794 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:22:02 +0200 Subject: [PATCH 24/87] add pretty printer for GpuType This is lossy so it is not the default representation. 
--- .../math_compiler/experimental/gpu_types.nim | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 4adf8941..b2f5293c 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -530,6 +530,31 @@ proc removePrefix(s, p: string): string = result = s result.removePrefix(p) +proc pretty*(t: GpuType): string = + ## returns a flat (but lossy) string representation of the type + if t == nil: + result = "GpuType(nil)" + else: + case t.kind + of gtPtr: + result = if t.implicit: "var " else: "ptr " + result.add pretty(t.to) + of gtUA: + result = "UncheckedArray[" & t.uaTo.pretty() & "]" + of gtObject: + result = t.name # just the name + of gtArray: + result = "array[" & $t.aLen & ", " & t.aTyp.pretty() & "]" + of gtGenericInst: + result = t.gName & "[" + for i, g in t.gArgs: + result.add pretty(g) + if i < t.gArgs.high: + result.add ", " + result.add "]" + else: + result = ($t.kind).removePrefix("gt") + proc pretty*(n: GpuAst, indent: int = 0): string = template id(): untyped = repeat(" ", indent) template idn(x): untyped = repeat(" ", indent) & $x From 0bf280739f555fe4d8a7a2a62db9f0f1ca8d3185 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:32:37 +0200 Subject: [PATCH 25/87] store all types in `ctx.types` table, support 'pulling' types... from outside the `cuda` scope. This is the second step towards making it so that the `cuda` macro only needs to contain the calling code (`{.global.}` proc in general). 
--- .../experimental/backends/cuda.nim | 2 +- .../experimental/backends/wgsl.nim | 37 +++++---- .../experimental/gpu_compiler.nim | 9 +- .../math_compiler/experimental/gpu_types.nim | 28 ++++--- .../math_compiler/experimental/nim_to_gpu.nim | 82 ++++++++++++++----- 5 files changed, 105 insertions(+), 53 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 9ee253ca..ae8307f6 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -256,7 +256,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genCuda(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & "{\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" result.add "}" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index c7ce0107..1c95ea5f 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -454,7 +454,7 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp let id = f.value.determineIdent() doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " & "variable (i.e. 
an argument to a global kernel) is not supported: " & $f - ctx.structsWithPtrs[(n.ocName, f.name)] = id + ctx.structsWithPtrs[(n.ocType, f.name)] = id else: for ch in n: ctx.scanGenerics(ch, callerParams) @@ -530,18 +530,18 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = # leave untouched result = n -proc getStructName(n: GpuAst): string = - ## Given an identifier `gpuIdent` (or `Deref` of one), return the name of the struct type - ## the ident is of or an empty string if it is not (pointing to) a struct. +proc getStructType(n: GpuAst): GpuType = + ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type + ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct. doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind var p = n if p.kind == gpuDeref: p = n.dOf result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject: - p.iTyp.to.name + p.iTyp.to elif p.iTyp.kind == gtObject: - p.iTyp.name - else: "" + p.iTyp + else: GpuType(kind: gtVoid) proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = @@ -581,7 +581,7 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = for ch in mitems(n): # now go over children ctx.makeCodeValid(ch, inGlobal) of gpuObjConstr: # strip out arguments that are pointer types - let t = n.ocName + let t = n.ocType var i = 0 while i < n.ocFields.len: let f = n.ocFields[i] @@ -594,10 +594,10 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = inc i of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. 
`foo.bar` -> `&res` var p = n.dParent - let id = getStructName(p) + let id = getStructType(p) doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind let field = n.dField.ident() - if id.len > 0 and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer + if id.kind != gtVoid and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer let v = ctx.structsWithPtrs[(id, field)] ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions, ## we will have passed the parameter @@ -608,12 +608,12 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]: let dot = n.aLeft - let id = getStructName(dot.dParent) - if id.len > 0: + let id = getStructType(dot.dParent) + if id.kind != gtVoid: doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind let field = dot.dField.ident() if (id, field) in ctx.structsWithPtrs: - raiseAssert "Assignment of a struct (`" & id & "`) field of a pointer type is not supported. " & + raiseAssert "Assignment of a struct (`" & pretty(id) & "`) field of a pointer type is not supported. " & "Assign pointer fields in the constructor only. In code: " & $ctx.genWebGpu(n) for ch in mitems(n): ctx.makeCodeValid(ch, inGlobal) @@ -751,10 +751,15 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ctx.farmTopLevel(ast, kernel, varBlock, typBlock) ctx.globalBlocks.add varBlock ctx.globalBlocks.add typBlock + ## XXX: `typBlock` should now always be empty, as we pass all + ## found types into `ctx.types` # Now add the generics to the `allFnTab` for k, v in pairs(ctx.genericInsts): ctx.allFnTab[k] = v + # And all the known types + for k, typ in pairs(ctx.types): + ctx.globalBlocks.add typ # 2. 
Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table @@ -978,16 +983,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genWebGpu(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & " {\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" result.add "}" of gpuAlias: - result = "alias " & ast.aName & " = " & ctx.genWebGpu(ast.aTo) + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) of gpuObjConstr: - result = ast.ocName & "(" + result = gpuTypeToString(ast.ocType) & "(" for i, el in ast.ocFields: if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": # use type to construct a default value diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index d71c4298..0fd28760 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -102,8 +102,9 @@ macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## - most regular Nim features :) var ctx = GpuContext() let ast = ctx.toGpuAst(body) - let gen = toSeq(ctx.genericInsts.values) - let g = GpuGenericsInfo(data: gen) + let genProcs = toSeq(ctx.genericInsts.values) + let genTypes = toSeq(ctx.types.values) + let g = GpuGenericsInfo(procs: genProcs, types: genTypes) newLit((g, ast)) macro cuda*(body: typed): string = @@ -124,8 +125,10 @@ proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = ## Generates the code based on the given AST (optionally at runtime) and restricts ## it to a single global kernel (WebGPU) if any given. 
var ctx = GpuContext() - for fn in gen.data: # assign generics info to correct table + for fn in gen.procs: # assign generics info to correct table ctx.genericInsts[fn.pName] = fn + for typ in gen.types: # assign generics info to correct table + ctx.types[typ.tTyp] = typ result = ctx.codegen(ast, kernel) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index b2f5293c..9b3f4a70 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -160,14 +160,14 @@ type pOp*: string pVal*: GpuAst of gpuTypeDef: - tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols + tTyp*: GpuType ## the actual type. Used to generate the name tFields*: seq[GpuTypeField] of gpuAlias: - aName*: string ## Name of the type alias + aTyp*: GpuType ## Name of the type alias aTo*: GpuAst ## Type the alias maps to aDistinct*: bool ## If the alias is a distinct type in Nim. of gpuObjConstr: - ocName*: string # type we construct + ocType*: GpuType # type we construct ## XXX: it would be better if we already fill the fields with default values here ocFields*: seq[GpuFieldInit] # the fields we initialize of gpuInlineAsm: @@ -254,7 +254,7 @@ type ## Maps a struct type and field name, which is of pointer type to the value the user assigns ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` ## constructor (WebGPU only). - structsWithPtrs*: Table[(string, string), GpuAst] + structsWithPtrs*: Table[(GpuType, string), GpuAst] ## Set of all generic proc names we have encountered in Nim -> GpuAst. When ## we see an `nnkCall` we check if we call a generic function. If so, look up ## the instantiated generic, parse it and store in `genericInsts` below. @@ -264,11 +264,17 @@ type ## precise generic instantiations that are called. 
genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Table of all known types. Filled during Nim -> GpuAst. Includes generic + ## instantiations, but also all other types. + ## Key: the raw type. Value: a full `gpuTypeDef` + types*: OrderedTable[GpuType, GpuAst] + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` ## causes trouble. GpuGenericsInfo* = object - data*: seq[GpuAst] + procs*: seq[GpuAst] + types*: seq[GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters @@ -402,16 +408,16 @@ proc clone*(ast: GpuAst): GpuAst = result.iIndex = ast.iIndex.clone() of gpuTypeDef: result = GpuAst(kind: gpuTypeDef) - result.tName = ast.tName + result.tTyp = ast.tTyp.clone() for f in ast.tFields: result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone())) of gpuAlias: result = GpuAst(kind: gpuAlias) - result.aName = ast.aName + result.aTyp = ast.aTyp.clone() result.aTo = ast.aTo.clone() of gpuObjConstr: result = GpuAst(kind: gpuObjConstr) - result.ocName = ast.ocName + result.ocType = ast.ocType.clone() for f in ast.ocFields: result.ocFields.add( GpuFieldInit( @@ -645,16 +651,16 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add id("Op", n.pOp) result.add pretty(n.pVal, indent + 2) of gpuTypeDef: - result.add id("Type", n.tName) + result.add id("Type", pretty(n.tTyp)) result.add id("Fields") for t in n.tFields: let indent = indent + 2 result.add id(t.name) of gpuAlias: - result.add id("Alias", n.aName) + result.add id("Alias", pretty(n.aTyp)) result.add pretty(n.aTo, indent + 2) of gpuObjConstr: - result.add idd("Ident", n.ocName) + result.add idd("Ident", pretty(n.ocType)) result.add idd("Fields") for f in n.ocFields: var indent = indent + 2 diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim 
index 8316b411..62c43a6b 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -343,6 +343,30 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = # ctx.sigTab[sig] = result result.symbolKind = gsProc # make sure it's a proc +proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = + ## Looks up the implementation of the given function and stores it in our table + ## of generic instantiations. + ## + ## For any looked up procedure, we attach the `{.device.}` pragma. + ## + ## Mutates the `name` of the given function to match its generic name. + # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature + # Only the latter contains e.g. correct instantiation of static array sizes + let inst = node[0].getImpl() + let sig = node[0].getTypeInst() + inst.params = sig.params # copy over the parameters + let fn = ctx.toGpuAst(inst) + if fn.kind == gpuVoid: # should be an inbuilt proc, i.e. annotated with `{.builtin.}` + doAssert inst.isBuiltIn() + else: + fn.pAttributes.incl attDevice # make sure this is interpreted as a device function + doAssert fn.pName.iSym == name.iSym, "Not matching" + # now overwrite the identifier's `iName` field by its `iSym` so that different + # generic insts have different + fn.pName.iName = fn.pName.iSym + name.iName = fn.pName.iSym ## update the name of the called function + ctx.genericInsts[fn.pName] = fn + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. Maybe we will won't @@ -521,6 +545,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## NOTE: Currently we process templates, but we expect them to be already ## expanded by the Nim compiler. Thus we could in theory expand them manually ## but fortunately we don't need to. 
+ return GpuAst(kind: gpuVoid) let tName = node[0].strVal # Extract parameters @@ -546,18 +571,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # process the generic instantiaton and store *or* pull in a proc defined outside # the `cuda` macro by its implementation. ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`! - # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature - # Only the latter contains e.g. correct instantiation of static array sizes - let inst = node[0].getImpl() - let sig = node[0].getTypeInst() - inst.params = sig.params # copy over the parameters - let fn = ctx.toGpuAst(inst) - doAssert fn.pName.iSym == name.iSym, "Not matching" - # now overwrite the identifier's `iName` field by its `iSym` so that different - # generic insts have different - fn.pName.iName = fn.pName.iSym - name.iName = fn.pName.iSym - ctx.genericInsts[fn.pName] = fn + ctx.addProcToGenericInsts(node, name) let args = node[1..^1].mapIt(ctx.toGpuAst(it)) # Producing a template call something like this (but problematic due to overloads etc) @@ -687,18 +701,42 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.statements.add ctx.toGpuAst(el) of nnkTypeDef: doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len - case node[2].kind - of nnkObjectTy: # regular `type foo = object` - result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) - result.tFields = parseTypeFields(node[2]) - of nnkSym: # a type alias `type foo = bar` - result = GpuAst(kind: gpuAlias, aName: node[0].strVal, - aTo: ctx.toGpuAst(node[2])) + let name = ctx.toGpuAst(node[0]) + if node[1].kind == nnkGenericParams: # if this is a generic, only store existence of it + # will store the instantiatons in `nnkObjConstr` + result = GpuAst(kind: gpuVoid) else: - raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind + let typ = nimToGpuType(node[0]) + case node[2].kind + of nnkObjectTy: # regular 
`type foo = object` + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + result.tFields = parseTypeFields(node[2]) + of nnkSym: # a type alias `type foo = bar` + result = GpuAst(kind: gpuAlias, aTyp: typ, + aTo: ctx.toGpuAst(node[2])) + else: + raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind + + # include this the set of known types to not generate duplicates + ctx.types[typ] = result + # Reset the type we return to void. We now generate _all_ types from the + # `types`. + result = GpuAst(kind: gpuVoid) of nnkObjConstr: - let typName = getTypeName(node) - result = GpuAst(kind: gpuObjConstr, ocName: typName) + ## this should never see `genericParam` I think + let typ = nimToGpuType(node) + if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type + # store the type instantiation + let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: typDef.tFields = typ.oFields + of gtGenericInst: typDef.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
+ + ctx.types[typ] = typDef + + result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type let flds = node[0].getTypeImpl.parseTypeFields() # sym # find all fields that have been defined by the user From d1a73906d16e3aaffc721d8c9b7b4438ef6428f1 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:33:45 +0200 Subject: [PATCH 26/87] support generic instantiations, producing unique WGSL types --- .../experimental/backends/wgsl.nim | 8 +++++ .../math_compiler/experimental/gpu_types.nim | 36 ++++++++++++++++--- .../math_compiler/experimental/nim_to_gpu.nim | 29 +++++++++++++-- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 1c95ea5f..9180b055 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -112,6 +112,14 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = else: result = &"{identPrefix}array<{typ}, {t.aLen}>" skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. 
+ result = t.gName & "_" + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add "_" of gtObject: result = t.name of gtUA: result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">" else: result = gpuTypeToString(t.kind) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 9b3f4a70..fe893028 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -46,12 +46,13 @@ type GpuTypeKind* = enum gtVoid, gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics - gtArray, # Static array `array[N, dtype]` -> `dtype[N]` + gtArray, # Static array `array[N, dtype]` -> `dtype[N]` gtString, - gtObject, # Struct types - gtPtr, # Pointer type, carries inner type - gtUA, # UncheckedArray (UA) mapped to runtime sized arrays - gtVoidPtr # Opaque void pointer + gtObject, # Struct types + gtPtr, # Pointer type, carries inner type + gtUA, # UncheckedArray (UA) mapped to runtime sized arrays + gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!) + gtVoidPtr # Opaque void pointer GpuTypeField* = object name*: string @@ -75,6 +76,10 @@ type aLen*: int # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time # On both CUDA and WebGPU a length of `0` is also used to generate `int foo[]` (CUDA) # `array` (WebGPU) (runtime sized arrays), which are generated from `ptr UncheckedArray[float32]` for example. + of gtGenericInst: + gName*: string # name of the generic type + gArgs*: seq[GpuType] # list of the instantiated generic arguments e.g. 
`vec3` on WGSL backend + gFields*: seq[GpuTypeField] # same as `oFields` for `gtObject` else: discard GpuAttribute* = enum @@ -304,6 +309,12 @@ proc clone*(typ: GpuType): GpuType = of gtArray: result.aTyp = typ.aTyp.clone() result.aLen = typ.aLen + of gtGenericInst: + result.gName = typ.gName + for g in typ.gArgs: + result.gArgs.add g.clone() + for f in typ.gFields: + result.gFields.add GpuTypeField(name: f.name, typ: f.typ.clone()) else: discard proc clone*(ast: GpuAst): GpuAst = @@ -463,6 +474,12 @@ proc hash*(t: GpuType): Hash = of gtArray: h = h !& hash(t.aTyp) h = h !& hash(t.aLen) + of gtGenericInst: + h = h !& hash(t.gName) + for g in t.gArgs: + h = h !& hash(g) + for f in t.gFields: + h = h !& hash(f) else: discard result = !$ h @@ -491,6 +508,15 @@ proc `==`*(a, b: GpuType): bool = else: for i in 0 ..< a.oFields.len: result = result and (a.oFields[i] == b.oFields[i]) + of gtGenericInst: + result = a.gName == b.gName + if a.gArgs.len != b.gArgs.len: result = false + elif a.gFields.len != b.gFields.len: result = false + else: + for i in 0 ..< a.gArgs.len: + result = result and (a.gArgs[i] == b.gArgs[i]) + for i in 0 ..< a.gFields.len: + result = result and (a.gFields[i] == b.gFields[i]) of gtArray: result = a.aTyp == b.aTyp and a.aLen == b.aLen else: discard diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 62c43a6b..4b7dd9a8 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -60,6 +60,25 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = else: raiseAssert "Not supported yet: " & $t +proc parseTypeFields(node: NimNode): seq[GpuTypeField] +proc initGpuGenericInst(t: NimNode): GpuType = + doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind + case t.kind + of nnkBracketExpr: # regular generic instantiation + result = GpuType(kind: 
gtGenericInst, gName: t[0].repr) + for i in 1 ..< t.len: # grab all generic arguments + let typ = nimToGpuType(t[i]) + result.gArgs.add typ + # now parse the object fields + let impl = t.getTypeImpl() # impl for the `gFields` + result.gFields = parseTypeFields(impl) + of nnkObjConstr: + doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr + result = initGpuGenericInst(t[0]) + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr + echo "Got generic inst: ", result + proc unpackGenericInst(t: NimNode): NimNode = let tKind = t.typeKind if tKind == ntyGenericInst: @@ -131,7 +150,6 @@ proc getTypeName(n: NimNode): string = result = n[0].strVal # type is the first node else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr -proc parseTypeFields(node: NimNode): seq[GpuTypeField] proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = ## Maps a Nim type to a type on the GPU ## @@ -197,7 +215,14 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = result = initGpuType(gtVoid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: - result = n.unpackGenericInst().nimToGpuType(allowToFail) + result = initGpuGenericInst(n) + #result = n.unpackGenericInst().nimToGpuType(allowToFail) + of ntyTypeDesc: + # `getType` returns a `BracketExpr` of eg: + # BracketExpr + # Sym "typeDesc" + # Sym "float32" + result = n.getType[1].nimToGpuType(allowToFail) # for a type desc we need to recurse using the type of it else: if allowToFail: result = GpuType(kind: gtVoid) From 3482a45a4328556f763eb1bd4da0d24c056a7cba Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 12:34:17 +0200 Subject: [PATCH 27/87] add `{.builtin.}` pragma intended to specify builtin procs in CUDA/WGSL This is fundamentally doing the same as `{.nimonly.}`, but for this purpose the different name makes the intent clearer. 
--- .../experimental/gpu_compiler.nim | 28 +++++++++++-------- .../math_compiler/experimental/gpu_types.nim | 1 + .../math_compiler/experimental/nim_to_gpu.nim | 19 +++++++++++-- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 0fd28760..08378417 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -23,6 +23,11 @@ template global*() {.pragma.} template device*() {.pragma.} template forceinline*() {.pragma.} +## If attached to a function, type or variable it will refer to a built in +## in the target backend. This is used for all the functions, types and variables +## defined below to indicate that we do not intend to generate code for them. +template builtin*() {.pragma.} + # If attached to a `var` it will be treated as a # `__constant__`! Only useful if you want to define a # constant without initializing it (and then use @@ -54,25 +59,24 @@ type y*: DimWgsl z*: DimWgsl - ## These are dummy elements to make CUDA block / thread index / dim ## access possible in the *typed* `cuda` macro. It cannot be `const`, ## because then the typed code would evaluate the values before we ## can work with it from the typed macro. -let blockIdx* = NvBlockIdx() -let blockDim* = NvBlockDim() -let gridDim* = NvGridDim() -let threadIdx* = NvThreadIdx() +let blockIdx* {.builtin.} = NvBlockIdx() +let blockDim* {.builtin.} = NvBlockDim() +let gridDim* {.builtin.} = NvGridDim() +let threadIdx* {.builtin.} = NvThreadIdx() ## WebGPU specific -let global_id* = WgslGridDim() +let global_id* {.builtin.} = WgslGridDim() ## Similar for procs. They don't need any implementation, as they won't ever be actually called. 
-proc printf*(fmt: string) {.varargs.} = discard -proc memcpy*(dst, src: pointer, size: int) = discard +proc printf*(fmt: string) {.varargs, builtin.} = discard +proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard ## WebGPU select -proc select*[T](f, t: T, cond: bool): T = +proc select*[T](f, t: T, cond: bool): T {.builtin.} = # Implementation to run WebGPU code on CPU if cond: t else: f @@ -90,9 +94,9 @@ template private*(): untyped {.pragma.} ## While you can use `malloc` on device with small sizes, it is usually not ## recommended to do so. -proc malloc*(size: csize_t): pointer = discard -proc free*(p: pointer) = discard -proc syncthreads*() {.cudaName: "__syncthreads".} = discard +proc malloc*(size: csize_t): pointer {.builtin.} = discard +proc free*(p: pointer) {.builtin.} = discard +proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = ## WARNING: The following are *not* supported: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index fe893028..3cd47a4c 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -59,6 +59,7 @@ type typ*: GpuType GpuType* = ref object + builtin*: bool ## Whether the type refers to a builtin type or not case kind*: GpuTypeKind of gtPtr: to*: GpuType # `ptr T` points to `to` diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 4b7dd9a8..0d8ab8ca 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -274,18 +274,31 @@ proc requiresMemcpy(n: NimNode): bool = ## At the moment we only emit a `memcpy` statement for array types result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy +proc isBuiltIn(n: NimNode): bool = + ## Checks if the 
given proc is a `{.builtin.}` (or if it is a Nim "built in" + ## proc that uses `importc`, as we cannot emit those; they _need_ to have a + ## WGSL / CUDA equivalent built in) + doAssert n.kind in [nnkProcDef, nnkFuncDef], "Argument is not a proc: " & $n.treerepr + for pragma in n.pragma: + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma + if pragma.strVal in ["builtin", "importc"]: + return true + proc collectProcAttributes(n: NimNode): set[GpuAttribute] = doAssert n.kind == nnkPragma for pragma in n: - doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall], "Unexpected node kind: " & $pragma.treerepr - let pragma = if pragma.kind == nnkCall: pragma[0] else: pragma + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal of "forceinline": result.incl attForceInline - of "nimonly": + of "nimonly", "builtin": # used to fully ignore functions! return + of "importc": # encountered if we analyze a proc from outside `cuda` scope + return # this _should_ be a builtin function that has a counterpart in Nim, e.g. 
`math.ceil` else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From cd20189ea82572cb7c6782f27840953280844163 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 18 Aug 2025 19:20:39 +0200 Subject: [PATCH 28/87] fix context version for `cuModuleGetGlobal` --- constantine/platforms/abis/nvidia_abi.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim index 8e22f6ef..cfc8d2c9 100644 --- a/constantine/platforms/abis/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -855,11 +855,11 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi {.pop.} proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult {.v2.} +proc cuCtxDestroy*(ctx: CUcontext): CUresult {.v2.} proc cuCtxSynchronize*(ctx: CUcontext): CUresult {.v2.} {.push noconv, importc, dynlib: libCuda.} -proc cuCtxDestroy*(ctx: CUcontext): CUresult proc cuCtxSynchronize*(): CUresult proc cuCtxGetCurrent*(ctx: var CUcontext): CUresult proc cuCtxSetCurrent*(ctx: CUcontext): CUresult @@ -875,7 +875,6 @@ proc cuModuleUnload*(module: CUmodule): CUresult proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.} proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult -proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult proc cuLaunchKernel*( kernel: CUfunction, @@ -889,6 +888,8 @@ proc cuLaunchKernel*( {.pop.} # {.push noconv, importc, dynlib: "libcuda.so"..} +proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult {.v2.} + proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult {.v2.} proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: 
Flag[CUmemAttach_flags]): CUresult {.v1.} proc cuMemFree*(devptr: CUdeviceptr): CUresult {.v2.} From dff79875eeb4e8456ac43aa065961286bc248cd0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:19:19 +0200 Subject: [PATCH 29/87] allow to compile with `-d:debugCuda` to compile in debug mode --- .../experimental/runtime_compile.nim | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim index dcfde9c1..95928e2b 100644 --- a/constantine/math_compiler/experimental/runtime_compile.nim +++ b/constantine/math_compiler/experimental/runtime_compile.nim @@ -86,15 +86,17 @@ proc log*(nvrtc: var NVRTC) = proc compile*(nvrtc: var NVRTC) = # Compile the program with fmad disabled. # Note: Can specify GPU target architecture explicitly with '-arch' flag. - const - Options = [ - cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is - # "--fmad=false", # and whatever other options for example - ] - - NumberOfOptions = cint Options.len - let compileResult = nvrtcCompileProgram(nvrtc.prog, NumberOfOptions, - cast[cstringArray](addr Options[0])) + var options = @[ + cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is + # "--fmad=false", # and whatever other options for example + ] + when defined(debugCuda): + options.add cstring "--device-debug" # Equivalent to -g + options.add cstring "--generate-line-info" # Equivalent to -lineinfo + + let numberOfOptions = cint options.len + let compileResult = nvrtcCompileProgram(nvrtc.prog, numberOfOptions, + cast[cstringArray](addr options[0])) nvrtc.log() ## XXX: only in `DebugCuda`? 
From b834dc9a3cf11b0665c270f1fca0baa0034a2684 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:44:54 +0200 Subject: [PATCH 30/87] ignore `varargs` pragma in procs --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 0d8ab8ca..812f7591 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -299,6 +299,8 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = return of "importc": # encountered if we analyze a proc from outside `cuda` scope return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil` + of "varargs": # attached to some builtins, e.g. `printf` on CUDA backend + continue else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From 9fb4900005af4ea530ed73cad3c699f6304c8497 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:45:25 +0200 Subject: [PATCH 31/87] make sure statically sized arrays are copied As they are passed by pointer in C/C++/CUDA, e.g. a `BigInt state[2]` decays to `BigInt *state` in a function argument behind the scenes. --- constantine/math_compiler/experimental/cuda_execute_dsl.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim index 8f1aa879..f75c899f 100644 --- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -41,13 +41,15 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = case n.typeKind of ntyBool, ntyChar, ntyInt .. 
ntyUint64: # range includes all floats result = false - of ntyObject, ntyArray: + of ntyObject: if passStructByPointer: result = false # regular objects can just be copied! else: result = true # struct passing by pointer forbidden ## NOTE: strictly speaking this is not the case of course! If the object ## contains refs, it won't hold! + of ntyArray: # statically sized arrays are passed by pointer in CUDA / C++ / C! + result = true of ntyGenericInst: if passStructByPointer: let impl = n.getTypeImpl() From cbbc41554a9ac668956d5d58fd6afd0661c336f9 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 20 Aug 2025 11:46:57 +0200 Subject: [PATCH 32/87] make sure to catch `CUdeviceptr` arguments After our recent Nvidia API wrapper changes, it is now an alias and not a distinct type anymore. --- constantine/math_compiler/experimental/cuda_execute_dsl.nim | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim index f75c899f..ae5732ad 100644 --- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -62,6 +62,12 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = result = false else: result = true + of ntyAlias: + let impl = n.getTypeInst() + if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr": + result = false + else: + result = true else: result = true From d44da46940939a118aedfc102f1e876adc932e7a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:27:23 +0200 Subject: [PATCH 33/87] change CUDA codegen to similar style as WGSL This allows us to make use of the Nim generic instantiation and regular procs we 'pick up' as well as types. 
In principle it also allow us to do dead code elimination by only generating what's actually called in the global functions (or single kernel, if one uses `toGpuAst` first and then manually calls `codegen`). --- .../experimental/backends/backends.nim | 3 +- .../experimental/backends/common_utils.nim | 31 ++++++- .../experimental/backends/cuda.nim | 80 +++++++++++++++++-- .../experimental/backends/wgsl.nim | 28 ------- .../math_compiler/experimental/gpu_types.nim | 1 + 5 files changed, 107 insertions(+), 36 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index f2b71152..c7b45740 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -34,7 +34,8 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: - result = ctx.genCuda(ast) + ctx.preprocess(ast, kernel) + result = cuda.codegen(ctx) of bkWGSL: ctx.storagePass(ast, kernel) result = wgsl.codegen(ctx) diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim index 1def1b1f..01ee5d33 100644 --- a/constantine/math_compiler/experimental/backends/common_utils.nim +++ b/constantine/math_compiler/experimental/backends/common_utils.nim @@ -6,9 +6,36 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
+import std / tables import ../gpu_types -# import ./backends proc address*(a: string): string = "&" & a - proc size*(a: string): string = "sizeof(" & a & ")" + +proc isGlobal*(fn: GpuAst): bool = + doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind + result = attGlobal in fn.pAttributes + +proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = + ## Farms the top level of the code for functions, variable and type definition. + ## All functions are added to the `allFnTab`, while only global ones (or even only + ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining + ## logic. + ## Variables and types are collected in `varBlock` and `typBlock`. + case ast.kind + of gpuProc: + ctx.allFnTab[ast.pName] = ast + if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + elif kernel.len == 0 and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + of gpuBlock: + # could be a type definition or global variable + for ch in ast: + ctx.farmTopLevel(ch, kernel, varBlock, typBlock) + of gpuVar, gpuConstexpr: + varBlock.statements.add ast + of gpuTypeDef, gpuAlias: + typBlock.statements.add ast + else: + discard diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index ae8307f6..04d77449 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import std / [macros, strformat, strutils, sugar, sequtils] +import std / [macros, strformat, strutils, sugar, sequtils, tables, algorithm] import ../gpu_types import ./common_utils @@ -119,11 +119,62 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc genMemcpy(lhs, rhs, size: string): string = result = &"memcpy({lhs}, {rhs}, {size})" +proc scanFunctions(ctx: var GpuContext, n: GpuAst) = + ## Iterates over the given function and checks for all `gpuCall` nodes. Any function + ## called in the scope is added to `fnTab`. This is a form of dead code elimination. + case n.kind + of gpuCall: + let fn = n.cName + if fn in ctx.allFnTab: + # Check if any of the parameters are pointers (otherwise non generic) + if fn notin ctx.fnTab: # function not known, add to `fnTab` (i.e. avoid code elimination) + let fnCalled = ctx.allFnTab[fn] + ctx.fnTab[fn] = fnCalled + # still "scan for functions", i.e. fill `fnTab` from inner calls + for ch in fnCalled: + ctx.scanFunctions(ch) + # else we don't do anything for this function + # Harvest functions from arguments to this call! + for arg in n.cArgs: + ctx.scanFunctions(arg) + else: + for ch in n: + ctx.scanFunctions(ch) proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = + + # 1. Add all data from `genericInsts` and `types` tables + # In CUDA the types have to be before any possible global variables using + # them! + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # And all the known types + for k, typ in pairs(ctx.types): + ctx.globalBlocks.add typ + + # 2. 
Fill table with all *global* functions or *only* the specific `kernel` + # if any given + var varBlock = GpuAst(kind: gpuBlock) + var typBlock = GpuAst(kind: gpuBlock) + ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.globalBlocks.add varBlock + ctx.globalBlocks.add typBlock + ## XXX: `typBlock` should now always be empty, as we pass all + ## found types into `ctx.types` + + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect + # the functions called and record them in `fnTab`. + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: # everything in `fnTab` at this point is a global function + # Get the original arguments (before lifting them) of this function. Needed in scan + # to check if `gpuCall` argument is a parameter. + let fnOrig = ctx.allFnTab[fnIdent] + ctx.scanFunctions(fn) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ## The actual CUDA code generator. let indentStr = " ".repeat(indent) @@ -143,10 +194,13 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # extern "C" is needed to avoid name mangling result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & - fnSig & "{\n" - - result &= ctx.genCuda(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + fnSig + if ast.forwardDeclare: + result.add ";" + else: + result.add "{\n" + result &= ctx.genCuda(ast.pBody, indent + 1) + result &= "\n" & indentStr & "}" of gpuBlock: result = "" @@ -296,3 +350,19 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = echo "Unhandled node kind in genCuda: ", ast.kind raiseAssert "Unhandled node kind in genCuda: " & ast.repr result = "" + +proc codegen*(ctx: var GpuContext): string = + ## Generate the actual code for all pieces of the puzzle + # 1. generate code for the global blocks (types, global vars etc) + for blk in ctx.globalBlocks: + result.add ctx.genCuda(blk) & ";\n\n" + + # 2. 
generate all regular functions + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: + let fnC = fn.clone() + fnC.forwardDeclare = true + result.add ctx.genCuda(fnC) & "\n" + + for fnIdent, fn in ctx.fnTab: + result.add ctx.genCuda(fn) & "\n\n" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 9180b055..20dc01e0 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -147,34 +147,6 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = if typ.len > 0: result.add &" -> {typ}" -proc isGlobal(fn: GpuAst): bool = - doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind - result = attGlobal in fn.pAttributes - -proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = - ## Farms the top level of the code for functions, variable and type definition. - ## All functions are added to the `allFnTab`, while only global ones (or even only - ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining - ## logic. - ## Variables and types are collected in `varBlock` and `typBlock`. - case ast.kind - of gpuProc: - ctx.allFnTab[ast.pName] = ast - if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - elif kernel.len == 0 and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - of gpuBlock: - # could be a type definition or global variable - for ch in ast: - ctx.farmTopLevel(ch, kernel, varBlock, typBlock) - of gpuVar, gpuConstexpr: - varBlock.statements.add ast - of gpuTypeDef, gpuAlias: - typBlock.statements.add ast - else: - discard - proc patchType(t: GpuType): GpuType = ## Applies patches needed for WGSL support. E.g. `bool` cannot be a storage variable. 
result = t diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 3cd47a4c..9b4d9293 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -104,6 +104,7 @@ type pParams*: seq[GpuParam] pBody*: GpuAst pAttributes*: set[GpuAttribute] # order not important, hence set + forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration of gpuCall: cName*: GpuAst ## Will be a `GpuIdent` cArgs*: seq[GpuAst] From 5e383fd2858cb18352b4f238d56b08f30d8cda45 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:30:15 +0200 Subject: [PATCH 34/87] improve error message for `Dot` node if not ident/deref in WGSL --- constantine/math_compiler/experimental/backends/wgsl.nim | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 20dc01e0..19582a00 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -513,7 +513,8 @@ proc rewriteCompoundAssignment(n: GpuAst): GpuAst = proc getStructType(n: GpuAst): GpuType = ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct. 
- doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & $n.kind + doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & + $n.kind & " for node: " & $n var p = n if p.kind == gpuDeref: p = n.dOf From 7eaa65f45c4e16b423cf0ac357adc983ba3aec7c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 09:30:52 +0200 Subject: [PATCH 35/87] always also inject `num_workgroups` argument into WGSL globals When working with workgroups dispatched in an e.g. 2D grid, `passEncoder.dispatchWorkgroups(N, M)` one needs the number of workgroups to compute the unique thread ID. --- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- constantine/math_compiler/experimental/gpu_compiler.nim | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 19582a00..2f548ad5 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -813,7 +813,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = if $attGlobal in attrs: doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!" ## XXX: make this more flexible. 
In theory can be any name - fnArgs = "@builtin(global_invocation_id) global_id: vec3" + fnArgs = "@builtin(global_invocation_id) global_id: vec3, @builtin(num_workgroups) num_workgroups: vec3" let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) result = indentStr & "fn " & fnSig & " {\n" diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 08378417..51207212 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -70,6 +70,7 @@ let threadIdx* {.builtin.} = NvThreadIdx() ## WebGPU specific let global_id* {.builtin.} = WgslGridDim() +let num_workgroups* {.builtin.} = WgslGridDim() ## Similar for procs. They don't need any implementation, as they won't ever be actually called. proc printf*(fmt: string) {.varargs, builtin.} = discard From 151e0250369c6d9078fc51a03a538caaab76fc77 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 10:38:59 +0200 Subject: [PATCH 36/87] rename WGSL `storagePass` to `preprocess` --- constantine/math_compiler/experimental/backends/backends.nim | 4 ++-- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index c7b45740..76d63a72 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -34,8 +34,8 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: - ctx.preprocess(ast, kernel) + cuda.preprocess(ctx, ast, kernel) result = cuda.codegen(ctx) of bkWGSL: - ctx.storagePass(ast, kernel) + wgsl.preprocess(ctx, ast, kernel) result = wgsl.codegen(ctx) diff --git 
a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2f548ad5..9043e3a0 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -720,7 +720,7 @@ proc removeStructPointerFields(blk: var GpuAst) = else: inc i -proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. ## This is useful if your GPU code contains multiple kernels with differing ## parameters to avoid having to fill dummy buffers for all the unused parameters From 987482956b09ebaa55d75e37b9770e45b96a8251 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 21 Aug 2025 10:39:16 +0200 Subject: [PATCH 37/87] remove `gpuTypeToString`, `genFunctionType` from `backends` In the end we never used those variants anyway, as the code is fully separate in their respective submodules now. 
--- .../experimental/backends/backends.nim | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index 76d63a72..95fbda87 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -14,23 +14,6 @@ when defined(cuda): else: const Backend* = bkWGSL -proc gpuTypeToString*(t: GpuTypeKind): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t) - of bkWGSL: wgsl.gpuTypeToString(t) - -proc gpuTypeToString*(t: GpuType, ident = newGpuIdent(), allowArrayToPtr = false, - allowEmptyIdent = false, - ): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t, ident.ident(), allowArrayToPtr, allowEmptyIdent) - of bkWGSL: wgsl.gpuTypeToString(t, ident, allowArrayToPtr, allowEmptyIdent) - -proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = - case Backend - of bkCuda: cuda.genFunctionType(typ, fn, fnArgs) - of bkWGSL: wgsl.genFunctionType(typ, fn, fnArgs) - proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: From c617379d8cb69ccbddce21c4aff2bfa4af95c06c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 10:57:54 +0200 Subject: [PATCH 38/87] [CUDA] fix indentation for variable declarations --- constantine/math_compiler/experimental/backends/cuda.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 04d77449..c117471f 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -216,7 +216,9 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" of gpuVar: - result = indentStr & 
ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName.ident()) + let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & " " + else: "" + result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident()) # If there is an initialization, the type might require a memcpy if ast.vInit.kind != gpuVoid and not ast.vRequiresMemcpy: result &= " = " & ctx.genCuda(ast.vInit) From 25c5b13f962941eda9d9b8f7821a0abb795f7779 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:10:34 +0200 Subject: [PATCH 39/87] improve type deduction for types that look generic I.e. an array type with a constant in place for the array size (i.e. *not* a `static int` generic argument) will look like a generic in the sense that depending on how you look at the type with `getType*`, you will end up seeing `Ident "Foo"` for the constant. Therefore, we allow array length determination to fail and return `gtInvalid`. If we see that happens for a variable or parameter, we try to determine the type from the symbol of the parameter or variable. For some reason that tends to have the fully instantiated type. However, for *return types* this is not possible, as there is no easy symbol to look at (unless one were to go into the proc body and look at symbol that is being returned). Therefore, if we encounter this type of array as a return type (NOTE: most backends won't anyway allow to return arrays, but that's beside the point), we treat the function as a generic and look up its type when we encounter it in a `gpuCall`. Then, the type will be fully instantiated again. 
--- .../math_compiler/experimental/gpu_types.nim | 3 + .../math_compiler/experimental/nim_to_gpu.nim | 111 ++++++++++++------ 2 files changed, 81 insertions(+), 33 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 9b4d9293..bd12a1ee 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -53,6 +53,9 @@ type gtUA, # UncheckedArray (UA) mapped to runtime sized arrays gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!) gtVoidPtr # Opaque void pointer + gtInvalid # Can be returned to indicate a call to `nimToGpuType` failed to determine a type + ## XXX: make this the default value and replace all `gtVoid` placeholders by it + GpuTypeField* = object name*: string diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 812f7591..040f5907 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has import ./gpu_types import ./backends/backends -proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType proc initGpuType(kind: GpuTypeKind): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to @@ -20,11 +20,17 @@ proc initGpuType(kind: GpuTypeKind): GpuType = proc initGpuPtrType(to: GpuType, implicitPtr: bool): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to - result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) proc initGpuUAType(to: 
GpuType): GpuType = ## Initializes a GPU type for an unchecked array (ptr wraps this) - result = GpuType(kind: gtUA, uaTo: to) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtUA, uaTo: to) proc initGpuVoidPtr(): GpuType = result = GpuType(kind: gtVoidPtr) @@ -94,45 +100,61 @@ proc unpackGenericInst(t: NimNode): NimNode = proc toGpuTypeKind(t: NimNode): GpuTypeKind = result = t.unpackGenericInst().typeKind.toGpuTypeKind() -proc getInnerPointerType(n: NimNode, allowToFail: bool = false): GpuType = +proc getInnerPointerType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind if n.typeKind in {ntyPointer, ntyUncheckedArray}: let typ = n.getTypeInst() doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr doAssert typ[0].kind in {nnkIdent, nnkSym} doAssert typ[0].strVal in ["ptr", "UncheckedArray"] - result = nimToGpuType(typ[1], allowToFail) + result = nimToGpuType(typ[1], allowToFail, allowArrayIdent) elif n.kind == nnkPtrTy: - result = nimToGpuType(n[0], allowToFail) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) elif n.kind == nnkAddr: let typ = n.getTypeInst() - result = getInnerPointerType(typ, allowToFail) + result = getInnerPointerType(typ, allowToFail, allowArrayIdent) elif n.kind == nnkVarTy: # VarTy # Sym "BigInt" - result = nimToGpuType(n[0], allowToFail) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) elif n.kind == nnkSym: # symbol of e.g. 
`ntyVar` - result = nimToGpuType(n.getTypeInst(), allowToFail) + result = nimToGpuType(n.getTypeInst(), allowToFail, allowArrayIdent) else: raiseAssert "Found what: " & $n.treerepr -proc determineArrayLength(n: NimNode): int = +proc determineArrayLength(n: NimNode, allowArrayIdent: bool): int = + ## If `allowArrayIdent` is true, we do not emit the error message when + ## encountering an ident. This is the case for procs taking arrays + ## with a static array where the constant comes from outside the + ## macro. In that case we return `-1` indicating + ## `proc mdsRowShfNaive(r: int, v: array[SPONGE_WIDTH, BigInt]): BigInt {.device.} =` case n[1].kind of nnkSym: # likely a constant, try to get its value result = n[1].getImpl.intVal of nnkIdent: - let msg = """Found array with length given by identifier: $#! + if not allowArrayIdent: + let msg = """Found array with length given by identifier: $#! You might want to create a typed template taking a typed parameter for this constant to force the Nim compiler to bind the symbol. In theory though this error should not appear anymore though, as we don't try to parse generic functions. """ % n[1].strVal - raiseAssert msg + raiseAssert msg + else: + result = -1 # return -1 to indicate caller should look at symbol else: case n[1].kind of nnkIntLit: result = n[1].intVal else: + # E.g. + # BracketExpr + # Sym "array" + # Infix + # Ident ".." 
+ # IntLit 0 + # IntLit 11 + # Sym "BigInt" #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr @@ -150,7 +172,7 @@ proc getTypeName(n: NimNode): string = result = n[0].strVal # type is the first node else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr -proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = ## Maps a Nim type to a type on the GPU ## ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases @@ -159,30 +181,30 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = case n.kind of nnkIdentDefs: # extract type for let / var based on explicit or implicit type if n[n.len - 2].kind != nnkEmpty: # explicit type - result = nimToGpuType(n[n.len - 2], allowToFail) + result = nimToGpuType(n[n.len - 2], allowToFail, allowArrayIdent) else: # take from last element - result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail) + result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail, allowArrayIdent) of nnkConstDef: if n[1].kind != nnkEmpty: # has an explicit type - result = nimToGpuType(n[1], allowToFail) + result = nimToGpuType(n[1], allowToFail, allowArrayIdent) else: - result = nimToGpuType(n[2], allowToFail) # derive from the RHS literal + result = nimToGpuType(n[2], allowToFail, allowArrayIdent) # derive from the RHS literal else: if n.kind == nnkEmpty: return initGpuType(gtVoid) case n.typeKind of ntyBool, ntyInt .. 
ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: - result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = false) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false) of ntyVar: - result = initGpuPtrType(getInnerPointerType(n, allowToFail), implicitPtr = true) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = true) of ntyPointer: result = initGpuVoidPtr() of ntyUncheckedArray: ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - result = initGpuUAType(getInnerPointerType(n, allowToFail)) of ntyObject, ntyAlias: + result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent)) # for aliases, treat them identical to regular object types, but # `getTypeName` returns the alias! let impl = n.getTypeImpl @@ -192,7 +214,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = of ntyArray: # For a generic, static array type, e.g.: if n.kind == nnkSym: - return nimToGpuType(getTypeImpl(n), allowToFail) + return nimToGpuType(getTypeImpl(n), allowToFail, allowArrayIdent) if n.len == 3: # BracketExpr # Sym "array" @@ -200,8 +222,16 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # Sym "uint32" doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr doAssert n[0].strVal == "array" - let len = determineArrayLength(n) - result = initGpuArrayType(n[2], len) + let len = determineArrayLength(n, allowArrayIdent) + if len < 0: + # indicates we found an array with an ident, e.g. 
+ # BracketExpr + # Sym "array" + # Ident "SPONGE_WIDTH" + # Sym "BigInt" + return GpuType(kind: gtInvalid) + else: + result = initGpuArrayType(n[2], len) else: # just an array literal # Bracket @@ -212,7 +242,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # echo n.getTypeImpl.treerepr # error("o") of ntyGenericInvocation: - result = initGpuType(gtVoid) + result = initGpuType(gtInvalid) error("Generics are not supported in the CUDA DSL so far.") of ntyGenericInst: result = initGpuGenericInst(n) @@ -222,7 +252,7 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false): GpuType = # BracketExpr # Sym "typeDesc" # Sym "float32" - result = n.getType[1].nimToGpuType(allowToFail) # for a type desc we need to recurse using the type of it + result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it else: if allowToFail: result = GpuType(kind: gtVoid) @@ -407,6 +437,15 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = name.iName = fn.pName.iSym ## update the name of the called function ctx.genericInsts[fn.pName] = fn +proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = + ## Returns the type from a given Nim node `t` representing a type. + ## If that fails due to an identifier in the type, we instead try + ## to look up the type from the associated symbol, `n`. + result = nimToGpuType(t, allowArrayIdent = true) + if result.kind == gtInvalid: + # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means + # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead + result = n.getTypeInst.nimToGpuType() proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. 
Maybe we will won't @@ -456,7 +495,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pName = name result.pName.symbolKind = gsProc ## This is a procedure identifier doAssert node[3].kind == nnkFormalParams - result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type + let retType = node[3][0] # arg 0 is return type + if retType.kind == nnkEmpty: + result.pRetType = GpuType(kind: gtVoid) # actual void return + else: + # attempt to get type. If fails, we need to wait for a caller to this function to get types + # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside + # the macro. We then rely on our generics logic to later look this up when called + result.pRetType = nimToGpuType(retType, allowArrayIdent = true) + if result.pRetType.kind == gtVoid: # stop parsing this function + ctx.generics.incl name.iName # need to use raw name, *not* symbol + return GpuAst(kind: gpuVoid) + # Process pragmas if node.pragma.kind != nnkEmpty: doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
@@ -464,10 +514,6 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.pAttributes.len == 0: # means `nimonly` was applied return GpuAst(kind: gpuVoid) # Process parameters - echo "Node: ", node.treerepr - if node[2].kind == nnkGenericParams: - echo node[2][0].getImpl().treerepr - echo node[2][0].treerepr for i in 1 ..< node[3].len: let param = node[3][i] let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: @@ -479,8 +525,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # PtrTy # Ident "float32" # `param.len - 2` # Empty # `param.len - 1` - let paramType = nimToGpuType(param[typIdx]) - #echo "Argument: ", param.treerepr, " has tpye: ", paramType + let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) for i in 0 ..< numParams: var p = ctx.toGpuAst(param[i]) let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam @@ -522,7 +567,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert declaration[0][1].kind == nnkPragma varNode.vAttributes = collectAttributes(declaration[0][1]) else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr - varNode.vType = nimToGpuType(declaration) + varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0]) varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later # This is a *local* variable (i.e. 
`function` address space on WGSL) unless it is # annotated with `{.shared.}` (-> `workspace` in WGSL) From ed5efd217f40e1ff4750fcdc95fba5dd0f16aab5 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:16:47 +0200 Subject: [PATCH 40/87] clone `forwardDeclare` in GpuAst --- constantine/math_compiler/experimental/gpu_types.nim | 1 + 1 file changed, 1 insertion(+) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index bd12a1ee..ccc839b7 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -339,6 +339,7 @@ proc clone*(ast: GpuAst): GpuAst = result.pParams.add(clonedParam) result.pBody = ast.pBody.clone() result.pAttributes = ast.pAttributes + result.forwardDeclare = result.forwardDeclare of gpuCall: result = GpuAst(kind: gpuCall) result.cName = ast.cName.clone() From d1cefd12a9ee9b203f1a1974d322c87c973c9c45 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:18:32 +0200 Subject: [PATCH 41/87] support tuples by mapping to object types Tuples are essentially just objects anyway. So we make the transformation explicit and thus support tuples (including anonymous). Note: The Nim compiler already transforms tuple unpacking into temporary variable + statements to assign the fields. --- .../math_compiler/experimental/nim_to_gpu.nim | 163 ++++++++++++++++-- 1 file changed, 151 insertions(+), 12 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 040f5907..e1744756 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -161,6 +161,67 @@ functions. 
doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr result = n[1][2].intVal + 1 +proc constructTupleTypeName(n: NimNode): string = + ## XXX: overthink if this should really be here and not somewhere else + ## + ## Given a tuple, generate a name from the field names and types, e.g. + ## `Tuple_lo_BaseType_hi_BaseType` + ## + ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying + ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types. + result = "Tuple_" + doAssert n.kind in [nnkTupleTy, nnkTupleConstr] + for i, ch in n: + case ch.kind + of nnkIdentDefs: + let typName = ch[ch.len - 2].getTypeImpl.repr # second to last is type name of field(s) + for j in 0 ..< ch.len - 2: + # Example: + # IdentDefs + # Ident "hi" + # Ident "lo" `..< ch.len - 2 ` + # Sym "BaseType" `..< ch.len - 1` + # Empty `..< ch.len` + result.add ch[j].strVal & "_" & typName + if j < ch.len - 3: + result.add "_" + if i < n.len - 1: + result.add "_" + of nnkExprColonExpr: + # ExprColonExpr + # Sym "hi" + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)` + # give them a slightly different name + let typName = ch[0].getTypeImpl.repr ## XXX + doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr + result.add ch[0].strVal & "_" & typName + if i < n.len - 1: + result.add "_" + of nnkSym: + # TupleConstr + # Sym "BaseType" <-- e.g. here + # Sym "BaseType" + let typName = ch.getTypeImpl.repr + result.add "Field" & $i & "_" & typName + if i < n.len - 1: + result.add "_" + else: + # TupleConstr e.g. 
a tuple constr like this + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # Infix + # Sym "and" + # Sym "n" + # UInt32Lit 65535 + # -> Try again with type impl + return constructTupleTypeName(getTypeImpl(n)) + proc getTypeName(n: NimNode): string = ## Returns the name of the type case n.kind @@ -170,6 +231,8 @@ proc getTypeName(n: NimNode): string = result = n.getTypeInst.strVal else: result = n[0].strVal # type is the first node + of nnkTupleTy, nnkTupleConstr: + result = constructTupleTypeName(n) else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = @@ -203,11 +266,12 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = of ntyUncheckedArray: ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - of ntyObject, ntyAlias: result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent)) + of ntyObject, ntyAlias, ntyTuple: # for aliases, treat them identical to regular object types, but # `getTypeName` returns the alias! 
- let impl = n.getTypeImpl + let impl = if n.kind == nnkTupleConstr: n # might actually _lose_ information if used getTypeImpl + else: n.getTypeImpl let flds = impl.parseTypeFields() let typName = getTypeName(n) # might be an object construction result = initGpuObjectType(typName, flds) @@ -279,12 +343,34 @@ proc assignPrefixOp(op: string): string = else: result = op proc parseTypeFields(node: NimNode): seq[GpuTypeField] = - doAssert node.kind == nnkObjectTy - doAssert node[2].kind == nnkRecList - for ch in node[2]: - doAssert ch.kind == nnkIdentDefs and ch.len == 3 - result.add GpuTypeField(name: ch[0].strVal, - typ: nimToGpuType(ch[1])) + case node.kind + of nnkObjectTy: + doAssert node[2].kind == nnkRecList + for ch in node[2]: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleTy: + for ch in node: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleConstr: + # TupleConstr + # Sym "BaseType" + # Sym "BaseType" + for i, ch in node: + case ch.kind + of nnkSym: + result.add GpuTypeField(name: "Field" & $i, + typ: nimToGpuType(ch)) + of nnkExprColonExpr: + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + else: + return parseTypeFields(node.getTypeImpl) # will likely fall back to constr with `nnkSym` + else: + raiseAssert "Unsupported type to parse fields from: " & $node.kind template findIdx(col, el): untyped = var res = -1 @@ -696,9 +782,21 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.dField = ctx.toGpuAst(node[1]) of nnkBracketExpr: - result = GpuAst(kind: gpuIndex) - result.iArr = ctx.toGpuAst(node[0]) - result.iIndex = ctx.toGpuAst(node[1]) + case node[0].typeKind + of ntyTuple: + # need to replace `[idx]` by field access + let typ = nimToGpuType(node[0].getTypeImpl) + #doAssert typ in ctx.types + doAssert node[1].kind == nnkIntLit + let idx 
= node[1].intVal + let field = typ.oFields[idx].name + result = GpuAst(kind: gpuDot, + dParent: ctx.toGpuAst(node[0]), + dField: ctx.toGpuAst(ident(field))) + else: + result = GpuAst(kind: gpuIndex) + result.iArr = ctx.toGpuAst(node[0]) + result.iIndex = ctx.toGpuAst(node[1]) of nnkIdent, nnkOpenSymChoice: result = newGpuIdent() @@ -823,7 +921,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type - let flds = node[0].getTypeImpl.parseTypeFields() # sym + let flds = typ.oFields # find all fields that have been defined by the user var ocFields: seq[GpuFieldInit] for i in 1 ..< node.len: # all fields to be init'd @@ -844,6 +942,47 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.ocFields.add GpuFieldInit(name: flds[i].name, value: dfl, typ: flds[i].typ) + of nnkTupleConstr: + let typ = nimToGpuType(node) + if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type + # store the type instantiation + let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: typDef.tFields = typ.oFields + of gtGenericInst: typDef.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
+ ctx.types[typ] = typDef + + result = GpuAst(kind: gpuObjConstr, ocType: typ) + # get all fields of the type + let flds = typ.oFields + # find all fields that have been defined by the user + var ocFields: seq[GpuFieldInit] + for i in 0 ..< node.len: # all fields to be init'd + case node[i].kind + of nnkExprColonExpr: + ocFields.add GpuFieldInit(name: node[i][0].strVal, + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + else: + ocFields.add GpuFieldInit(name: "Field" & $i, + value: ctx.toGpuAst(node[i]), + typ: GpuType(kind: gtVoid)) + + # now add fields in order of the type declaration + for i in 0 ..< flds.len: + let idx = findIdx(ocFields, flds[i].name) + if idx >= 0: + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f + else: + let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) + result.ocFields.add GpuFieldInit(name: flds[i].name, + value: dfl, + typ: flds[i].typ) + of nnkAsmStmt: doAssert node.len == 2 From 58d4608798355b81ae989b61e10fc77aaed743de Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:21:46 +0200 Subject: [PATCH 42/87] support Nim's implicit `result` variable Finally supports the implicit `result`. :) --- .../math_compiler/experimental/nim_to_gpu.nim | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index e1744756..f39e77dd 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -532,6 +532,51 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means # `allowArrayIdent` triggered due to an ident in the type. 
Use symbol for type instead result = n.getTypeInst.nimToGpuType() +proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = + ## Will insert a `gpuVar` for the implicit `result` variable, unless there + ## is a user defined `var result` that shadows it at the top level of the proc + ## body. + ## + ## Finally adds a `return result` statement if + ## - we add a `result` variable + ## - there is no `return` statement as the _last_ statement in the proc + if retType.kind == gtVoid: return # nothing to do if the proc returns nothing + + proc hasCustomResult(n: GpuAst): bool = + doAssert n.kind == gpuBlock + for ch in n: # iterate all top level statements in the proc body + case ch.kind + of gpuVar: + if ch.vName.ident() == "result": + ## XXX: could maybe consider to emit a CT warning that `result` shadows the implicit + ## result variable + echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`." + return true + else: + discard + + proc lastIsReturn(n: GpuAst): bool = + doAssert n.kind == gpuBlock + if n.statements[^1].kind == gpuReturn: return true + + if not hasCustomResult(ast): + # insert `gpuVar` as the *first* statement + let resId = GpuAst(kind: gpuIdent, iName: "result", + iSym: "result", + iTyp: retType, + symbolKind: gsLocal) + let res = GpuAst(kind: gpuVar, vName: resId, + vType: retType, + vInit: GpuAst(kind: gpuVoid), # no initialization + vRequiresMemcpy: false, + vMutable: true) + ast.statements.insert(res, 0) + # NOTE: The compiler rewrites expressions at the end of a `proc` into + # an assignment to `block: result = ` for us. + if not lastIsReturn(ast): + # insert `return result` + ast.statements.add GpuAst(kind: gpuReturn, rValue: resId) + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. 
Maybe we will won't @@ -623,6 +668,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` + result.pBody.maybeInsertResult(result.pRetType, result.pName.ident()) # Add to table of known functions if result.pName notin ctx.allFnTab: @@ -764,6 +810,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name result.bLeft = ctx.toGpuAst(node[1]) result.bRight = ctx.toGpuAst(node[2]) + # We patch the types of int / float literals. WGSL does not automatically convert literals # to the target type. Determining the type here _can_ fail. In that case the # `lType` field will just be `gtVoid`, like the default. From 9d5b02d03971b221174669c34fe7734785c09293 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 11:24:28 +0200 Subject: [PATCH 43/87] first steps towards supporting expressions This by itself does not do much. But in order to support (statement list / block) expressions, some more information about if something is an expression / if a call is an expression is going to be needed. 
--- .../math_compiler/experimental/gpu_types.nim | 8 ++++ .../math_compiler/experimental/nim_to_gpu.nim | 44 +++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index ccc839b7..1e33f3ee 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -109,6 +109,7 @@ type pAttributes*: set[GpuAttribute] # order not important, hence set forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration of gpuCall: + cIsExpr*: bool ## If the call returns a value cName*: GpuAst ## Will be a `GpuIdent` cArgs*: seq[GpuAst] of gpuTemplateCall: @@ -155,6 +156,7 @@ type aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array aLitType*: GpuType # type of first element of gpuBlock: + isExpr*: bool ## Whether this block represents an expression, i.e. it returns something blockLabel*: string # optional name of the block. If any given, will open a `{ }` scope in CUDA statements*: seq[GpuAst] ## XXX: we could add a `locals` argument here, which would refer to all local variables @@ -274,6 +276,10 @@ type ## precise generic instantiations that are called. genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Storse all builtin / nimonly / importc / ... functions we encounter so that we can + ## check if they return a value when we encounter them in a `gpuCall` + builtins*: OrderedTable[GpuAst, GpuAst] + ## Table of all known types. Filled during Nim -> GpuAst. Includes generic ## instantiations, but also all other types. ## Key: the raw type. 
Value: a full `gpuTypeDef` @@ -409,6 +415,7 @@ proc clone*(ast: GpuAst): GpuAst = result.pVal = ast.pVal.clone() of gpuBlock: result = GpuAst(kind: gpuBlock) + result.isExpr = ast.isExpr result.blockLabel = ast.blockLabel for stmt in ast.statements: result.statements.add(stmt.clone()) @@ -458,6 +465,7 @@ proc clone*(ast: GpuAst): GpuAst = result.convExpr = ast.convExpr.clone() of gpuCast: result = GpuAst(kind: gpuCast) + result.cIsExpr = ast.cIsExpr result.cTo = ast.cTo.clone() result.cExpr = ast.cExpr.clone() of gpuComment: diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index f39e77dd..fba048ee 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -532,6 +532,18 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead result = n.getTypeInst.nimToGpuType() + +proc isExpression(n: GpuAst): bool = + ## Returns whether the given AST node is an expression + case n.kind + of gpuCall: # only if it returns something! + result = n.cIsExpr + of gpuBinOp, gpuIdent, gpuLit, gpuArrayLit, gpuPrefix, gpuDot, gpuIndex, gpuObjConstr, + gpuAddr, gpuDeref, gpuConv, gpuCast, gpuConstExpr: + result = true + else: + result = false + proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = ## Will insert a `gpuVar` for the implicit `result` variable, unless there ## is a user defined `var result` that shadows it at the top level of the proc @@ -577,6 +589,21 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = # insert `return result` ast.statements.add GpuAst(kind: gpuReturn, rValue: resId) +proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool = + ## Returns true if the given `fn` (gpuIdent) returns a value. 
+ ## The function can either be: + ## - an inbuilt function + ## - a generic instantiation + ## - contained in `allFnTab` + if fn in ctx.allFnTab: + result = ctx.allFnTab[fn].pRetType.kind != gtVoid + elif fn in ctx.genericInsts: + result = ctx.genericInsts[fn].pRetType.kind != gtVoid + elif fn in ctx.builtins: + result = ctx.builtins[fn].pRetType.kind != gtVoid + else: + raiseAssert "The function: " & $fn & " is not known anywhere." + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. Maybe we will won't @@ -602,9 +629,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = blockLabel: blockLabel) for i in 1 ..< node.len: # index 0 is the block label result.statements.add ctx.toGpuAst(node[i]) + of nnkBlockExpr: + ## XXX: For CUDA just a block? + let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal + elif node[0].kind == nnkEmpty: "" + else: raiseAssert "Unexpected node in block label field: " & $node.treerepr + result = GpuAst(kind: gpuBlock, blockLabel: blockLabel, isExpr: true) + for el in node: + if el.kind != nnkEmpty: + result.statements.add ctx.toGpuAst(el) of nnkStmtListExpr: # for statements that return a value. ## XXX: For CUDA just a block? - result = GpuAst(kind: gpuBlock) + result = GpuAst(kind: gpuBlock, isExpr: true) for el in node: if el.kind != nnkEmpty: result.statements.add ctx.toGpuAst(el) @@ -642,7 +678,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if node.pragma.kind != nnkEmpty: doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" 
result.pAttributes = collectProcAttributes(node.pragma) - if result.pAttributes.len == 0: # means `nimonly` was applied + if result.pAttributes.len == 0: # means `nimonly` was applied / is a `builtin` + ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called return GpuAst(kind: gpuVoid) # Process parameters for i in 1 ..< node[3].len: @@ -798,7 +835,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.tcName = name result.tcArgs = args else: - result = GpuAst(kind: gpuCall) + let fnIsExpr = ctx.fnReturnsValue(name) + result = GpuAst(kind: gpuCall, cIsExpr: fnIsExpr) result.cName = name result.cArgs = args From cc1eaf71135f5f954d7b119b0f5f99edc725cd6a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:05 +0200 Subject: [PATCH 44/87] move `cIsExpr` in clone to correct branch Whoops --- constantine/math_compiler/experimental/gpu_types.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 1e33f3ee..969859fd 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -348,6 +348,7 @@ proc clone*(ast: GpuAst): GpuAst = result.forwardDeclare = result.forwardDeclare of gpuCall: result = GpuAst(kind: gpuCall) + result.cIsExpr = ast.cIsExpr result.cName = ast.cName.clone() for arg in ast.cArgs: result.cArgs.add(arg.clone()) @@ -465,7 +466,6 @@ proc clone*(ast: GpuAst): GpuAst = result.convExpr = ast.convExpr.clone() of gpuCast: result = GpuAst(kind: gpuCast) - result.cIsExpr = ast.cIsExpr result.cTo = ast.cTo.clone() result.cExpr = ast.cExpr.clone() of gpuComment: From 7e43f71b6b20cd3fbcb207cec24cddc59b565082 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:26 +0200 Subject: [PATCH 45/87] also catch tuple types from `nnkBracketExpr` --- .../math_compiler/experimental/nim_to_gpu.nim | 
32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index fba048ee..7df62c00 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -43,6 +43,17 @@ proc initGpuArrayType(aTyp: NimNode, len: int): GpuType = ## Construct an statically sized array type result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len) +proc toTypeDef(typ: GpuType): GpuAst = + ## Converts a given object or generic instantiation type into an AST of a + ## corresponding type def. + # store the type instantiation + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: result.tFields = typ.oFields + of gtGenericInst: result.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." + proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = case t #of ntyBool, ntyChar: @@ -871,6 +882,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of ntyTuple: # need to replace `[idx]` by field access let typ = nimToGpuType(node[0].getTypeImpl) + if typ notin ctx.types: + ctx.types[typ] = toTypeDef(typ) #doAssert typ in ctx.types doAssert node[1].kind == nnkIntLit let idx = node[1].intVal @@ -994,15 +1007,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## this should never see `genericParam` I think let typ = nimToGpuType(node) if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - # store the type instantiation - let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) - case typ.kind - of gtObject: typDef.tFields = typ.oFields - of gtGenericInst: typDef.tFields = typ.gFields - else: - raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." 
- - ctx.types[typ] = typDef + ctx.types[typ] = toTypeDef(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type @@ -1030,14 +1035,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkTupleConstr: let typ = nimToGpuType(node) if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - # store the type instantiation - let typDef = GpuAst(kind: gpuTypeDef, tTyp: typ) - case typ.kind - of gtObject: typDef.tFields = typ.oFields - of gtGenericInst: typDef.tFields = typ.gFields - else: - raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." - ctx.types[typ] = typDef + ctx.types[typ] = toTypeDef(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type From 39c415835d7490177522e31fb55a24698a670986 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:08:50 +0200 Subject: [PATCH 46/87] handle `inline` pragma same as `forceinline` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7df62c00..13769353 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -420,7 +420,7 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal - of "forceinline": result.incl attForceInline + of "inline", "forceinline": result.incl attForceInline of "nimonly", "builtin": # used to fully ignore functions! 
return From bce536f7f486a630bccf64aeadcbbdc02cc4b121 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 22 Aug 2025 12:14:18 +0200 Subject: [PATCH 47/87] fix detection of custom `result` by looking into `gpuBlock` --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 13769353..4f8648b8 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -575,6 +575,8 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = ## result variable echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`." return true + of gpuBlock: # need to look at `gpuBlock` from top level, because variables are defined in a block + result = result or hasCustomResult(ch) else: discard From dfd4f562db242eaf9fedc8c31d20811e6c7554c4 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 13:57:15 +0200 Subject: [PATCH 48/87] fix handling of `skipSemicolon` for nested usage If nested, we must not reset the `skipSemicolon` back to false once the inner one is done! 
--- constantine/math_compiler/experimental/gpu_types.nim | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 969859fd..2ca42e12 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -827,9 +827,12 @@ proc ident*(n: GpuAst): string = result = n.iName template withoutSemicolon*(ctx: var GpuContext, body: untyped): untyped = - ctx.skipSemicolon = true - body - ctx.skipSemicolon = false + if not ctx.skipSemicolon: # if we are already skipping, leave true + ctx.skipSemicolon = true + body + ctx.skipSemicolon = false + else: + body proc getInnerArrayLengths*(t: GpuType): string = ## Returns the lengths of the inner array types for a nested array. From 8cdf72c97d6ac60040ac9cdcf3181f792d89ce46 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:52:46 +0200 Subject: [PATCH 49/87] if last expression _is_ return, we don't need `result` variable At least not in theory. We _could_ construct a function in which there _is_ a branch that returns `result` and another that returns something else, but uhh, I guess we can leave that for later. 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 4f8648b8..0958378d 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -584,7 +584,7 @@ proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = doAssert n.kind == gpuBlock if n.statements[^1].kind == gpuReturn: return true - if not hasCustomResult(ast): + if not hasCustomResult(ast) and not lastIsReturn(ast): # insert `gpuVar` as the *first* statement let resId = GpuAst(kind: gpuIdent, iName: "result", iSym: "result", From a40a87f3b40f27c31dc7139eb7b989818752325a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:54:15 +0200 Subject: [PATCH 50/87] remove debug output --- constantine/math_compiler/experimental/nim_to_gpu.nim | 1 - 1 file changed, 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 0958378d..10593884 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -94,7 +94,6 @@ proc initGpuGenericInst(t: NimNode): GpuType = result = initGpuGenericInst(t[0]) else: raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr - echo "Got generic inst: ", result proc unpackGenericInst(t: NimNode): NimNode = let tKind = t.typeKind From 396de7ba1d744ed31d20e4d052f4117706f2b54e Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:56:39 +0200 Subject: [PATCH 51/87] doc comment updates for procs in gpu_field_ops --- .../experimental/gpu_field_ops.nim | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim 
b/constantine/math_compiler/experimental/gpu_field_ops.nim index c16bc900..6fa550f3 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -189,52 +189,58 @@ template defWGSLHelpers*(): untyped {.dirty.} = ## Global variable to simulate carry flag. Private == one for each thread var carry_flag {.private.}: uint32 = 0'u32 - # Add with carry out (sets carry flag) proc add_co(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry out (sets carry flag) let result = a + b # Check for overflow: carry occurs if result < a (or result < b) carry_flag = select(0'u32, 1'u32, result < a) return result - # Add with carry in and carry out proc add_cio(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in and carry out let temp = a + b let result = temp + carry_flag # Carry out if: temp overflowed OR (temp + carry overflowed) carry_flag = select(0'u32, 1'u32, (temp < a) or (result < temp)) return result - # Add with carry in only proc add_ci(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in only. + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! let temp = a + b let result = temp + carry_flag # Don't update carry flag for this operation return result - # Subtract with borrow out (sets borrow flag) proc sub_bo(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow out (sets borrow flag) let result = a - b # Borrow occurs if a < b carry_flag = select(0'u32, 1'u32, a < b) return result - # Subtract with borrow in only - proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in and borrow out + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! 
let temp = a - b let result = temp - carry_flag - # Don't update carry flag for this operation + # Borrow out if: a < b OR (temp - borrow underflowed) + carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) return result - # Subtract with borrow in and borrow out - proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in only let temp = a - b let result = temp - carry_flag - # Borrow out if: a < b OR (temp - borrow underflowed) - carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) + # Don't update carry flag for this operation return result - # Select based on condition (equivalent to PTX slct) proc slct(a: uint32, b: uint32, pred: int32): uint32 {.device.} = + # Select based on condition (equivalent to PTX slct) return select(b, a, pred >= 0) proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} = @@ -261,8 +267,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = return p3 + (p1 shr 16) + (p2 shr 16) + carry - # r <- a * b + c (multiply-add low) proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- a * b + c (multiply-add low) return mul_lo(a, b) + c proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -280,8 +286,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = let product = mul_lo(a, b) return add_cio(product, c) - # r <- (a * b) >> 32 + c (multiply-add high) proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- (a * b) >> 32 + c (multiply-add high) return mul_hi(a, b) + c proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -582,7 +588,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = mtymul_CIOS_sparebit(a, b, M, true) proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = - ## Conditional copy in CUDA + ## Conditional copy. 
## If condition is true: b is copied into a ## If condition is false: a is left unmodified ## @@ -599,7 +605,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = a[i] = slct(b[i], a[i], cond) proc csetZero(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to zero in CUDA + ## Conditionally set `r` to zero. ## ## Note: This is constant-time var t = BigInt() @@ -607,14 +613,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csetOne(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to one in CUDA + ## Conditionally set `r` to one. ## ## Note: This is constant-time template mOne: untyped = MontyOne r.ccopy(mOne, condition) proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally add `a` to `r` in place in CUDA. + ## Conditionally add `a` to `r` in place.. ## ## Note: This is constant-time var t = BigInt() @@ -622,7 +628,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally subtract `a` from `r` in place in CUDA. + ## Conditionally subtract `a` from `r` in place. ## ## Note: This is constant-time var t = BigInt() @@ -630,14 +636,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc doubleElement(r: var BigInt, a: BigInt) {.device.} = - ## Double `a` and store it in `r` in CUDA. + ## Double `a` and store it in `r`. ## ## Note: This is constant-time r.add(a, a) proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} = - ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r` - ## in CUDA. + ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`. 
## ## Note: This is constant-time ## @@ -649,7 +654,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true) proc isZero(r: var bool, a: BigInt) {.device.} = - ## Checks if `a` is zero in CUDA. Result is written to `r`. + ## Checks if `a` is zero. Result is written to `r`. ## ## Note: This is constant-time #r = true @@ -661,7 +666,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = isZero == 0'u32 proc isOdd(r: var bool, a: BigInt) {.device.} = - ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`. + ## Checks if the Montgomery value of `a` is odd. Result is written to `r`. ## ## IMPORTANT: The canonical value may or may not be odd if the Montgomery ## representation is odd (and vice versa!). @@ -671,7 +676,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = (a[0] and 1'u32).bool proc neg(r: var BigInt, a: BigInt) {.device.} = - ## Computes the negation of `a` and stores it in `r` in CUDA. + ## Computes the negation of `a` and stores it in `r`. ## ## Note: This is constant-time # Check if input is zero @@ -687,14 +692,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} = ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise - ## copy over `a` into `r` in CUDA. + ## copy over `a` into `r`. ## ## Note: This is constant-time r.neg(a) r.ccopy(a, not condition) proc shiftRight(r: var BigInt, k: uint32) {.device.} = - ## Shift `r` right by `k` bits in-nplace in CUDA. + ## Shift `r` right by `k` bits in-place. ## ## k MUST be less than the base word size (2^31) ## @@ -716,7 +721,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r[lastIdx] = r[lastIdx] shr k proc div2(r: var BigInt) {.device.} = - ## Divide `r` by 2 in-place in CUDA. + ## Divide `r` by 2 in-place. 
## ## Note: This is constant-time # check if the input is odd From cd685528915eb23d36dcc5cd0ee4757131e03029 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 16:57:16 +0200 Subject: [PATCH 52/87] do not emit semicolon in binary operand child nodes --- constantine/math_compiler/experimental/backends/cuda.nim | 9 ++++++--- constantine/math_compiler/experimental/backends/wgsl.nim | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index c117471f..1f12be38 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -285,9 +285,12 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genCuda(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genCuda(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genCuda(ast.bLeft) + let r = ctx.genCuda(ast.bRight) + result = indentStr & "(" & l & " " & + ast.bOp & " " & + r & ")" of gpuIdent: result = ast.ident() diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 9043e3a0..288a4fdd 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -931,9 +931,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genWebGpu(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genWebGpu(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genWebGpu(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genWebGpu(ast.bLeft) + let r = ctx.genWebGpu(ast.bRight) + result = indentStr & "(" & l & " " & + ast.bOp & " " & + r & ")" of gpuIdent: result = ast.ident() From 30a4ac4f93c6b25483085c5007e56c1aecf0a454 Mon Sep 17 
00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 17:17:01 +0200 Subject: [PATCH 53/87] handle function overloads by using `iSym` if encountered --- constantine/math_compiler/experimental/gpu_types.nim | 7 +++++++ .../math_compiler/experimental/nim_to_gpu.nim | 12 +++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 2ca42e12..2ce2be12 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -285,6 +285,13 @@ type ## Key: the raw type. Value: a full `gpuTypeDef` types*: OrderedTable[GpuType, GpuAst] + ## This is _effectively_ just a set of all already produced function symbols. + ## We use it to determine if when encountering another function with the same + ## name, but different arguments to instead of using `iName` to use `iSym` as + ## the function name. This is to avoid overload issues in backends that don't + ## allow overloading by function signatures. + symChoices*: HashSet[string] + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` ## causes trouble. diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 10593884..a9430f64 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -501,6 +501,16 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = result = ctx.toGpuAst(n) # if _no_ pragma else: result = ctx.toGpuAst(n) # if not proc or func + + # handle overloads with different signatures + if n.strVal in ctx.symChoices: + # this is an overload of another function with different signature (not a generic, but + # overloads are not allowed in CUDA/WGSL/...). 
Update `sigTab` entry by using `iSym` + # for `iName` field for unique name + let id = ctx.sigTab[sig] + id.iName = id.iSym + else: + ctx.symChoices.incl result.iName # store this name in `symChoices` else: # else we use the str representation (repr for open / closed sym choice nodes) result = toAst n.repr @@ -665,7 +675,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # the `generics` set. When we encounter a `gpuCall` we will then check if the function # being called is part of the generic set and look up its _instantiated_ implementation # to parse it. The parsed generics are stored in the `genericInsts` table. - let name = ctx.toGpuAst(node.name) + let name = ctx.getFnName(node.name) if node[2].kind == nnkGenericParams: # is a generic ctx.generics.incl name.iName # need to use raw name, *not* symbol result = GpuAst(kind: gpuVoid) From a0daf1637e10b303cf7a83fa582d326de7ea1784 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 19:25:39 +0200 Subject: [PATCH 54/87] ignore `magic` pragma procs --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index a9430f64..d7d5d3d2 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -427,6 +427,8 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil` of "varargs": # attached to some builtins, e.g. 
`printf` on CUDA backend continue + of "magic": + return else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr From a260bacfab1e17e1634633d9605356541eeae003 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 25 Aug 2025 19:25:47 +0200 Subject: [PATCH 55/87] ignore forward declarations in Nim -> GpuAst Any forward declaration will also have its regular implementation somewhere. Otherwise the code is invalid anyway. As we generate our own forward declarations, we can ignore them. --- constantine/math_compiler/experimental/nim_to_gpu.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index d7d5d3d2..f36c4f96 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -681,6 +681,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if node[2].kind == nnkGenericParams: # is a generic ctx.generics.incl name.iName # need to use raw name, *not* symbol result = GpuAst(kind: gpuVoid) + elif node.body.kind == nnkEmpty: # just a forward declaration + result = GpuAst(kind: gpuVoid) else: result = GpuAst(kind: gpuProc) result.pName = name From e0a00eec2b811d8edd5abcd7b5a667eedfc854c7 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:11:23 +0200 Subject: [PATCH 56/87] add BigInt comparison and `toCanonical` conversion We could consider to rename it `fromMont` to match the regular Constantine naming, but given that here we (currently) only have a single type to represent both, I picked a different name. 
--- .../experimental/gpu_field_ops.nim | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 6fa550f3..76523366 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -306,6 +306,113 @@ template defWGSLHelpers*(): untyped {.dirty.} = return add_cio(hi_product, c) +template defBigIntCompare*(): untyped {.dirty.} = + ## This template adds a comparison operator for BigInts `<` (which is rewritten to + ## a function call `less`) as well as a `toCanonical` function to turn a Montgomery + ## representation into a canonical representation. + ## It is included in the `defCoreFieldOps` by default, so you need not manually use it. + + proc less(a, b: BigInt): bool {.device.} = + ## Returns true if a < b for two big ints in *canonical* + ## representation. + ## + ## NOTE: The inputs are compared *as is*. That means if they are + ## in Montgomery representation the result will not reflect the + ## ordering relation of their associated canonical values! + ## Call `toCanonical` on field elements in Montgomery order before + ## comparing them. + ## + ## Comparison is constant-time + var borrow: uint32 + # calculate sub with borrows for side effect. Use borrow flag + # at the end to determine if value was smaller + discard sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + discard sub_bio(a[i], b[i]) + borrow = sub_bi(0'u32, 0'u32) + return borrow.bool + + # template to rewrite `<` into a function call. 
Most backends don't allow custom operators + template `<`(b1, b2: BigInt): untyped = less(b1, b2) + + proc muladd1_gpu(hi, lo: var uint32, a, b, c: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd1` + lo = mulloadd_co(a, b, c) # low part of a*b + c with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + + proc muladd2_gpu(hi, lo: var uint32, a, b, c1, c2: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd2` + lo = mulloadd_co(a, b, c1) # low part of a*b + c1 with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + # Add c2 with carry propagation + lo = add_co(lo, c2) + hi = add_ci(hi, 0'u32) + + proc sub_no_mod(a, b: BigInt): BigInt {.device.} = + ## Generate an optimized subtraction kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + ## I.e. this does _not_ perform modular reduction. + var t = BigInt() + t[0] = sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + t[i] = sub_bio(a[i], b[i]) + return t + + proc sub_no_mod(r: var BigInt, a, b: BigInt) {.device.} = + ## Subtraction of two finite field elements stored in `a` and `b` + ## *without* modular reduction. + ## The result is stored in `r`. 
+ r = sub_no_mod(a, b) + + proc csub_no_mod(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally subtract `a` from `r` in place *without* modular + ## reduction. + ## + ## Note: This is constant-time + var t = BigInt() + t.sub_no_mod(r, a) + r.ccopy(t, condition) + + proc fromMont_CIOS(r: var BigInt, a, M: BigInt, m0ninv: uint32) {.device.} = + ## Convert from Montgomery form to canonical BigInt form + # for i in 0 .. n-1: + # m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication) + # C, _ = t[0] + m * M[0] + # for j in 1 ..n-1: + # (C, t[j-1]) <- r[j] + m*M[j] + C + # t[n-1] = C + + var t = a # Ensure working in registers + + staticFor i, 0, N: + let m = t[0] * m0ninv + var C, lo: uint32 + muladd1_gpu(C, lo, m, M[0], t[0]) + staticFor j, 1, N: + muladd2_gpu(C, t[j-1], m, M[j], C, t[j]) + t[N-1] = C + + t.csub_no_mod(M, not (t < M)) + r = t + + proc toCanonical(b: BigInt): BigInt {.device.} = + var canon: BigInt + canon.fromMont_CIOS(b, M, M0NInv) + return canon + template defCoreFieldOps*(T: typed): untyped {.dirty.} = # Need to get the limbs & spare bits data in a static context template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32 From c9d99020681333f03cd7336453b09f35417fdfee Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:14:19 +0200 Subject: [PATCH 57/87] add FIPS Montgomery multiplication for fields without spare bits The Goldilocks field does not have any spare bits. As a result using CIOS leads to the wrong result. 
--- .../experimental/gpu_field_ops.nim | 84 +++++++++++++++++-- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 76523366..46c458f4 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -427,6 +427,10 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2)) const M0NInv = getM0ninv().uint32 + # `ccopy` needed for BigInt comparison logic + proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} + defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts + proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = ## If a >= Modulus: r <- a-M ## else: r <- a @@ -511,7 +515,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = sub_bio(a[i], b[i]) let underflowMask = sub_bi(0'u32, 0'u32) - # If underflow # TODO: predicated mov instead? var maskedM: BigInt = BigInt() @@ -523,7 +526,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = add_cio(t[i], maskedM[i]) when N > 1: t[N-1] = add_ci(t[N-1], maskedM[N-1]) - return t proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} = @@ -689,11 +691,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## The result is stored in `r`. r = modsub(a, b, M) - proc mul(r: var BigInt, a, b: BigInt) {.device.} = - ## Multiplication of two finite field elements stored in `a` and `b`. - ## The result is stored in `r`. - r = mtymul_CIOS_sparebit(a, b, M, true) - proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = ## Conditional copy. 
## If condition is true: b is copied into a @@ -772,6 +769,11 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = isZero = isZero or a[i] r = isZero == 0'u32 + proc isZero(a: BigInt): bool {.device, forceinline.} = + result.isZero(a) + proc isNonZero(a: BigInt): bool {.device, forceinline.} = + result = not isZero(a) + proc isOdd(r: var bool, a: BigInt) {.device.} = ## Checks if the Montgomery value of `a` is odd. Result is written to `r`. ## @@ -840,3 +842,71 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # if it was odd, add `M+1/2` to go 'half-way around' r.cadd(PP1D2, isO) + + proc mul_lohi(hi, lo: var uint32, a, b: uint32) {.device, forceinline.} = + lo = mul_lo(a, b) + hi = mul_hi(a, b) + + proc mulAcc(t, u, v: var uint32, a, b: uint32) {.device, forceinline.} = + ## (t, u, v) <- (t, u, v) + a * b + v = mulloadd_co(a, b, v) # v = (a*b).low + v, with carry out + u = mulhiadd_cio(a, b, u) # u = (a*b).high + u + carry, with carry out + t = add_ci(t, 0'u32) # t = t + carry + + proc mtymul_FIPS(a, b, M: BigInt, lazyReduce: static bool = false): BigInt {.device.} = + ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS). + ## This implementation can be used for fields that do not have any spare bits. + ## + ## This maps + ## - [0, 2p) -> [0, 2p) with lazyReduce + ## - [0, 2p) -> [0, p) without + ## + ## lazyReduce skips the final substraction step. 
+ # - Architectural Enhancements for Montgomery + # Multiplication on Embedded RISC Processors + # Johann Großschädl and Guy-Armand Kamendje, 2003 + # https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf + # + # - New Speed Records for Montgomery Modular + # Multiplication on 8-bit AVR Microcontrollers + # Zhe Liu and Johann Großschädl, 2013 + # https://eprint.iacr.org/2013/882.pdf + template m0ninv: untyped = M0NInv + var z = BigInt() # zero-init, ensure on stack and removes in-place problems in tower fields + const L = a.len + var t, u, v = 0'u32 + + staticFor i, 0, L: + staticFor j, 0, i: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + mulAcc(t, u, v, a[i], b[0]) + z[i] = v * m0ninv + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = 0'u32 + + staticFor i, L, 2*L: + staticFor j, i-L+1, L: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + z[i-L] = v + v = u + u = t + t = 0'u32 + + when not lazyReduce: + let cond = v != 0 or not(z < M) + # conditionally subtract using *non modular subtraction*. If `cond == true`, + # we are in `M <= z <= 2M` and can safely subtract `M`. + z.csub_no_mod(M, cond) + return z + + proc mul(r: var BigInt, a, b: BigInt) {.device.} = + ## Multiplication of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + when spareBits() >= 1: + r = mtymul_CIOS_sparebit(a, b, M, true) + else: # e.g. Goldilocks + r = mtymul_FIPS(a, b, M, false) From b6a63ef2d3e6cc2330a856d106ff1899e5adcec2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 26 Aug 2025 08:17:36 +0200 Subject: [PATCH 58/87] [cuda] for now emit `constexpr` for a `gpuConstexpr` (i.e. Nim `const`) See the added TODO. The main point is that if we write things like ```nim const M = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` we want to have the Nim compiler evaluate the RHS at compile time. 
If we were to make the user write ``` var M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` instead the Nim compiler would evaluate the RHS at runtime. We _could_ force the user to write ```nim const M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` instead though. It is no problem however, to emit `__constant__` if it's a global and `constexpr` for a local (where `__constant__` is anyhow forbidden in CUDA). The only minor annoyance is that _maybe_ someone wants to emit `constexpr` also in global scope. _Maybe_ we'll go with a design that does the above by default but allows you to overwrite it via ```nim const M {.constant.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` -> explicitly force `__constant__` ```nim const M {.constexpr.} = toBigInt(bigintToUint32Limbs(T.getModulus)) ``` -> explicitly force `constexpr` instead. --- constantine/math_compiler/experimental/backends/cuda.nim | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 1f12be38..83220fdf 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -346,10 +346,15 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "(*" & ctx.genCuda(ast.dOf) & ")" of gpuConstexpr: + ## TODO: We need to change the code such that we emit `constexpr` inside of procs and + ## `__constant__` outside of procs. The point is we want to support mapping to `__constant__` + ## for `const foo = bar` Nim declarations to evaluate values at Nim's compile time. + ## Alternatively, make user write `const foo {.constant.} = bar` to produce a global + ## `__constant__` value. 
if ast.cType.kind == gtArray: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) else: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) else: echo "Unhandled node kind in genCuda: ", ast.kind From cf6fcbff0a37a1c91cde1e11b9b49710d9365199 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:37:54 +0200 Subject: [PATCH 59/87] [cuda] handle generic instantiations and `UncheckedArray` --- .../math_compiler/experimental/backends/cuda.nim | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 83220fdf..92f63efa 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -95,7 +95,18 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, else: result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. + result = t.gName + if t.gArgs.len > 0: + result.add "_" + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add "_" of gtObject: result = t.name + of gtUA: result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T? 
else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident @@ -373,6 +384,7 @@ proc codegen*(ctx: var GpuContext): string = let fnC = fn.clone() fnC.forwardDeclare = true result.add ctx.genCuda(fnC) & "\n" + result.add "\n\n" for fnIdent, fn in ctx.fnTab: result.add ctx.genCuda(fn) & "\n\n" From 1e8f84d288df0850ef9d2b617a268194ba0595f1 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:38:12 +0200 Subject: [PATCH 60/87] [wgsl] do not suffix generic inst types if they have no args --- constantine/math_compiler/experimental/backends/wgsl.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 288a4fdd..d908f22b 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -115,7 +115,9 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = of gtGenericInst: # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. - result = t.gName & "_" + result = t.gName + if t.gArgs.len > 0: + result.add "_" for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: From e6cab34f6596dae0c2e2bcd91e42e849d994f385 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:48:18 +0200 Subject: [PATCH 61/87] handle `raises`, `noinit` pragmas And use normalized strings because `noInit` and `noinit` both appear often enough. 
--- constantine/math_compiler/experimental/nim_to_gpu.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index f36c4f96..a79cb1a4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -429,6 +429,7 @@ proc collectProcAttributes(n: NimNode): set[GpuAttribute] = continue of "magic": return + of "raises": discard # result.incl attDevice #discard # XXX else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr @@ -453,12 +454,13 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes # do not match the CUDA string we need to emit, which is what the string value of # the `GpuVarAttribute` enum stores - case pragma.strVal - of "cuExtern", "extern": result.add atvExtern + case pragma.strVal.normalize + of "cuextern", "extern": result.add atvExtern of "shared": result.add atvShared of "private": result.add atvPrivate of "volatile": result.add atvVolatile of "constant": result.add atvConstant + of "noinit": discard # XXX: ignore for now else: raiseAssert "Unexpected pragma: " & $pragma.treerepr From 91a87e796a1a04e1788d9bbd89383be2e70e0cdb Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:52:54 +0200 Subject: [PATCH 62/87] use `getTypeName` for tuple type fields Otherwise we get a big mess for objects :) --- .../math_compiler/experimental/nim_to_gpu.nim | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index a79cb1a4..7dae2ad7 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -171,6 +171,7 @@ functions. 
doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr result = n[1][2].intVal + 1 +proc getTypeName(n: NimNode, recursedSym: bool = false): string proc constructTupleTypeName(n: NimNode): string = ## XXX: overthink if this should really be here and not somewhere else ## @@ -179,12 +180,14 @@ proc constructTupleTypeName(n: NimNode): string = ## ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types. + ## UPDATE: I changed the implementation to recurse into `getTypeName` + ## TODO: verify that this did not break the tuple test & specifically check for aliases result = "Tuple_" doAssert n.kind in [nnkTupleTy, nnkTupleConstr] for i, ch in n: case ch.kind of nnkIdentDefs: - let typName = ch[ch.len - 2].getTypeImpl.repr # second to last is type name of field(s) + let typName = ch[ch.len - 2].getTypeName() # second to last is type name of field(s) for j in 0 ..< ch.len - 2: # Example: # IdentDefs @@ -206,7 +209,7 @@ proc constructTupleTypeName(n: NimNode): string = # IntLit 16 # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)` # give them a slightly different name - let typName = ch[0].getTypeImpl.repr ## XXX + let typName = ch[0].getTypeName() ## XXX doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr result.add ch[0].strVal & "_" & typName if i < n.len - 1: @@ -215,7 +218,7 @@ proc constructTupleTypeName(n: NimNode): string = # TupleConstr # Sym "BaseType" <-- e.g. 
here # Sym "BaseType" - let typName = ch.getTypeImpl.repr + let typName = ch.getTypeName() result.add "Field" & $i & "_" & typName if i < n.len - 1: result.add "_" @@ -232,10 +235,15 @@ proc constructTupleTypeName(n: NimNode): string = # -> Try again with type impl return constructTupleTypeName(getTypeImpl(n)) -proc getTypeName(n: NimNode): string = +proc getTypeName(n: NimNode, recursedSym: bool = false): string = ## Returns the name of the type case n.kind - of nnkIdent, nnkSym: result = n.strVal + of nnkIdent: result = n.strVal + of nnkSym: + if recursedSym: + result = n.strVal + else: + result = n.getTypeInst.getTypeName(true) of nnkObjConstr: if n[0].kind == nnkEmpty: result = n.getTypeInst.strVal From bb4e8fb1881eba68b77be7a4c5b05a4c27fe821c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:53:14 +0200 Subject: [PATCH 63/87] map `ntyString` to string explicitly in gpu type kinds Not supported on some backends --- constantine/math_compiler/experimental/nim_to_gpu.nim | 3 +++ 1 file changed, 3 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7dae2ad7..c341de44 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -74,6 +74,7 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = of ntyUInt16: gtUint16 of ntyUInt32: gtUint32 of ntyUInt64: gtUint64 + of ntyString: gtString else: raiseAssert "Not supported yet: " & $t @@ -275,6 +276,8 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = case n.typeKind of ntyBool, ntyInt .. ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) + of ntyString: # only supported on some backends! 
+ result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false) of ntyVar: From de2cf85e51c746337ea8327d48958a8672f46186 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:53:48 +0200 Subject: [PATCH 64/87] map ntyUnused2 (lent T) to ptr T --- constantine/math_compiler/experimental/nim_to_gpu.nim | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index c341de44..d21085e4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -328,16 +328,21 @@ proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = # error("o") of ntyGenericInvocation: result = initGpuType(gtInvalid) - error("Generics are not supported in the CUDA DSL so far.") + error("Generics are not supported in the CUDA DSL so far.") # Note: this should not appear nowadays of ntyGenericInst: result = initGpuGenericInst(n) - #result = n.unpackGenericInst().nimToGpuType(allowToFail) of ntyTypeDesc: # `getType` returns a `BracketExpr` of eg: # BracketExpr # Sym "typeDesc" # Sym "float32" result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it + of ntyUnused2: + # BracketExpr + # Sym "lent" + # Sym "BigInt" + doAssert n.kind == nnkBracketExpr and n[0].strVal == "lent", "ntyUnused2: " & $n.treerepr + result = initGpuPtrType(nimToGpuType(n[1]), implicitPtr = false) else: if allowToFail: result = GpuType(kind: gtVoid) From 1b0aa55d3ecfb2d3b28654ef8145d3aee63b5561 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:54:10 +0200 Subject: [PATCH 65/87] generate type names for bracket expressions e.g. 
generics --- constantine/math_compiler/experimental/nim_to_gpu.nim | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index d21085e4..6122f601 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -252,6 +252,12 @@ proc getTypeName(n: NimNode, recursedSym: bool = false): string = result = n[0].strVal # type is the first node of nnkTupleTy, nnkTupleConstr: result = constructTupleTypeName(n) + of nnkBracketExpr: + # construct a type name `Foo_Bar_Baz` + for i, ch in n: + result.add ch.getTypeName() + if i < n.len - 1: + result.add "_" else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = From 1a4dc3140790130fdd0f017dab2a2ac83a3aea41 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:54:45 +0200 Subject: [PATCH 66/87] improve generic inst -> gpu type by handling nnkSym This is not perfect yet (I think). Needs some tests for different situations. Works in practice for what I've used it for at least. --- .../math_compiler/experimental/nim_to_gpu.nim | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 6122f601..ab5e45b5 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -93,6 +93,19 @@ proc initGpuGenericInst(t: NimNode): GpuType = of nnkObjConstr: doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr result = initGpuGenericInst(t[0]) + of nnkSym: + let impl = getTypeImpl(t) + case impl.kind + of nnkDistinctTy: + ## XXX: assumes distinct of inbuilt type, not object! 
+ result = nimToGpuType(impl[0]) + of nnkObjectTy: + doAssert impl.kind == nnkObjectTy, "Unexpected node kind for generic inst: " & $impl.treerepr + ## XXX: use signature hash for type name? Otherwise will produce duplicates + result = GpuType(kind: gtGenericInst, gName: t.repr) + result.gFields = parseTypeFields(impl) + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr else: raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr From 8583d2306e0a0ed51bc4d63aa1e03eb63e88d34d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:55:39 +0200 Subject: [PATCH 67/87] overwrite function names of problematic identifiers E.g. `[]` is not a sensible name on GPU backends. We rename it to `get` for example. Note that the binary operators there likely never appear there. We need to handle those via `gpuBinOp` (and call `maybePatchFnName` from there) --- .../math_compiler/experimental/nim_to_gpu.nim | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ab5e45b5..346aaedf 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -501,6 +501,28 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst +proc maybePatchFnName(n: var GpuAst) = + ## Patches the function name for names that are not allowed on most backends, but appear + ## commonly in Nim (custom operators). + ## + ## NOTE: I think that the binary operators don't actually appear as a `gpuCall`, but still + ## as an infix node, even after sem checking by the Nim compiler. 
+ doAssert n.kind == gpuIdent + template patch(arg, by: untyped): untyped = + arg.iSym = arg.iSym.replace(arg.iName, by) + arg.iName = by + let name = n.iName + case name + of "[]": patch(n, "get") + of "[]=": patch(n, "set") + of "+": patch(n, "add") + of "-": patch(n, "sub") + of "*": patch(n, "mul") + of "/": patch(n, "div") + else: + # leave as is + discard + proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = ## Returns the name for the function. Either the symbol name _or_ ## the `{.cudaName.}` pragma argument. @@ -541,6 +563,10 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = else: result = ctx.toGpuAst(n) # if not proc or func + # possibly patch function names, e.g. custom `[]`, `[]=`, `+` etc operators + # (inbuilt won't show up as a function name, but rather as a specific node kind, eg `nnkIndex` + result.maybePatchFnName() + # handle overloads with different signatures if n.strVal in ctx.symChoices: # this is an overload of another function with different signature (not a generic, but From bf27dcbf4376e16187f69c937b01109f516d9481 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:57:45 +0200 Subject: [PATCH 68/87] refactor proc signature parsing & unify adding types to `ctx.types` NOTE: We'll change the `maybeAddType` code in the future to be done in `nimToGpuType` directly. However, at the moment that produces a bit too much required change with having to add `GpuContext` to a whole bunch of functions that all call `nimToGpuType` internally. 
--- .../math_compiler/experimental/nim_to_gpu.nim | 121 ++++++++++-------- 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 346aaedf..b46a17c2 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -584,6 +584,66 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = # ctx.sigTab[sig] = result result.symbolKind = gsProc # make sure it's a proc +proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = + ## Returns the type from a given Nim node `t` representing a type. + ## If that fails due to an identifier in the type, we instead try + ## to look up the type from the associated symbol, `n`. + result = nimToGpuType(t, allowArrayIdent = true) + if result.kind == gtInvalid: + # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means + # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead + result = n.getTypeInst.nimToGpuType() + +proc maybeAddType*(ctx: var GpuContext, typ: GpuType) = + ## Adds the given type to the table of known types, if it is some kind of + ## object type. + ## + ## XXX: What about aliases and distincts? + if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types: + ctx.types[typ] = toTypeDef(typ) + +proc parseProcParameters(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): seq[GpuParam] = + ## Returns all parameters of the given procedure from the `params` node + ## of type `nnkFormalParams`. 
+ doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + for i in 1 ..< params.len: + let param = params[i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) + ctx.maybeAddType(paramType) + for i in 0 ..< numParams: + var p = ctx.toGpuAst(param[i]) + let symKind = if attGlobal in attrs: gsGlobalKernelParam + else: gsDeviceKernelParam + p.iTyp = paramType ## Update the type of the symbol + p.symbolKind = symKind ## and the symbol kind + let param = GpuParam(ident: p, typ: paramType) + result.add(param) + +proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType = + ## Returns the return type of the given procedure from the `params` node + ## of type `nnkFormalParams`. + doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + let retType = params[0] # arg 0 is return type + if retType.kind == nnkEmpty: + result = GpuType(kind: gtVoid) # actual void return + else: + # attempt to get type. If fails, we need to wait for a caller to this function to get types + # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside + # the macro. We then rely on our generics logic to later look this up when called + result = nimToGpuType(retType, allowArrayIdent = true) + if result.kind == gtVoid: # stop parsing this function + result = GpuType(kind: gtInvalid) + ctx.maybeAddType(result) + proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = ## Looks up the implementation of the given function and stores it in our table ## of generic instantiations. 
@@ -608,16 +668,6 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = name.iName = fn.pName.iSym ## update the name of the called function ctx.genericInsts[fn.pName] = fn -proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = - ## Returns the type from a given Nim node `t` representing a type. - ## If that fails due to an identifier in the type, we instead try - ## to look up the type from the associated symbol, `n`. - result = nimToGpuType(t, allowArrayIdent = true) - if result.kind == gtInvalid: - # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means - # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead - result = n.getTypeInst.nimToGpuType() - proc isExpression(n: GpuAst): bool = ## Returns whether the given AST node is an expression case n.kind @@ -750,18 +800,12 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuProc) result.pName = name result.pName.symbolKind = gsProc ## This is a procedure identifier - doAssert node[3].kind == nnkFormalParams - let retType = node[3][0] # arg 0 is return type - if retType.kind == nnkEmpty: - result.pRetType = GpuType(kind: gtVoid) # actual void return - else: - # attempt to get type. If fails, we need to wait for a caller to this function to get types - # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside - # the macro. 
We then rely on our generics logic to later look this up when called - result.pRetType = nimToGpuType(retType, allowArrayIdent = true) - if result.pRetType.kind == gtVoid: # stop parsing this function - ctx.generics.incl name.iName # need to use raw name, *not* symbol - return GpuAst(kind: gpuVoid) + let params = node[3] + doAssert params.kind == nnkFormalParams + result.pRetType = ctx.parseProcReturnType(params) + if result.pRetType.kind == gtInvalid: + ctx.generics.incl name.iName # need to use raw name, *not* symbol + return GpuAst(kind: gpuVoid) # Process pragmas if node.pragma.kind != nnkEmpty: @@ -771,27 +815,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called return GpuAst(kind: gpuVoid) # Process parameters - for i in 1 ..< node[3].len: - let param = node[3][i] - let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: - let typIdx = param.len - 2 # second to last is the type - # IdentDefs - # Ident "x" - # Ident "y" - # Ident "res" - # PtrTy - # Ident "float32" # `param.len - 2` - # Empty # `param.len - 1` - let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) - for i in 0 ..< numParams: - var p = ctx.toGpuAst(param[i]) - let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam - else: gsDeviceKernelParam - p.iTyp = paramType ## Update the type of the symbol - p.symbolKind = symKind ## and the symbol kind - let param = GpuParam(ident: p, typ: paramType) - result.pParams.add(param) - + result.pParams = ctx.parseProcParameters(params, result.pAttributes) result.pBody = ctx.toGpuAst(node.body) .ensureBlock() # single line procs should be a block to generate `;` result.pBody.maybeInsertResult(result.pRetType, result.pName.ident()) @@ -826,6 +850,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = varNode.vAttributes = collectAttributes(declaration[0][1]) else: raiseAssert 
"Unexpected node kind for variable: " & $declaration.treeRepr varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0]) + ctx.maybeAddType(varNode.vType) varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later # This is a *local* variable (i.e. `function` address space on WGSL) unless it is # annotated with `{.shared.}` (-> `workspace` in WGSL) @@ -960,8 +985,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of ntyTuple: # need to replace `[idx]` by field access let typ = nimToGpuType(node[0].getTypeImpl) - if typ notin ctx.types: - ctx.types[typ] = toTypeDef(typ) + ctx.maybeAddType(typ) #doAssert typ in ctx.types doAssert node[1].kind == nnkIntLit let idx = node[1].intVal @@ -1084,9 +1108,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkObjConstr: ## this should never see `genericParam` I think let typ = nimToGpuType(node) - if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - ctx.types[typ] = toTypeDef(typ) - + ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type let flds = typ.oFields @@ -1112,8 +1134,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = typ: flds[i].typ) of nnkTupleConstr: let typ = nimToGpuType(node) - if typ notin ctx.types: # this should handle not just local types, but also any "pulled in" type - ctx.types[typ] = toTypeDef(typ) + ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type From 81bd6f6c795287859fb1d0feb45d02f89ea61773 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 28 Aug 2025 18:59:59 +0200 Subject: [PATCH 69/87] handle recursive calls in GPU code Previously due to our parsing of procs whenever they are called, we would infinitely recurse in the parsing logic. We now record the function signature and function identifier so that we can avoid that. 
We need the function signature to get information about the return type before we actually start parsing a function. Otherwise _inside_ of the recursive function we wouldn't be able to determine the return type at the callsite of the recursive call (the initial parse hasn't been completed at that point yet, which would fill the proc into `allFnTab`) --- .../math_compiler/experimental/gpu_types.nim | 21 ++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 40 +++++++++++++++++-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 2ce2be12..cac35019 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -236,6 +236,10 @@ type params*: seq[string] body*: GpuAst + GpuProcSignature* = object + params*: seq[GpuParam] + retType*: GpuType + GpuContext* = object ## XXX: need table for generic invocations. Then when we encounter a type, need to map to ## the specific version @@ -276,6 +280,12 @@ type ## precise generic instantiations that are called. genericInsts*: OrderedTable[GpuAst, GpuAst] + ## Table of procs and their signature to avoid looping infinitely for recursive procs + ## Arguments are: + ## - Key: ident of the proc + ## - Value: signature of the (possibly generic) instantiation + processedProcs*: OrderedTable[GpuAst, GpuProcSignature] + ## Storse all builtin / nimonly / importc / ... 
functions we encounter so that we can ## check if they return a value when we encounter them in a `gpuCall` builtins*: OrderedTable[GpuAst, GpuAst] @@ -548,6 +558,17 @@ proc `==`*(a, b: GpuAst): bool = else: result = a.iSym == b.iSym and a.iTyp == b.iTyp and a.symbolKind == b.symbolKind +proc `==`*(a, b: GpuProcSignature): bool = + if a.retType != b.retType: result = false + elif a.params.len != b.params.len: + result = false + else: + result = true + for i in 0 ..< a.params.len: + let ap = a.params[i] + let bp = b.params[i] + result = result and (ap == bp) + proc len*(ast: GpuAst): int = case ast.kind of gpuProc: 1 diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index b46a17c2..7be15c56 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -447,7 +447,8 @@ proc isBuiltIn(n: NimNode): bool = return true proc collectProcAttributes(n: NimNode): set[GpuAttribute] = - doAssert n.kind == nnkPragma + doAssert n.kind in [nnkPragma, nnkEmpty] + if n.kind == nnkEmpty: return # no pragmas for pragma in n: doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma @@ -644,6 +645,16 @@ proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType = result = GpuType(kind: gtInvalid) ctx.maybeAddType(result) +proc toGpuProcSignature(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): GpuProcSignature = + ## Creates a `GpuProcSignature` from the given `params` node of type `nnkFormalParams` + + ## + ## NOTE: This procedure is only called from generically instantiated procs. Therefore, + ## we shouldn't need to worry about getting `gtInvalid` return types here. 
+ doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + result = GpuProcSignature(params: ctx.parseProcParameters(params, attrs), + retType: ctx.parseProcReturnType(params)) + proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = ## Looks up the implementation of the given function and stores it in our table ## of generic instantiations. @@ -656,9 +667,30 @@ proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = let inst = node[0].getImpl() let sig = node[0].getTypeInst() inst.params = sig.params # copy over the parameters + + # turn the signature into a `GpuProcSignature` + let attrs = collectProcAttributes(inst.pragma) + let procSig = ctx.toGpuProcSignature(sig.params, attrs) + if name in ctx.processedProcs: + return + else: + # Need to add isym here so that if we have recursive calls, we don't end up + # calling `toGpuAst` recursively forever + ctx.processedProcs[name] = procSig + let fn = ctx.toGpuAst(inst) - if fn.kind == gpuVoid: # should be an inbuilt proc, i.e. annotated with `{.builtin.}` - doAssert inst.isBuiltIn() + if fn.kind == gpuVoid: + # Should be an inbuilt proc, i.e. annotated with `{.builtin.}`. However, + # functions that are available otherwise (e.g. in Nim's system like `abs`) + # in Nim _and_ backends will also show up here. Unless we wanted to manually + # wrap all of these, we can just skip the `isBuiltin` check here. + # If the user uses something not available in the backend, they'll get a + # compiler error from that compiler. + # It's mostly a matter of usability: For common procs like `abs` we cannot + # so easily define a custom overload `proc abs(...): ... {.builtin.}`, because + # that would overwrite the Nim version. 
+ # doAssert inst.isBuiltIn() + return else: fn.pAttributes.incl attDevice # make sure this is interpreted as a device function doAssert fn.pName.iSym == name.iSym, "Not matching" @@ -738,6 +770,8 @@ proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool = result = ctx.genericInsts[fn].pRetType.kind != gtVoid elif fn in ctx.builtins: result = ctx.builtins[fn].pRetType.kind != gtVoid + elif fn in ctx.processedProcs: + result = ctx.processedProcs[fn].retType.kind != gtVoid else: raiseAssert "The function: " & $fn & " is not known anywhere." From 6cd1a5ed00201944137e4f64d68cbc3c6c42c5ce Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 29 Aug 2025 12:38:41 +0200 Subject: [PATCH 70/87] fix `modadd` in debug builds on CUDA In a debug build (`-d:debugCuda`) modular addition produced off by one errors. As it turned out the problem was our calculation of `overflowedLimbs` inside of `finalSubMayOverflow`. The carry flag set in the last `add_cio` call in `modadd` does not reliably survive into the function call in a debug build. We compute it directly after computing the last `add_cio` call in `modadd` and simply pass it as an argument. This way arithmetic also works reliably on a debug build. 
--- .../experimental/gpu_field_ops.nim | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index 46c458f4..cc96abc3 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -431,7 +431,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts - proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = + proc finalSubMayOverflow(a, M: BigInt, overflowedLimbs: uint32): BigInt {.device.} = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -442,9 +442,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) var scratch: BigInt = BigInt() - # Contains 0x0001 (if overflowed limbs) or 0x0000 - let overflowedLimbs = add_ci(0'u32, 0'u32) - # Now substract the modulus, and test a < M with the last borrow scratch[0] = sub_bo(a[0], M[0]) staticFor i, 1, N: @@ -454,9 +451,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # 2. if a >= M, underflowedModulus >= 0 # if underflowedModulus >= 0: a-M else: a # TODO: predicated mov instead? - ## TODO: Fix this. `slct` needs a negative value for the else branch let underflowedModulus = sub_bi(overflowedLimbs, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -479,9 +474,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = scratch[i] = sub_bio(a[i], M[i]) # If it underflows here, `a` was smaller than the modulus, which is what we want - ## TODO: Fix this. 
`slct` needs a negative value for the else branch let underflowedModulus = sub_bi(0'u32, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -490,18 +483,21 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc modadd(a, b, M: BigInt): BigInt {.device.} = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - # try to add two bigints var t = BigInt() # temporary t[0] = add_co(a[0], b[0]) staticFor i, 1, N: t[i] = add_cio(a[i], b[i]) - # can use `when` of course! - when spareBits() >= 1: # if spareBits() >= 1: # would also work + when spareBits() >= 1: t = finalSubNoOverflow(t, M) else: - t = finalSubMayOverflow(t, M) + # Contains 0x0001 (if overflowed limbs) or 0x0000 + # This _must_ be computed here and not inside of `finalSubMayOverflow`. In a + # debug build on CUDA the carry flag would (potentially) be reset going into + # the function. + let overflowedLimbs = add_ci(0'u32, 0'u32) + t = finalSubMayOverflow(t, M, overflowedLimbs) return t From 8ae1427d9e45dc2827ba6267351cde99bfe81837 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 1 Sep 2025 09:44:00 +0200 Subject: [PATCH 71/87] fix reassignment of `types` in RT codegen for aliases --- constantine/math_compiler/experimental/gpu_compiler.nim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 51207212..01548f7b 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -133,7 +133,12 @@ proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = for fn in gen.procs: # assign generics info to correct table ctx.genericInsts[fn.pName] = fn for typ in gen.types: # assign generics info to correct table - ctx.types[typ.tTyp] = typ + case typ.kind + of gpuTypeDef: 
+ ctx.types[typ.tTyp] = typ + of gpuAlias: + ctx.types[typ.aTyp] = typ + else: raiseAssert "Unexpected node kind assigning to `types`: " & $typ result = ctx.codegen(ast, kernel) From ac91c058e59fb3dfa629ac2c4d20f463ddcca483 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 1 Sep 2025 18:07:37 +0200 Subject: [PATCH 72/87] handle `nnkObjConstr` correctly when encountering gtGenericInst --- constantine/math_compiler/experimental/nim_to_gpu.nim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7be15c56..725c17cb 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1145,7 +1145,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ctx.maybeAddType(typ) result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type - let flds = typ.oFields + let flds = if typ.kind == gtObject: typ.oFields + elif typ.kind == gtGenericInst: typ.gFields + else: raiseAssert "ObjConstr must have an object type: " & $typ # find all fields that have been defined by the user var ocFields: seq[GpuFieldInit] for i in 1 ..< node.len: # all fields to be init'd From 04ddb398001a35f012a20b2cfcfc4fdb05a3bf5d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 12:40:34 +0200 Subject: [PATCH 73/87] do not emit `f` suffix for float literals Nim float literals when converting to strings already come with a `f` suffix. 
--- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index d908f22b..b307cdb5 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -25,7 +25,7 @@ proc literalSuffix(t: GpuType): string = of gtUint32: "u" of gtInt32: "" # NOTE: We DON'T give as suffix to `i32` literals so that we can rely on more cases # where WebGPU allows literals to be converted automatically! - of gtFloat32: "f" + of gtFloat32: "" # NOTE: float suffixes _already_ come with an `f` suffix in Nim! else: "" proc toAddressSpace(symKind: GpuSymbolKind): AddressSpace = From 7b51b65b86f89d698282de46a630592684645977 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 12:40:59 +0200 Subject: [PATCH 74/87] [wgsl] append `;` for aliases --- constantine/math_compiler/experimental/backends/wgsl.nim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index b307cdb5..0706dfc3 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -975,7 +975,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add "}" of gpuAlias: - result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) + # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped + # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons + # after alias definitions, but not after `struct`. 
Hence we add `;` manually here + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ";" of gpuObjConstr: result = gpuTypeToString(ast.ocType) & "(" From 2d5b367df2e6ca0ebaba04f089770ade6d5a8760 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 13:00:14 +0200 Subject: [PATCH 75/87] rewrite infix as `gpuCall` if arguments not basic types The idea is that if the arguments are not basic types, we will need a custom function to perform the infix operation. Most backends however do not support custom operators and hence actual `gpuBinOp` are not valid for custom types in general. Hence, we rewrite them as `gpuCall` nodes with non-symbol based naming. NOTE: Currently this does not handle the case where we might use an inbuilt type like `vec3` and its implementation of infix operators that _may_ be defined after all. We need to find an elegant solution for that. Either by checking if argument are of basic types (like in this commit) or if they are a type annotated with `{.builtin.}`. Alternatively, we could force the user to define operators for such inbuilt types (i.e. wrap them) and then if there is a wrapper that is marked `{.builtin.}` we don't replace infix by `gpuCall` either. 
--- .../experimental/backends/cuda.nim | 2 +- .../experimental/backends/wgsl.nim | 10 +-- .../math_compiler/experimental/gpu_types.nim | 10 ++- .../math_compiler/experimental/nim_to_gpu.nim | 63 ++++++++++++++----- 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 92f63efa..6f3f33bb 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -300,7 +300,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let l = ctx.genCuda(ast.bLeft) let r = ctx.genCuda(ast.bRight) result = indentStr & "(" & l & " " & - ast.bOp & " " & + ctx.genCuda(ast.bOp) & " " & r & ")" of gpuIdent: diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 0706dfc3..2e98b72c 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -499,15 +499,17 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = proc rewriteCompoundAssignment(n: GpuAst): GpuAst = doAssert n.kind == gpuBinOp - if n.bOp in ["<=", "==", ">=", "!="]: return n + if n.bOp.ident() in ["<=", "==", ">=", "!="]: return n template genAssign(left, rnode, op: typed): untyped = let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) - let op = n.bOp + let op = n.bOp.ident() if op.len >= 2 and op[^1] == '=': - result = genAssign(n.bLeft, n.bRight, op[0 .. ^2]) # all but last + var opAst = GpuAst(kind: gpuIdent, iName: op[0 .. 
^2]) + opAst.iSym = opAst.iName + result = genAssign(n.bLeft, n.bRight, opAst) # all but last else: # leave untouched result = n @@ -937,7 +939,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let l = ctx.genWebGpu(ast.bLeft) let r = ctx.genWebGpu(ast.bRight) result = indentStr & "(" & l & " " & - ast.bOp & " " & + ctx.genWebGpu(ast.bOp) & " " & r & ")" of gpuIdent: diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index cac35019..dd6e23df 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -127,8 +127,10 @@ type wCond*: GpuAst wBody*: GpuAst of gpuBinOp: - bOp*: string + bOp*: GpuAst # `gpuIdent` of the binary operation bLeft*, bRight*: GpuAst + # types of left and right nodes. Determined from Nim symbol associated with `bOp` + bLeftTyp*, bRightTyp*: GpuType of gpuVar: vName*: GpuAst ## Will be a `GpuIdent` vType*: GpuType @@ -391,9 +393,11 @@ proc clone*(ast: GpuAst): GpuAst = result.wBody = ast.wBody.clone() of gpuBinOp: result = GpuAst(kind: gpuBinOp) - result.bOp = ast.bOp + result.bOp = ast.bOp.clone() result.bLeft = ast.bLeft.clone() result.bRight = ast.bRight.clone() + result.bLeftTyp = ast.bLeftTyp.clone() + result.bRightTyp = ast.bRightTyp.clone() of gpuVar: result = GpuAst(kind: gpuVar) result.vName = ast.vName.clone() @@ -678,7 +682,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.wCond, indent + 2) result.add pretty(n.wBody, indent + 2) of gpuBinOp: - result.add idd("Ident", n.bOp) + result.add pretty(n.bOp, indent + 2) result.add pretty(n.bLeft, indent + 2) result.add pretty(n.bRight, indent + 2) of gpuVar: diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 725c17cb..06588ae0 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ 
b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -990,22 +990,53 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkInfix: result = GpuAst(kind: gpuBinOp) - # if left/right is boolean we need logical AND/OR, otherwise - # bitwise - let isBoolean = node[1].typeKind == ntyBool - result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name - result.bLeft = ctx.toGpuAst(node[1]) - result.bRight = ctx.toGpuAst(node[2]) - - # We patch the types of int / float literals. WGSL does not automatically convert literals - # to the target type. Determining the type here _can_ fail. In that case the - # `lType` field will just be `gtVoid`, like the default. - if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit: - # determine literal type based on `bRight` - result.bLeft.lType = nimToGpuType(node[2], allowToFail = true) - elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit: - # determine literal type based on `bLeft` - result.bRight.lType = nimToGpuType(node[1], allowToFail = true) + # Using `getType` to get the types of the arguuments + let typ = node[0].getTypeImpl() # e.g. + doAssert typ.kind == nnkProcTy, "Infix node is not a proc but: " & $typ.treerepr + # BracketExpr + # Sym "proc" + # Sym "int" <- return type + # Sym "int" <- left op type + # Sym "int" <- right op type + result.bLeftTyp = nimToGpuType(typ[0][1]) + result.bRightTyp = nimToGpuType(typ[0][2]) + # if either is not a base type (`gtBool .. gtSize_t`) we actually deal with a _function call_ + # instead of an binary operation. Will thus rewrite. + proc ofBasicType(t: GpuType, allowPtrLhs: bool): bool = + ## Determines if the given type is a basic POD type *or* a simple pointer to it. + ## This is because some infix nodes, e.g. `x += y` will have LHS arguments that are + ## `var T`, which appear as an implicit pointer here. 
+ ## + ## TODO: Handle the case of backend inbuilt special types (like `vec3`), which may indeed + ## have inbuilt infix operators. Either by checking if the type has a `{.builtin.}` pragma + ## _or_ if there is a wrapped proc for this operator and if so do not rewrite as `gpuCall` + ## if that exists. + result = (t.kind in gtBool .. gtSize_t) + if allowPtrLhs: + result = result or ((t.kind == gtPtr) and t.implicit and t.to.kind in gtBool .. gtSize_t) + + if not result.bLeftTyp.ofBasicType(true) or not result.bRightTyp.ofBasicType(false): + result = GpuAst(kind: gpuCall) + result.cName = ctx.getFnName(node[0]) + result.cArgs = @[ctx.toGpuAst(node[1]), ctx.toGpuAst(node[2])] + else: + # if left/right is boolean we need logical AND/OR, otherwise bitwise + let isBoolean = result.bLeftTyp.kind == gtBool + var op = GpuAst(kind: gpuIdent, iName: assignOp(node[0].repr, isBoolean)) # repr so that open sym choice gets correct name + op.iSym = op.iName + result.bOp = op + result.bLeft = ctx.toGpuAst(node[1]) + result.bRight = ctx.toGpuAst(node[2]) + + # We patch the types of int / float literals. WGSL does not automatically convert literals + # to the target type. Determining the type here _can_ fail. In that case the + # `lType` field will just be `gtVoid`, like the default. 
+ if result.bLeft.kind == gpuLit: # and result.bRight.kind != gpuLit: + # determine literal type based on `bRight` + result.bLeft.lType = result.bLeftTyp # nimToGpuType(node[2], allowToFail = true) + elif result.bRight.kind == gpuLit: # and result.bLeft.kind != gpuLit: + # determine literal type based on `bLeft` + result.bRight.lType = result.bRightTyp #nimToGpuType(node[1], allowToFail = true) of nnkDotExpr: ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT From c779258172c6711903c836bbeaf5b71b595fb000 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 17:00:47 +0200 Subject: [PATCH 76/87] support arbitrary values in array literals --- constantine/math_compiler/experimental/backends/cuda.nim | 2 +- constantine/math_compiler/experimental/backends/wgsl.nim | 2 +- constantine/math_compiler/experimental/gpu_types.nim | 7 ++++--- constantine/math_compiler/experimental/nim_to_gpu.nim | 5 ++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 6f3f33bb..34a96445 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -314,7 +314,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuArrayLit: result = "{" for i, el in ast.aValues: - result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el + result.add "(" & gpuTypeToString(ast.aLitType) & ")" & ctx.genCuda(el) if i < ast.aValues.high: result.add ", " result.add "}" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 2e98b72c..e9501440 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -959,7 +959,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): 
string = of gpuArrayLit: result = "array(" for i, el in ast.aValues: - result.add gpuTypeToString(ast.aLitType) & "(" & el & ")" + result.add gpuTypeToString(ast.aLitType) & "(" & ctx.genWebGpu(el) & ")" if i < ast.aValues.high: result.add ", " result.add ")" diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index dd6e23df..834d8fa5 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -155,7 +155,7 @@ type cValue*: GpuAst # not just a string to support different types easily cType*: GpuType of gpuArrayLit: - aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array + aValues*: seq[GpuAst] aLitType*: GpuType # type of first element of gpuBlock: isExpr*: bool ## Whether this block represents an expression, i.e. it returns something @@ -429,7 +429,8 @@ proc clone*(ast: GpuAst): GpuAst = result.cType = ast.cType.clone() of gpuArrayLit: result = GpuAst(kind: gpuArrayLit) - result.aValues = ast.aValues + for a in ast.aValues: + result.aValues.add a.clone() result.aLitType = ast.aLitType.clone() of gpuPrefix: result = GpuAst(kind: gpuPrefix) @@ -705,7 +706,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.cValue, indent + 2) of gpuArrayLit: for el in n.aValues: - result.add id(el) + result.add pretty(el, indent + 2) of gpuBlock: if n.blockLabel.len > 0: result.add id("Label", n.blockLabel) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 06588ae0..7176eb0f 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1241,10 +1241,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkBracket: let aLitTyp = nimToGpuType(node[0]) - var aValues = newSeq[string]() + var aValues = newSeq[GpuAst]() for el in node: 
- ## XXX: Support not just int literals - aValues.add $el.intVal + aValues.add ctx.toGpuAst(el) result = GpuAst(kind: gpuArrayLit, aValues: aValues, aLitType: aLitTyp) From e395ecd2e1a094feabd3980407bdab0c791a2531 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 17:04:04 +0200 Subject: [PATCH 77/87] handle generic instantiation with actual initializations I.e. instead of just: ```nim Vec[float32]() ``` being: ``` ObjConstr BracketExpr Sym "Vec3" Sym "float32" ``` Also handle the case of: ``` Vec3[float32](limbs: [1'f32, 1'f32, 1'f32]) ``` being: ``` ObjConstr BracketExpr Sym "Vec3" Sym "float32" ExprColonExpr Sym "limbs" Bracket Float32Lit 1.0 Float32Lit 1.0 Float32Lit 1.0 ``` --- .../math_compiler/experimental/nim_to_gpu.nim | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 7176eb0f..59d473a4 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -79,20 +79,41 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = raiseAssert "Not supported yet: " & $t proc parseTypeFields(node: NimNode): seq[GpuTypeField] + +proc getGenericTypeName(t: NimNode): string = + ## Returns the base name of the generic type, i.e. for + ## `Foo[Bar, Baz]` returns `Foo`. 
+ case t.kind + of nnkSym: result = t.strVal + of nnkBracketExpr: result = t[0].getGenericTypeName() + else: raiseAssert "Unexpected node kind for generic instantiation type: " & $t.treerepr + +proc parseGenericArgs(t: NimNode): seq[GpuType] = + case t.kind + of nnkSym: return # no generic arguments + of nnkBracketExpr: + for i in 1 ..< t.len: + result.add nimToGpuType(t[i]) + else: + raiseAssert "Unexpected node kind in parseGenericArgs: " & $t.treerepr + proc initGpuGenericInst(t: NimNode): GpuType = doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind case t.kind of nnkBracketExpr: # regular generic instantiation - result = GpuType(kind: gtGenericInst, gName: t[0].repr) - for i in 1 ..< t.len: # grab all generic arguments - let typ = nimToGpuType(t[i]) - result.gArgs.add typ + result = GpuType(kind: gtGenericInst, gName: getGenericTypeName(t)) + result.gArgs = parseGenericArgs(t) # now parse the object fields let impl = t.getTypeImpl() # impl for the `gFields` result.gFields = parseTypeFields(impl) of nnkObjConstr: - doAssert t.len == 1, "Unexpected length of ObjConstr node: " & $t.len & " of node: " & $t.treerepr - result = initGpuGenericInst(t[0]) + if t.len == 1: # Generic instantiation without arguments + result = initGpuGenericInst(t[0]) + elif t.len == 2: # ...and with arguments + doAssert t[1].kind == nnkExprColonExpr, "ObjConstr does not contain initialization as [1], but: " & $t.treerepr + result = initGpuGenericInst(t[0]) + else: + raiseAssert "Unexpected number of elements in `nnkObjConstr` node for generic instantiation: " & $t.treerepr of nnkSym: let impl = getTypeImpl(t) case impl.kind From ea2b36e29ca96b4dc612cf8b06bfd001b7540e2a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Tue, 2 Sep 2025 18:33:08 +0200 Subject: [PATCH 78/87] fix top level type definitions Because we still had the `farmTopLevel` adding a (now empty) element to the `globalBlocks` our array access to 
`ctx.globalBlocks[0]` to generate the types didn't actually emit anything. --- .../math_compiler/experimental/backends/common_utils.nim | 8 ++++---- constantine/math_compiler/experimental/backends/cuda.nim | 4 +--- constantine/math_compiler/experimental/backends/wgsl.nim | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim index 01ee5d33..076625cf 100644 --- a/constantine/math_compiler/experimental/backends/common_utils.nim +++ b/constantine/math_compiler/experimental/backends/common_utils.nim @@ -16,12 +16,12 @@ proc isGlobal*(fn: GpuAst): bool = doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind result = attGlobal in fn.pAttributes -proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = +proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock: var GpuAst) = ## Farms the top level of the code for functions, variable and type definition. ## All functions are added to the `allFnTab`, while only global ones (or even only ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining ## logic. - ## Variables and types are collected in `varBlock` and `typBlock`. + ## Variables are collected in `varBlock`. case ast.kind of gpuProc: ctx.allFnTab[ast.pName] = ast @@ -32,10 +32,10 @@ proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, t of gpuBlock: # could be a type definition or global variable for ch in ast: - ctx.farmTopLevel(ch, kernel, varBlock, typBlock) + ctx.farmTopLevel(ch, kernel, varBlock) of gpuVar, gpuConstexpr: varBlock.statements.add ast of gpuTypeDef, gpuAlias: - typBlock.statements.add ast + raiseAssert "Unexpected type def / alias def found. 
These should be in `ctx.types` now: " & $ast else: discard diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 34a96445..ec80aabf 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -170,10 +170,8 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 2. Fill table with all *global* functions or *only* the specific `kernel` # if any given var varBlock = GpuAst(kind: gpuBlock) - var typBlock = GpuAst(kind: gpuBlock) - ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ctx.globalBlocks.add typBlock ## XXX: `typBlock` should now always be empty, as we pass all ## found types into `ctx.types` diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index e9501440..716f4fb8 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -732,10 +732,8 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 1. 
Fill table with all *global* functions or *only* the specific `kernel` # if any given var varBlock = GpuAst(kind: gpuBlock) - var typBlock = GpuAst(kind: gpuBlock) - ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ctx.globalBlocks.add typBlock ## XXX: `typBlock` should now always be empty, as we pass all ## found types into `ctx.types` @@ -743,8 +741,10 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for k, v in pairs(ctx.genericInsts): ctx.allFnTab[k] = v # And all the known types + var typBlock = GpuAst(kind: gpuBlock) for k, typ in pairs(ctx.types): - ctx.globalBlocks.add typ + typBlock.statements.add typ + ctx.globalBlocks.add typBlock # 2. Remove all arguments from global functions, as none are allowed in WGSL for (fnIdent, fn) in mpairs(ctx.fnTab): # mutating the function in the table From 96508f6bd0e30e10c2ae3c8c6eb3e503e0e90f58 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 11:49:20 +0200 Subject: [PATCH 79/87] [cuda] add pass to strip `deref` if found inside `index` for pointers But only strip if not pointer to an array type! --- .../experimental/backends/cuda.nim | 28 +++++++++++++++++++ .../math_compiler/experimental/nim_to_gpu.nim | 24 ++-------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index ec80aabf..3e0e523a 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -152,6 +152,30 @@ proc scanFunctions(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.scanFunctions(ch) +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = + ## Addresses other AST patterns that need to be rewritten on CUDA. 
Aspects + ## that are rewritten include: + ## + ## - `Index` of `Deref` of `Ident` needs to be rewritten to `Index` of `Ident` if the + ## ident is a pointer type, because `[]` is syntactic sugar for pointer arithmetic + ## (unless the argument is a pointer to a static array) + case n.kind + of gpuIndex: + ## TODO: Assuming we have a more complicated expression instead of a `gpuIdent` in the deref + ## we won't perform replacement, but likely we should. Might use something like `determineIdent` + ## as used on WGSL in the future. Anyway, worts case this will lead to a NVRTC compile time error. + if n.iArr.kind == gpuDeref and + n.iArr.dOf.kind == gpuIdent and + n.iArr.dOf.iTyp.kind == gtPtr and # identifier is a pointer? + n.iArr.dOf.iTyp.to.kind != gtArray: # but not to an array + n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) @@ -184,6 +208,10 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = let fnOrig = ctx.allFnTab[fnIdent] ctx.scanFunctions(fn) + # 4. Finalize the code by performing some required AST transformations to make the code valid. + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ## The actual CUDA code generator. 
let indentStr = " ".repeat(indent) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 59d473a4..1f63c749 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1286,27 +1286,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # `HiddenAddr` appears for accesses to `var` passed arguments result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0])) - of nnkHiddenDeref: - case node.typeKind - of ntyUncheckedArray: - # `getTypeInst(node)` would yield: - # BracketExpr - # Sym "UncheckedArray" - # Sym "uint32" - # i.e. it is a `ptr UncheckedArray[T]` - # In this case we just ignore the deref, because on the CUDA - # side it is just a plain pointer array we index into using - # `foo[i]`. - result = ctx.toGpuAst(node[0]) - else: - # Otherwise we treat it like a regular deref - # HiddenDeref - # Sym "x" - # With e.g. `getTypeInst(node) = Sym "BigInt"` - # and `node.typeKind = ntyObject` - # due to a `var` parameter - result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) - of nnkDerefExpr: #, nnkHiddenDeref: + of nnkDerefExpr, nnkHiddenDeref: + # treat hidden and regular deref the same nowadays. On some backends may strip derefs, if + # they appear e.g. 
in an `gpuIndex` (CUDA) result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) of nnkConstDef: From c823f9feb159534f774b82b369bc9d16983c5d16 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 11:53:42 +0200 Subject: [PATCH 80/87] remove old comment --- constantine/math_compiler/experimental/backends/cuda.nim | 2 -- 1 file changed, 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 3e0e523a..8544a666 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -196,8 +196,6 @@ proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = var varBlock = GpuAst(kind: gpuBlock) ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock - ## XXX: `typBlock` should now always be empty, as we pass all - ## found types into `ctx.types` # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. From 6527759bb71f0c951790b84ba424e5bf97cc7fa2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 15:23:50 +0200 Subject: [PATCH 81/87] better handle replacement of derefs This allows us to make a better choice about when to replace and when not to replace. --- .../experimental/backends/cuda.nim | 63 ++++++++++++++++--- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 8544a666..37dbfb8f 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -152,6 +152,56 @@ proc scanFunctions(ctx: var GpuContext, n: GpuAst) = for ch in n: ctx.scanFunctions(ch) +proc getFieldType(t: GpuType, field: GpuAst): GpuType = + ## Returns the type of the field. 
`t` must be an object or generic instantiation. + ## `field` must be an ident. + doAssert field.kind == gpuIdent, "Field is not an ident: " & $field + doAssert t.kind in [gtObject, gtGenericInst] + let flds = if t.kind == gtObject: t.oFields + else: t.gFields + result = GpuType(kind: gtInvalid) + for f in flds: + if f.name == field.ident(): + return f.typ + +proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType = + ## Tries to determine the underlying type of the AST. + ## + ## If `typeOfIndex` is `true`, we return the type of the index we access. Otherwise + ## we return the type of the array / pointer. + ## + ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types! + template dfl(): untyped = GpuType(kind: gtInvalid) + case arg.kind + of gpuIdent: arg.iTyp + of gpuAddr: GpuType(kind: gtPtr, to: ctx.getType(arg.aOf)) + of gpuDeref: + let argTyp = ctx.getType(arg.dOf) + doAssert argTyp.kind == gtPtr + argTyp.to + of gpuCall: dfl() + of gpuIndex: + let arrType = ctx.getType(arg.iArr) + if typeOfIndex: + case arrType.kind + of gtPtr: arrType.to + of gtUA: arrType.uaTo + of gtArray: arrType.aTyp + else: raiseAssert "`gpuIndex` cannot be of a non pointer / array type: " & $arrType + else: + arrType + of gpuDot: + let parentTyp = ctx.getType(arg.dParent) + parentTyp.getFieldType(arg.dField) + of gpuLit: arg.lType + of gpuBinOp: dfl() ## XXX: store resulting type of `gpuBinOp`! + #of gpuBlock: arg.statements[^1].getType() + of gpuPrefix: ctx.getType(arg.pVal) + of gpuConv: arg.convTo + of gpuCast: arg.cTo # ident of the thing we cast + else: + raiseAssert "Not implemented to determine type from node: " & $arg + proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = ## Addresses other AST patterns that need to be rewritten on CUDA. 
Aspects ## that are rewritten include: @@ -161,14 +211,11 @@ proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = ## (unless the argument is a pointer to a static array) case n.kind of gpuIndex: - ## TODO: Assuming we have a more complicated expression instead of a `gpuIdent` in the deref - ## we won't perform replacement, but likely we should. Might use something like `determineIdent` - ## as used on WGSL in the future. Anyway, worts case this will lead to a NVRTC compile time error. - if n.iArr.kind == gpuDeref and - n.iArr.dOf.kind == gpuIdent and - n.iArr.dOf.iTyp.kind == gtPtr and # identifier is a pointer? - n.iArr.dOf.iTyp.to.kind != gtArray: # but not to an array - n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) + if n.iArr.kind == gpuDeref: + # get type of deref'd node, but do not fold `gpuIndex` (i.e. get type of collection) + let typ = ctx.getType(n, typeOfIndex = false) + if typ.kind != gtArray: + n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) else: for ch in mitems(n): ctx.makeCodeValid(ch) From 3095d4fd649241104d9405d6ce5277c05563e128 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 3 Sep 2025 18:08:49 +0200 Subject: [PATCH 82/87] remove Nim gensym'd suffix from `tmpTuple` variables I.e. 
variables that correspond to tuple unpacking in Nim --- constantine/math_compiler/experimental/nim_to_gpu.nim | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 1f63c749..ec23ac1a 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -1104,6 +1104,10 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.iName == "_": result.iName = "tmp_" & $ctx.genSymCount inc ctx.genSymCount + elif result.iName.startsWith("tmpTuple_"): # will have a Nim gensym'd suffix, replace by custom counter + result.iName = "tmpTuple_" & $ctx.genSymCount + result.iSym = result.iName & "_" & node.signatureHash() # and update the iSym to not be based on Nim's value either + inc ctx.genSymCount ctx.sigTab[s] = result else: result = ctx.sigTab[s] From 59c16b676d7de84a0d76381306da3326e5492207 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:40:23 +0200 Subject: [PATCH 83/87] replace single letter strings by characters --- .../experimental/backends/cuda.nim | 82 +++++++++---------- .../experimental/backends/wgsl.nim | 76 ++++++++--------- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 37dbfb8f..4899c3f5 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -70,7 +70,7 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, # so as our ident we pass `theIdent = (*)` and generate the type for the internal # array type, which yields e.g. `BigInt [4]`. 
let ptrStar = gpuTypeToString(t.kind) - result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")") + result = gpuTypeToString(t.to, '(' & ptrStar & ident & ')') skipIdent = true else: let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent) @@ -87,30 +87,30 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, of gtArray: # nested array let typ = getInnerArrayType(t) # get inner most type let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...` - result = typ & " " & ident & lengths + result = typ & ' ' & ident & lengths else: # NOTE: Nested arrays don't have an inner identifier! if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & "[]" else: - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & '[' & $t.aLen & ']' skipIdent = true of gtGenericInst: # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. result = t.gName if t.gArgs.len > 0: - result.add "_" + result.add '_' for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: - result.add "_" + result.add '_' of gtObject: result = t.name of gtUA: result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T? 
else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident - result.add " " & ident + result.add ' ' & ident proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = ## Returns the correct function with its return type @@ -275,30 +275,30 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) # extern "C" is needed to avoid name mangling - result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & + result = indentStr & "extern \"C\" " & attrs.join(" ") & ' ' & fnSig if ast.forwardDeclare: - result.add ";" + result.add ';' else: result.add "{\n" result &= ctx.genCuda(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add ctx.genCuda(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: - let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & " " + let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & ' ' else: "" result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident()) # If there is an initialization, the type might require a memcpy @@ -320,35 +320,35 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # skip semicolon in the condition. 
Otherwise can lead to problematic code ctx.withoutSemicolon: # skip semicolon for if bodies result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n" - result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genCuda(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(int " & ast.fVar.ident() & " = " & ctx.genCuda(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genCuda(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genCuda(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.fBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n" - result &= ctx.genCuda(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField) + result = ctx.genCuda(ast.dParent) & '.' 
& ctx.genCuda(ast.dField) of gpuIndex: - result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]" + result = ctx.genCuda(ast.iArr) & '[' & ctx.genCuda(ast.iIndex) & ']' of gpuCall: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -370,25 +370,25 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ctx.withoutSemicolon: let l = ctx.genCuda(ast.bLeft) let r = ctx.genCuda(ast.bRight) - result = indentStr & "(" & l & " " & - ctx.genCuda(ast.bOp) & " " & - r & ")" + result = indentStr & '(' & l & ' ' & + ctx.genCuda(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder else: result = ast.lValue of gpuArrayLit: result = "{" for i, el in ast.aValues: - result.add "(" & gpuTypeToString(ast.aLitType) & ")" & ctx.genCuda(el) + result.add '(' & gpuTypeToString(ast.aLitType) & ')' & ctx.genCuda(el) if i < ast.aValues.high: result.add ", " - result.add "}" + result.add '}' of gpuReturn: result = indentStr & "return " & ctx.genCuda(ast.rValue) @@ -400,7 +400,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "struct " & gpuTypeToString(ast.tTyp) & "{\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" - result.add "}" + result.add '}' of gpuObjConstr: result = "{" @@ -408,7 +408,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add ctx.genCuda(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add "}" + result.add '}' of gpuInlineAsm: result = indentStr & "asm(" & ast.stmt.strip & ");" @@ -417,15 +417,15 @@ proc 
genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = "(" & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.convExpr) + result = '(' & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.convExpr) of gpuCast: - result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr) + result = '(' & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.cExpr) of gpuAddr: - result = "(&" & ctx.genCuda(ast.aOf) & ")" + result = "(&" & ctx.genCuda(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genCuda(ast.dOf) & ")" + result = "(*" & ctx.genCuda(ast.dOf) & ')' of gpuConstexpr: ## TODO: We need to change the code such that we emit `constexpr` inside of procs and @@ -436,7 +436,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = if ast.cType.kind == gtArray: result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) else: - result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & ' ' & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) else: echo "Unhandled node kind in genCuda: ", ast.kind @@ -454,7 +454,7 @@ proc codegen*(ctx: var GpuContext): string = for (fnIdent, fn) in fns: let fnC = fn.clone() fnC.forwardDeclare = true - result.add ctx.genCuda(fnC) & "\n" + result.add ctx.genCuda(fnC) & '\n' result.add "\n\n" for fnIdent, fn in ctx.fnTab: diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 716f4fb8..35a80f32 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ 
b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -117,13 +117,13 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. result = t.gName if t.gArgs.len > 0: - result.add "_" + result.add '_' for i, g in t.gArgs: result.add gpuTypeToString(g) if i < t.gArgs.high: - result.add "_" + result.add '_' of gtObject: result = t.name - of gtUA: result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">" + of gtUA: result = gpuTypeToString(t.kind) & '<' & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & '>' else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident @@ -295,7 +295,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string ## the information we precisely want to extract (different symbol kind etc) would make ## it so that we cannot look up elements. doAssert n.kind == gpuCall, "Not a call, but: " & $n.kind - result = n.cName.ident() & "_" + result = n.cName.ident() & '_' for i, arg in n.cArgs: let p = params[i] var s: string @@ -312,7 +312,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string s = shortAddrSpace(addrSpace) & m result.add s if i < n.cArgs.high: - result.add "_" + result.add '_' proc makeFnGeneric(fn: GpuAst, gi: GenericInst): GpuAst = ## Returns a (shallow) copy of the input function (which is a clone of the @@ -823,20 +823,20 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "fn " & fnSig & " {\n" result &= ctx.genWebGpu(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add 
ctx.genWebGpu(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: let letOrVar = if ast.vMutable: "var" else: "let" @@ -872,7 +872,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and # we patched the `bool -> i32` and thus need to convert it. - result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ")" + result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ')' else: result = indentStr & ctx.genWebGpu(ast.aLeft) & " = " & ctx.genWebGpu(ast.aRight) @@ -886,36 +886,36 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "if (false) {\n" else: result = indentStr & "if (" & ctx.genWebGpu(ast.ifCond) & ") {\n" - result &= ctx.genWebGpu(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genWebGpu(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(var " & ast.fVar.ident() & ": i32 = " & ctx.genWebGpu(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genWebGpu(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genWebGpu(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.fBody, indent + 1) & '\n' + result &= indentStr & 
'}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genWebGpu(ast.wCond) & "){\n" - result &= ctx.genWebGpu(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genWebGpu(ast.dParent) & "." & ctx.genWebGpu(ast.dField) + result = ctx.genWebGpu(ast.dParent) & '.' & ctx.genWebGpu(ast.dField) of gpuIndex: - result = ctx.genWebGpu(ast.iArr) & "[" & ctx.genWebGpu(ast.iIndex) & "]" + result = ctx.genWebGpu(ast.iArr) & '[' & ctx.genWebGpu(ast.iIndex) & ']' of gpuCall: ctx.withoutSemicolon: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -938,15 +938,15 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ctx.withoutSemicolon: let l = ctx.genWebGpu(ast.bLeft) let r = ctx.genWebGpu(ast.bRight) - result = indentStr & "(" & l & " " & - ctx.genWebGpu(ast.bOp) & " " & - r & ")" + result = indentStr & '(' & l & ' ' & + ctx.genWebGpu(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": ## TODO: We could "manually" construct a zero version! ## NOTE: There *are* default initializations to zero. 
Just not for fields that @@ -959,10 +959,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuArrayLit: result = "array(" for i, el in ast.aValues: - result.add gpuTypeToString(ast.aLitType) & "(" & ctx.genWebGpu(el) & ")" + result.add gpuTypeToString(ast.aLitType) & '(' & ctx.genWebGpu(el) & ')' if i < ast.aValues.high: result.add ", " - result.add ")" + result.add ')' of gpuReturn: result = indentStr & "return " & ctx.genWebGpu(ast.rValue) @@ -974,16 +974,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = "struct " & gpuTypeToString(ast.tTyp) & " {\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" - result.add "}" + result.add '}' of gpuAlias: # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons # after alias definitions, but not after `struct`. Hence we add `;` manually here - result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ";" + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ';' of gpuObjConstr: - result = gpuTypeToString(ast.ocType) & "(" + result = gpuTypeToString(ast.ocType) & '(' for i, el in ast.ocFields: if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": # use type to construct a default value @@ -993,7 +993,7 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add ctx.genWebGpu(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add ")" + result.add ')' of gpuInlineAsm: raiseAssert "Inline assembly not supported on the WebGPU target." 
@@ -1002,16 +1002,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & "(" & ctx.genWebGpu(ast.convExpr) & ")" + result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & '(' & ctx.genWebGpu(ast.convExpr) & ')' of gpuCast: - result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ")" + result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ')' of gpuAddr: - result = "(&" & ctx.genWebGpu(ast.aOf) & ")" + result = "(&" & ctx.genWebGpu(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genWebGpu(ast.dOf) & ")" + result = "(*" & ctx.genWebGpu(ast.dOf) & ')' of gpuConstexpr: result = indentStr & "const " & ctx.genWebGpu(ast.cIdent) & ": " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " = " & ctx.genWebGpu(ast.cValue) @@ -1050,7 +1050,7 @@ proc codegen*(ctx: var GpuContext): string = # 1. Generate the header for all global variables for id, g in ctx.globals: result.add genGlobal(g) - result.add "\n" + result.add '\n' # 2. 
generate code for the global blocks (types, global vars etc) for blk in ctx.globalBlocks: From 096b5bc7ae3315e028d6c760da7afee56c20820f Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:40:44 +0200 Subject: [PATCH 84/87] extend `maybeAddType` to find types behind Ptr/UA/Array --- constantine/math_compiler/experimental/nim_to_gpu.nim | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index ec23ac1a..5373581b 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -616,11 +616,21 @@ proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead result = n.getTypeInst.nimToGpuType() +proc stripPtrOrArrayType(t: GpuType): GpuType = + ## Strips any pointer or array type to return any struct / generic instantiation + ## it might contain + case t.kind + of gtPtr: result = stripPtrOrArrayType t.to + of gtUA: result = stripPtrOrArrayType t.uaTo + of gtArray: result = stripPtrOrArrayType t.aTyp + else: result = t + proc maybeAddType*(ctx: var GpuContext, typ: GpuType) = ## Adds the given type to the table of known types, if it is some kind of ## object type. ## ## XXX: What about aliases and distincts? 
+ let typ = typ.stripPtrOrArrayType() # get any underlying type if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types: ctx.types[typ] = toTypeDef(typ) From 1aaf1b8b0f8fd6b0400f03f9b4b7bab290bc858c Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:53:50 +0200 Subject: [PATCH 85/87] move all `builtins` into separate files, one for each backend --- .../experimental/builtins/builtins.nim | 19 ++++ .../experimental/builtins/common_builtins.nim | 46 +++++++++ .../experimental/builtins/cuda_builtins.nim | 48 ++++++++++ .../experimental/builtins/wgsl_builtins.nim | 26 ++++++ .../experimental/gpu_compiler.nim | 93 +------------------ 5 files changed, 143 insertions(+), 89 deletions(-) create mode 100644 constantine/math_compiler/experimental/builtins/builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/common_builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/cuda_builtins.nim create mode 100644 constantine/math_compiler/experimental/builtins/wgsl_builtins.nim diff --git a/constantine/math_compiler/experimental/builtins/builtins.nim b/constantine/math_compiler/experimental/builtins/builtins.nim new file mode 100644 index 00000000..da079e83 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/builtins.nim @@ -0,0 +1,19 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# NOTE: For the moment we import and export builtins here for all backends. 
+# Once we change the code to make single backends importable on their own, +# this will be changed and these builtins will be imported/exported in the +# corresponding CUDA/WGSL etc module the user needs to import. +import ./common_builtins +import ./cuda_builtins +import ./wgsl_builtins + +export common_builtins +export cuda_builtins +export wgsl_builtins diff --git a/constantine/math_compiler/experimental/builtins/common_builtins.nim b/constantine/math_compiler/experimental/builtins/common_builtins.nim new file mode 100644 index 00000000..b9a144a5 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/common_builtins.nim @@ -0,0 +1,46 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +#import std / [macros, strutils, sequtils, options, sugar, tables, strformat, hashes, sets] +# +#import ./gpu_types +#import ./backends/backends +#import ./nim_to_gpu +# +#export gpu_types + +template nimonly*(): untyped {.pragma.} +template cudaName*(s: string): untyped {.pragma.} + +## Dummy data for the typed nature of the `cuda` macro. These define commonly used +## CUDA specific names so that they produce valid Nim code in the context of a typed macro. +template global*() {.pragma.} +template device*() {.pragma.} +template forceinline*() {.pragma.} + +## If attached to a function, type or variable it will refer to a built in +## in the target backend. This is used for all the functions, types and variables +## defined below to indicate that we do not intend to generate code for them. 
+template builtin*() {.pragma.} +# If attached to a `var` it will be treated as a +# `__constant__`! Only useful if you want to define a +# constant without initializing it (and then use +# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it +# before executing the kernel) +template constant*() {.pragma.} + +## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its +## own `extern` pragma (due to requiring an argument it cannot be reused): +## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma +template cuExtern*(): untyped {.pragma.} +template shared*(): untyped {.pragma.} +template private*(): untyped {.pragma.} +## You would typically use `cuExtern` and `shared` together: +## `var x {.cuExtern, shared.}: array[N, Foo]` +## for example to declare a constant array that is filled by the +## host before kernel execution. diff --git a/constantine/math_compiler/experimental/builtins/cuda_builtins.nim b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim new file mode 100644 index 00000000..5d002f93 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim @@ -0,0 +1,48 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import ./common_builtins + +type + Dim* = cint ## dummy to have access to math + NvBlockIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvBlockDim = object + x*: Dim + y*: Dim + z*: Dim + NvThreadIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvGridDim = object + x*: Dim + y*: Dim + z*: Dim + + +## These are dummy elements to make CUDA block / thread index / dim +## access possible in the *typed* `cuda` macro. It cannot be `const`, +## because then the typed code would evaluate the values before we +## can work with it from the typed macro. +let blockIdx* {.builtin.} = NvBlockIdx() +let blockDim* {.builtin.} = NvBlockDim() +let gridDim* {.builtin.} = NvGridDim() +let threadIdx* {.builtin.} = NvThreadIdx() + +## Similar for procs. They don't need any implementation, as they won't ever be actually called. +proc printf*(fmt: string) {.varargs, builtin.} = discard +proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard + +## While you can use `malloc` on device with small sizes, it is usually not +## recommended to do so. +proc malloc*(size: csize_t): pointer {.builtin.} = discard +proc free*(p: pointer) {.builtin.} = discard +proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard diff --git a/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim new file mode 100644 index 00000000..fe7e11f0 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim @@ -0,0 +1,26 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +import ./common_builtins + +type + DimWgsl = uint32 + WgslGridDim = object + x*: DimWgsl + y*: DimWgsl + z*: DimWgsl + +## WebGPU specific +let global_id* {.builtin.} = WgslGridDim() +let num_workgroups* {.builtin.} = WgslGridDim() + +## WebGPU select +proc select*[T](f, t: T, cond: bool): T {.builtin.} = + # Implementation to run WebGPU code on CPU + if cond: t + else: f diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 01548f7b..58ad4207 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -14,97 +14,12 @@ import ./nim_to_gpu export gpu_types -template nimonly*(): untyped {.pragma.} -template cudaName*(s: string): untyped {.pragma.} - -## Dummy data for the typed nature of the `cuda` macro. These define commonly used -## CUDA specific names so that they produce valid Nim code in the context of a typed macro. -template global*() {.pragma.} -template device*() {.pragma.} -template forceinline*() {.pragma.} - -## If attached to a function, type or variable it will refer to a built in -## in the target backend. This is used for all the functions, types and variables -## defined below to indicate that we do not intend to generate code for them. -template builtin*() {.pragma.} - -# If attached to a `var` it will be treated as a -# `__constant__`! 
Only useful if you want to define a -# constant without initializing it (and then use -# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it -# before executing the kernel) -template constant*() {.pragma.} -type - Dim* = cint ## dummy to have access to math - NvBlockIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvBlockDim = object - x*: Dim - y*: Dim - z*: Dim - NvThreadIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvGridDim = object - x*: Dim - y*: Dim - z*: Dim - - DimWgsl = uint32 - WgslGridDim = object - x*: DimWgsl - y*: DimWgsl - z*: DimWgsl - -## These are dummy elements to make CUDA block / thread index / dim -## access possible in the *typed* `cuda` macro. It cannot be `const`, -## because then the typed code would evaluate the values before we -## can work with it from the typed macro. -let blockIdx* {.builtin.} = NvBlockIdx() -let blockDim* {.builtin.} = NvBlockDim() -let gridDim* {.builtin.} = NvGridDim() -let threadIdx* {.builtin.} = NvThreadIdx() - -## WebGPU specific -let global_id* {.builtin.} = WgslGridDim() -let num_workgroups* {.builtin.} = WgslGridDim() - -## Similar for procs. They don't need any implementation, as they won't ever be actually called. 
-proc printf*(fmt: string) {.varargs, builtin.} = discard -proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard - -## WebGPU select -proc select*[T](f, t: T, cond: bool): T {.builtin.} = - # Implementation to run WebGPU code on CPU - if cond: t - else: f - -## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its -## own `extern` pragma (due to requiring an argument it cannot be reused): -## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma -template cuExtern*(): untyped {.pragma.} -template shared*(): untyped {.pragma.} -template private*(): untyped {.pragma.} -## You would typically use `cuExtern` and `shared` together: -## `var x {.cuExtern, shared.}: array[N, Foo]` -## for example to declare a constant array that is filled by the -## host before kernel execution. - -## While you can use `malloc` on device with small sizes, it is usually not -## recommended to do so. -proc malloc*(size: csize_t): pointer {.builtin.} = discard -proc free*(p: pointer) {.builtin.} = discard -proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard +import builtins/builtins # all the builtins for the backend to make the Nim compiler happy +export builtins macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) + ## Converts the body of this macro into a `GpuAst` from where it can be converted + ## into CUDA or WGSL code at runtime. 
var ctx = GpuContext() let ast = ctx.toGpuAst(body) let genProcs = toSeq(ctx.genericInsts.values) From 71c379f7c84a07973f456538c18c0ba500cb46d3 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:54:28 +0200 Subject: [PATCH 86/87] update doc comment of `cuda` macro --- .../math_compiler/experimental/gpu_compiler.nim | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 58ad4207..0a8b7362 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -28,11 +28,11 @@ macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = newLit((g, ast)) macro cuda*(body: typed): string = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) + ## Converts the body of this macro into a `GpuAst` and from there into a string of + ## CUDA or WGSL code. + ## + ## TODO: make `cuda` choose CUDA backend, `wgsl` WGSL etc. Need to change code + ## that chooses backend etc. 
#echo body.treerepr var ctx = GpuContext() let gpuAst = ctx.toGpuAst(body) From 05321435f3df621f48067a1bb27d8592533751bc Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 10 Sep 2025 15:56:57 +0200 Subject: [PATCH 87/87] improve explanation of `typeOfIndex` in CUDA's `getType` helper --- .../math_compiler/experimental/backends/cuda.nim | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 4899c3f5..7e8e4a3b 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -167,8 +167,14 @@ proc getFieldType(t: GpuType, field: GpuAst): GpuType = proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType = ## Tries to determine the underlying type of the AST. ## - ## If `typeOfIndex` is `true`, we return the type of the index we access. Otherwise - ## we return the type of the array / pointer. + ## If `typeOfIndex` is `true`, in case of a `gpuIndex` node, we return the type of the + ## element behind the index access `[]`. Otherwise we return the type of the array / pointer. + ## + ## Let `foo` be an array `let foo = [1'f32, 2, 3]`. + ## Let `n` be a `GpuAst` node of kind `gpuIndex` corresponding to `foo[1]`. + ## Then `ctx.getType(n, typeOfIndex = true)` returns `float32` while + ## `ctx.getType(n, typeOfIndex = false)` returns `array[3, float32]` (as + ## a `GpuType`). ## ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types! template dfl(): untyped = GpuType(kind: gtInvalid)