diff --git a/constantine/math_compiler/experimental/backends/backends.nim b/constantine/math_compiler/experimental/backends/backends.nim index f2b71152..95fbda87 100644 --- a/constantine/math_compiler/experimental/backends/backends.nim +++ b/constantine/math_compiler/experimental/backends/backends.nim @@ -14,27 +14,11 @@ when defined(cuda): else: const Backend* = bkWGSL -proc gpuTypeToString*(t: GpuTypeKind): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t) - of bkWGSL: wgsl.gpuTypeToString(t) - -proc gpuTypeToString*(t: GpuType, ident = newGpuIdent(), allowArrayToPtr = false, - allowEmptyIdent = false, - ): string = - case Backend - of bkCuda: cuda.gpuTypeToString(t, ident.ident(), allowArrayToPtr, allowEmptyIdent) - of bkWGSL: wgsl.gpuTypeToString(t, ident, allowArrayToPtr, allowEmptyIdent) - -proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = - case Backend - of bkCuda: cuda.genFunctionType(typ, fn, fnArgs) - of bkWGSL: wgsl.genFunctionType(typ, fn, fnArgs) - proc codegen*(ctx: var GpuContext, ast: GpuAst, kernel: string = ""): string = case Backend of bkCuda: - result = ctx.genCuda(ast) + cuda.preprocess(ctx, ast, kernel) + result = cuda.codegen(ctx) of bkWGSL: - ctx.storagePass(ast, kernel) + wgsl.preprocess(ctx, ast, kernel) result = wgsl.codegen(ctx) diff --git a/constantine/math_compiler/experimental/backends/common_utils.nim b/constantine/math_compiler/experimental/backends/common_utils.nim index 1def1b1f..076625cf 100644 --- a/constantine/math_compiler/experimental/backends/common_utils.nim +++ b/constantine/math_compiler/experimental/backends/common_utils.nim @@ -6,9 +6,36 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. +import std / tables import ../gpu_types -# import ./backends proc address*(a: string): string = "&" & a - proc size*(a: string): string = "sizeof(" & a & ")" + +proc isGlobal*(fn: GpuAst): bool = + doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind + result = attGlobal in fn.pAttributes + +proc farmTopLevel*(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock: var GpuAst) = + ## Farms the top level of the code for functions, variable and type definition. + ## All functions are added to the `allFnTab`, while only global ones (or even only + ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining + ## logic. + ## Variables are collected in `varBlock`. + case ast.kind + of gpuProc: + ctx.allFnTab[ast.pName] = ast + if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + elif kernel.len == 0 and ast.isGlobal(): + ctx.fnTab[ast.pName] = ast.clone() # store global function extra + of gpuBlock: + # could be a type definition or global variable + for ch in ast: + ctx.farmTopLevel(ch, kernel, varBlock) + of gpuVar, gpuConstexpr: + varBlock.statements.add ast + of gpuTypeDef, gpuAlias: + raiseAssert "Unexpected type def / alias def found. These should be in `ctx.types` now: " & $ast + else: + discard diff --git a/constantine/math_compiler/experimental/backends/cuda.nim b/constantine/math_compiler/experimental/backends/cuda.nim index 227bde79..7e8e4a3b 100644 --- a/constantine/math_compiler/experimental/backends/cuda.nim +++ b/constantine/math_compiler/experimental/backends/cuda.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. -import std / [macros, strformat, strutils, sugar, sequtils] +import std / [macros, strformat, strutils, sugar, sequtils, tables, algorithm] import ../gpu_types import ./common_utils @@ -43,6 +43,7 @@ proc gpuTypeToString*(t: GpuTypeKind): string = of gtVoidPtr: "void*" of gtObject: "struct" of gtString: "const char*" + of gtUA: "" # `UncheckedArray` by itself is nothing in CUDA else: raiseAssert "Invalid type : " & $t @@ -57,6 +58,10 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, var skipIdent = false case t.kind of gtPtr: + var t = t # if `ptr UncheckedArray`, remove the `gtUA` layer. No meaning on CUDA + if t.to.kind == gtUA: + t.to = t.to.uaTo + if t.to.kind == gtArray: # ptr to array type # need to pass `*` for the pointer into the identifier, i.e. # `state: var array[4, BigInt]` @@ -65,7 +70,7 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, # so as our ident we pass `theIdent = (*)` and generate the type for the internal # array type, which yields e.g. `BigInt [4]`. let ptrStar = gpuTypeToString(t.kind) - result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")") + result = gpuTypeToString(t.to, '(' & ptrStar & ident & ')') skipIdent = true else: let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent) @@ -82,19 +87,30 @@ proc gpuTypeToString*(t: GpuType, ident: string = "", allowArrayToPtr = false, of gtArray: # nested array let typ = getInnerArrayType(t) # get inner most type let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...` - result = typ & " " & ident & lengths + result = typ & ' ' & ident & lengths else: # NOTE: Nested arrays don't have an inner identifier! if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & "[]" else: - result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & ' ' & ident & '[' & $t.aLen & ']' skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. + result = t.gName + if t.gArgs.len > 0: + result.add '_' + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add '_' of gtObject: result = t.name + of gtUA: result = gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) ## XXX: unchecked array just T? else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident - result.add " " & ident + result.add ' ' & ident proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = ## Returns the correct function with its return type @@ -114,11 +130,139 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = proc genMemcpy(lhs, rhs, size: string): string = result = &"memcpy({lhs}, {rhs}, {size})" +proc scanFunctions(ctx: var GpuContext, n: GpuAst) = + ## Iterates over the given function and checks for all `gpuCall` nodes. Any function + ## called in the scope is added to `fnTab`. This is a form of dead code elimination. + case n.kind + of gpuCall: + let fn = n.cName + if fn in ctx.allFnTab: + # Check if any of the parameters are pointers (otherwise non generic) + if fn notin ctx.fnTab: # function not known, add to `fnTab` (i.e. avoid code elimination) + let fnCalled = ctx.allFnTab[fn] + ctx.fnTab[fn] = fnCalled + # still "scan for functions", i.e. fill `fnTab` from inner calls + for ch in fnCalled: + ctx.scanFunctions(ch) + # else we don't do anything for this function + # Harvest functions from arguments to this call! + for arg in n.cArgs: + ctx.scanFunctions(arg) + else: + for ch in n: + ctx.scanFunctions(ch) + +proc getFieldType(t: GpuType, field: GpuAst): GpuType = + ## Returns the type of the field. `t` must be an object or generic instantiation. + ## `field` must be an ident. + doAssert field.kind == gpuIdent, "Field is not an ident: " & $field + doAssert t.kind in [gtObject, gtGenericInst] + let flds = if t.kind == gtObject: t.oFields + else: t.gFields + result = GpuType(kind: gtInvalid) + for f in flds: + if f.name == field.ident(): + return f.typ + +proc getType(ctx: var GpuContext, arg: GpuAst, typeOfIndex = true): GpuType = + ## Tries to determine the underlying type of the AST. + ## + ## If `typeOfIndex` is `true`, in case of a `gpuIndex` node, we return the type of the + ## element behind the index access `[]`. Otherwise we return the type of the array / pointer. + ## + ## Let `foo` be an array `let foo = [1'f32, 2, 3]`. + ## Let `n` be a `GpuAst` node of kind `gpuIndex` corresponding to `foo[1]`. + ## Then `ctx.getType(n, typeOfIndex = true)` returns `float32` while + ## `ctx.getType(n, typeOfIndex = false)` returns `array[3, float32]` (as + ## a `GpuType`). + ## + ## NOTE: Do *not* rely on this for `mutable` or `implicit` fields of pointer types! + template dfl(): untyped = GpuType(kind: gtInvalid) + case arg.kind + of gpuIdent: arg.iTyp + of gpuAddr: GpuType(kind: gtPtr, to: ctx.getType(arg.aOf)) + of gpuDeref: + let argTyp = ctx.getType(arg.dOf) + doAssert argTyp.kind == gtPtr + argTyp.to + of gpuCall: dfl() + of gpuIndex: + let arrType = ctx.getType(arg.iArr) + if typeOfIndex: + case arrType.kind + of gtPtr: arrType.to + of gtUA: arrType.uaTo + of gtArray: arrType.aTyp + else: raiseAssert "`gpuIndex` cannot be of a non pointer / array type: " & $arrType + else: + arrType + of gpuDot: + let parentTyp = ctx.getType(arg.dParent) + parentTyp.getFieldType(arg.dField) + of gpuLit: arg.lType + of gpuBinOp: dfl() ## XXX: store resulting type of `gpuBinOp`! + #of gpuBlock: arg.statements[^1].getType() + of gpuPrefix: ctx.getType(arg.pVal) + of gpuConv: arg.convTo + of gpuCast: arg.cTo # ident of the thing we cast + else: + raiseAssert "Not implemented to determine type from node: " & $arg + +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst) = + ## Addresses other AST patterns that need to be rewritten on CUDA. Aspects + ## that are rewritten include: + ## + ## - `Index` of `Deref` of `Ident` needs to be rewritten to `Index` of `Ident` if the + ## ident is a pointer type, because `[]` is syntactic sugar for pointer arithmetic + ## (unless the argument is a pointer to a static array) + case n.kind + of gpuIndex: + if n.iArr.kind == gpuDeref: + # get type of deref'd node, but do not fold `gpuIndex` (i.e. get type of collection) + let typ = ctx.getType(n, typeOfIndex = false) + if typ.kind != gtArray: + n = GpuAst(kind: gpuIndex, iArr: n.iArr.dOf, iIndex: n.iIndex) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) + else: + for ch in mitems(n): + ctx.makeCodeValid(ch) proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = + + # 1. Add all data from `genericInsts` and `types` tables + # In CUDA the types have to be before any possible global variables using + # them! + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # And all the known types + for k, typ in pairs(ctx.types): + ctx.globalBlocks.add typ + + # 2. Fill table with all *global* functions or *only* the specific `kernel` + # if any given + var varBlock = GpuAst(kind: gpuBlock) + ctx.farmTopLevel(ast, kernel, varBlock) + ctx.globalBlocks.add varBlock + + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect + # the functions called and record them in `fnTab`. + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: # everything in `fnTab` at this point is a global function + # Get the original arguments (before lifting them) of this function. Needed in scan + # to check if `gpuCall` argument is a parameter. + let fnOrig = ctx.allFnTab[fnIdent] + ctx.scanFunctions(fn) + + # 4. Finalize the code by performing some required AST transformations to make the code valid. + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn) + proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = ## The actual CUDA code generator. let indentStr = " ".repeat(indent) @@ -137,27 +281,32 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) # extern "C" is needed to avoid name mangling - result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & - fnSig & "{\n" - - result &= ctx.genCuda(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result = indentStr & "extern \"C\" " & attrs.join(" ") & ' ' & + fnSig + if ast.forwardDeclare: + result.add ';' + else: + result.add "{\n" + result &= ctx.genCuda(ast.pBody, indent + 1) + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add ctx.genCuda(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: - result = indentStr & ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName.ident()) + let attrs = if ast.vAttributes.len > 0: ast.vAttributes.join(" ") & ' ' + else: "" + result = indentStr & attrs & gpuTypeToString(ast.vType, ast.vName.ident()) # If there is an initialization, the type might require a memcpy if ast.vInit.kind != gpuVoid and not ast.vRequiresMemcpy: result &= " = " & ctx.genCuda(ast.vInit) @@ -177,35 +326,35 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = # skip semicolon in the condition. Otherwise can lead to problematic code ctx.withoutSemicolon: # skip semicolon for if bodies result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n" - result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genCuda(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(int " & ast.fVar.ident() & " = " & ctx.genCuda(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genCuda(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genCuda(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.fBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n" - result &= ctx.genCuda(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genCuda(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField) + result = ctx.genCuda(ast.dParent) & '.' & ctx.genCuda(ast.dField) of gpuIndex: - result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]" + result = ctx.genCuda(ast.iArr) & '[' & ctx.genCuda(ast.iIndex) & ']' of gpuCall: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -224,25 +373,28 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genCuda(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genCuda(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genCuda(ast.bLeft) + let r = ctx.genCuda(ast.bRight) + result = indentStr & '(' & l & ' ' & + ctx.genCuda(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder else: result = ast.lValue of gpuArrayLit: result = "{" for i, el in ast.aValues: - result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el + result.add '(' & gpuTypeToString(ast.aLitType) & ')' & ctx.genCuda(el) if i < ast.aValues.high: result.add ", " - result.add "}" + result.add '}' of gpuReturn: result = indentStr & "return " & ctx.genCuda(ast.rValue) @@ -251,10 +403,10 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genCuda(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & "{\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" - result.add "}" + result.add '}' of gpuObjConstr: result = "{" @@ -262,7 +414,7 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result.add ctx.genCuda(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add "}" + result.add '}' of gpuInlineAsm: result = indentStr & "asm(" & ast.stmt.strip & ");" @@ -271,23 +423,45 @@ proc genCuda*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = "(" & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.convExpr) + result = '(' & gpuTypeToString(ast.convTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.convExpr) of gpuCast: - result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr) + result = '(' & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ')' & ctx.genCuda(ast.cExpr) of gpuAddr: - result = "(&" & ctx.genCuda(ast.aOf) & ")" + result = "(&" & ctx.genCuda(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genCuda(ast.dOf) & ")" + result = "(*" & ctx.genCuda(ast.dOf) & ')' of gpuConstexpr: + ## TODO: We need to change the code such that we emit `constexpr` inside of procs and + ## `__constant__` outside of procs. The point is we want to support mapping to `__constant__` + ## for `const foo = bar` Nim declarations to evaluate values at Nim's compile time. + ## Alternatively, make user write `const foo {.constant.} = bar` to produce a global + ## `__constant__` value. if ast.cType.kind == gtArray: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) else: - result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + result = indentStr & "constexpr " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & ' ' & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) else: echo "Unhandled node kind in genCuda: ", ast.kind raiseAssert "Unhandled node kind in genCuda: " & ast.repr result = "" + +proc codegen*(ctx: var GpuContext): string = + ## Generate the actual code for all pieces of the puzzle + # 1. generate code for the global blocks (types, global vars etc) + for blk in ctx.globalBlocks: + result.add ctx.genCuda(blk) & ";\n\n" + + # 2. generate all regular functions + let fns = toSeq(ctx.fnTab.pairs) + for (fnIdent, fn) in fns: + let fnC = fn.clone() + fnC.forwardDeclare = true + result.add ctx.genCuda(fnC) & '\n' + result.add "\n\n" + + for fnIdent, fn in ctx.fnTab: + result.add ctx.genCuda(fn) & "\n\n" diff --git a/constantine/math_compiler/experimental/backends/wgsl.nim b/constantine/math_compiler/experimental/backends/wgsl.nim index 0b6552bd..35a80f32 100644 --- a/constantine/math_compiler/experimental/backends/wgsl.nim +++ b/constantine/math_compiler/experimental/backends/wgsl.nim @@ -25,7 +25,7 @@ proc literalSuffix(t: GpuType): string = of gtUint32: "u" of gtInt32: "" # NOTE: We DON'T give as suffix to `i32` literals so that we can rely on more cases # where WebGPU allows literals to be converted automatically! - of gtFloat32: "f" + of gtFloat32: "" # NOTE: float suffixes _already_ come with an `f` suffix in Nim! else: "" proc toAddressSpace(symKind: GpuSymbolKind): AddressSpace = @@ -56,7 +56,7 @@ proc fromAddressSpace(addrSpace: AddressSpace): GpuSymbolKind = proc constructPtrSignature(addrSpace: AddressSpace, idTyp: GpuType, ptrStr, typStr: string): string = ## Constructs the `ptr` string, which only includes ## the RW string if the address space is `storage` - let rw = if not idTyp.isNil: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) + let rw = if idTyp.kind != gtVoid: idTyp.mutable else: false # symbol is a pointer -> mutable (can be implicit via `var T`) let rwStr = if rw: "read_write" else: "read" case addrSpace of asStorage: result = &"{ptrStr}<{addrSpace}, {typStr}, {rwStr}>" @@ -112,8 +112,18 @@ proc gpuTypeToString*(t: GpuType, id: GpuAst = newGpuIdent(), allowArrayToPtr = else: result = &"{identPrefix}array<{typ}, {t.aLen}>" skipIdent = true + of gtGenericInst: + # NOTE: WGSL does not support actual custom generic types. And as we only anyway deal with generic instantiations + # we simply turn e.g. `foo[float32, uint32]` into `foo_f32_u32`. + result = t.gName + if t.gArgs.len > 0: + result.add '_' + for i, g in t.gArgs: + result.add gpuTypeToString(g) + if i < t.gArgs.high: + result.add '_' of gtObject: result = t.name - of gtUA: result = gpuTypeToString(t.kind) & "<" & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & ">" + of gtUA: result = gpuTypeToString(t.kind) & '<' & gpuTypeToString(t.uaTo, allowEmptyIdent = allowEmptyIdent) & '>' else: result = gpuTypeToString(t.kind) if ident.len > 0 and not skipIdent: # still need to add ident @@ -139,34 +149,6 @@ proc genFunctionType*(typ: GpuType, fn: string, fnArgs: string): string = if typ.len > 0: result.add &" -> {typ}" -proc isGlobal(fn: GpuAst): bool = - doAssert fn.kind == gpuProc, "Not a function, but: " & $fn.kind - result = attGlobal in fn.pAttributes - -proc farmTopLevel(ctx: var GpuContext, ast: GpuAst, kernel: string, varBlock, typBlock: var GpuAst) = - ## Farms the top level of the code for functions, variable and type definition. - ## All functions are added to the `allFnTab`, while only global ones (or even only - ## `kernel` if any) is added to the `fnTab` as the starting point for the remaining - ## logic. - ## Variables and types are collected in `varBlock` and `typBlock`. - case ast.kind - of gpuProc: - ctx.allFnTab[ast.pName] = ast - if kernel.len > 0 and ast.pName.ident() == kernel and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - elif kernel.len == 0 and ast.isGlobal(): - ctx.fnTab[ast.pName] = ast.clone() # store global function extra - of gpuBlock: - # could be a type definition or global variable - for ch in ast: - ctx.farmTopLevel(ch, kernel, varBlock, typBlock) - of gpuVar, gpuConstexpr: - varBlock.statements.add ast - of gpuTypeDef: - typBlock.statements.add ast - else: - discard - proc patchType(t: GpuType): GpuType = ## Applies patches needed for WGSL support. E.g. `bool` cannot be a storage variable. result = t @@ -205,6 +187,7 @@ proc determineSymKind(arg: GpuAst): GpuSymbolKind = of gpuBlock: arg.statements[^1].determineSymKind() # look at last element of gpuPrefix: gsLocal # equivalent to constructing a local var of gpuConv: gsLocal # a converted value will be a local var + of gpuCast: arg.cExpr.determineSymKind() # symbol kind of the thing we cast else: raiseAssert "Not implemented to determine symbol kind from node: " & $arg @@ -225,6 +208,7 @@ proc determineMutability(arg: GpuAst): bool = of gpuBlock: arg.statements[^1].determineMutability() # look at last element of gpuPrefix: false # equivalent to constructing a local var of gpuConv: false # a converted value will be immutable + of gpuCast: arg.cExpr.determineMutability() # mutability of the thing we cast else: raiseAssert "Not implemented to determine mutability from node: " & $arg @@ -250,8 +234,9 @@ proc determineIdent(arg: GpuAst): GpuAst = of gpuBlock: arg.statements[^1].determineIdent() of gpuPrefix: dfl() of gpuConv: dfl() + of gpuCast: arg.cExpr.determineIdent() # ident of the thing we cast else: - raiseAssert "Not implemented to determine mutability from node: " & $arg + raiseAssert "Not implemented to determine ident from node: " & $arg proc getGenericArguments(args: seq[GpuAst], params: seq[GpuParam], callerParams: Table[string, GpuParam]): seq[GenericArg] = ## If an argument is not a ptr argument in the original function (`params`) then @@ -310,7 +295,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string ## the information we precisely want to extract (different symbol kind etc) would make ## it so that we cannot look up elements. doAssert n.kind == gpuCall, "Not a call, but: " & $n.kind - result = n.cName.ident() & "_" + result = n.cName.ident() & '_' for i, arg in n.cArgs: let p = params[i] var s: string @@ -327,7 +312,7 @@ proc genGenericName(n: GpuAst, params: seq[GpuParam], callerParams: Table[string s = shortAddrSpace(addrSpace) & m result.add s if i < n.cArgs.high: - result.add "_" + result.add '_' proc makeFnGeneric(fn: GpuAst, gi: GenericInst): GpuAst = ## Returns a (shallow) copy of the input function (which is a clone of the @@ -385,6 +370,11 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp ## of that function (hence the name `scanGenerics`). The generic instance will be added ## to `fnTab` instead. The name of the generic will be derived based on the types ## of arguments with respect to mutability and address space. + ## + ## In addition this function records any time a `struct` is constructed in a `gpuObjConstr` + ## node and a pointer field assigned to it. As pointer fields are not valid in WGSL, we + ## record them here to replace them by their arguments passed to the constructor later. + ## The pointers _must_ be pointers passed into a global kernel (i.e. `storage` address space). case n.kind of gpuCall: let fn = n.cName @@ -435,6 +425,18 @@ proc scanGenerics(ctx: var GpuContext, n: GpuAst, callerParams: Table[string, Gp # Harvest generics from arguments to this call! for arg in n.cArgs: ctx.scanGenerics(arg, callerParams) + of gpuObjConstr: + # If pointer argument of `storage`, strip out, if pointer type field + # otherwise, raise CT error + for f in n.ocFields: + if f.typ.kind == gtPtr: + doAssert f.value.kind in [gpuAddr, gpuIdent], "Constructing a pointer field " & + "from a more complex expression than an ident or an address-of operation " & + "is currently not supported." + let id = f.value.determineIdent() + doAssert id.symbolKind == gsGlobalKernelParam, "Assigning a pointer to a non storage address space " & + "variable (i.e. an argument to a global kernel) is not supported: " & $f + ctx.structsWithPtrs[(n.ocType, f.name)] = id else: for ch in n: ctx.scanGenerics(ch, callerParams) @@ -495,7 +497,234 @@ proc injectAddressOf(ctx: var GpuContext, n: var GpuAst) = for ch in mitems(n): ctx.injectAddressOf(ch) -proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = +proc rewriteCompoundAssignment(n: GpuAst): GpuAst = + doAssert n.kind == gpuBinOp + if n.bOp.ident() in ["<=", "==", ">=", "!="]: return n + + template genAssign(left, rnode, op: typed): untyped = + let right = GpuAst(kind: gpuBinOp, bOp: op, bLeft: left, bRight: rnode) + GpuAst(kind: gpuAssign, aLeft: left, aRight: right, aRequiresMemcpy: false) + + let op = n.bOp.ident() + if op.len >= 2 and op[^1] == '=': + var opAst = GpuAst(kind: gpuIdent, iName: op[0 .. ^2]) + opAst.iSym = opAst.iName + result = genAssign(n.bLeft, n.bRight, opAst) # all but last + else: + # leave untouched + result = n + +proc getStructType(n: GpuAst): GpuType = + ## Given an identifier `gpuIdent` (or `Deref` of one), return the struct type + ## the ident is of or a GpuType of `void` if it is not (pointing to) a struct. + doAssert n.kind in [gpuIdent, gpuDeref], "Dot expression of anything not an address currently not supported: " & + $n.kind & " for node: " & $n + var p = n + if p.kind == gpuDeref: + p = n.dOf + result = if p.iTyp.kind == gtPtr and p.iTyp.to.kind == gtObject: + p.iTyp.to + elif p.iTyp.kind == gtObject: + p.iTyp + else: GpuType(kind: gtVoid) + +proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string +proc makeCodeValid(ctx: var GpuContext, n: var GpuAst, inGlobal: bool) = + ## Addresses other AST patterns that need to be rewritten on WGSL. Aspects + ## that are rewritten include: + ## + ## - (`gpuBinOp`) rewriting compound assignment operators as regular assignments, `x += y` ↦ `x = x + y` + ## + ## - (`gpuDot`) replace field access of struct pointer fields by the pointers passed into the object + ## constructor (ref `scanGenerics`). `inGlobal` is used to decide what exactly we replace + ## it by. Inside of a global function the variables won't be pointers, hence we insert `&foo`. + ## In device functions, the globals will have been passed into the function as a parameter, + ## `ptr`. Thus, we replace by `foo`. + ## NOTE: We could consider to move this into `scanGenerics`, but for the moment I prefer to + ## do code transformations here and `scanGenerics` being only about data collection. + ## + ## - (`gpuAssign`) compile time errors, if a user tries to assign a pointer to a struct pointer field + ## outside the constructor. + ## + ## - (`gpuCall`) potentially update signatures of our custom generic functions. In `scanGenerics` if we + ## have a call like `foo(bar.ptrField)` we will determine the signature of `foo` to have + ## a `function` pointer, because `bar` will be a local struct instance. However, due to + ## our replacement rules and fact that *only* storage pointers may be assigned to constructors + ## the correct signature would be `storage` for the first argument after replacing `bar.ptrField` + ## by its value in the constructor. + ## + ## - (`gpuObjConstr`) delete arguments to object constructors, which assign pointer fields. + ## + ## - (`gpuVar`) update types of new variables based on the RHS. May have changed since Nim -> GpuAst, + ## due to `gpuDot` replacement further up. + ## + ## NOTE: A few cases already raise compile time errors _here_ and not in `checkCodeValid`, + ## as some transformations otherwise break the detection. + case n.kind + of gpuBinOp: + n = rewriteCompoundAssignment(n) + for ch in mitems(n): # now go over children + ctx.makeCodeValid(ch, inGlobal) + of gpuObjConstr: # strip out arguments that are pointer types + let t = n.ocType + var i = 0 + while i < n.ocFields.len: + let f = n.ocFields[i] + if (t, f.name) in ctx.structsWithPtrs: + if f.typ.kind == gtPtr: + n.ocFields.delete(i) + else: + inc i + else: + inc i + of gpuDot: # replace `foo.bar` by storage pointer recorded in `scanGenerics`, i.e. `foo.bar` -> `&res` + var p = n.dParent + let id = getStructType(p) + doAssert n.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $n.dField.kind + let field = n.dField.ident() + if id.kind != gtVoid and (id, field) in ctx.structsWithPtrs: # this is in the struct with pointer + let v = ctx.structsWithPtrs[(id, field)] + ## XXX: only need `addr` if we are in a global function, not otherwise, because in device functions, + ## we will have passed the parameter + if inGlobal: + n = GpuAst(kind: gpuAddr, aOf: v) # overwrite with the address of value passed in to the object constructor + else: + n = v + of gpuAssign: # checks we don't have `foo.x = res` for `x` a pointer field + if n.aLeft.kind == gpuDot and n.aLeft.dParent.kind in [gpuIdent, gpuDeref]: + let dot = n.aLeft + let id = getStructType(dot.dParent) + if id.kind != gtVoid: + doAssert dot.dField.kind == gpuIdent, "Dot expression must contain an ident as field: " & $dot.dField.kind + let field = dot.dField.ident() + if (id, field) in ctx.structsWithPtrs: + raiseAssert "Assignment of a struct (`" & pretty(id) & "`) field of a pointer type is not supported. " & + "Assign pointer fields in the constructor only. In code: " & $ctx.genWebGpu(n) + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + of gpuCall: + # we might need to update the type of generics, if we did the replacement in `gpuDot`, because + # a struct ptr field will have had the wrong storage type + for ch in mitems(n): # first process children + ctx.makeCodeValid(ch, inGlobal) + # now check if any argument's type mismatches against the generic we recorded + let fnName = n.cName + if fnName in ctx.fnTab: # otherwise will not be generated by us, so irrelevan + # NOTE: theoretically, if we had struct pointer field replacements with symbols that had + # *different* address spaces, we'd need to split one generic into multiple again here. + # But that shouldn't be possible, because our entire replacement is currently only + # sane if we store a *storage pointer* in a struct. We would have raised in `scanGenerics` + # because of invalid pointer assignment in an object constructor. + let fn = ctx.fnTab[fnName] + let params = fn.pParams + for i, arg in n: # walk the parameters again and compare + let argId = arg.determineIdent() + if argId.kind != gpuVoid and argId.ident().len > 0: + var p = params[i] + ## XXX: update anything else? We mostly care about the address space here, because + ## the rest _should_ be the same anyway. + if p.addressSpace != argId.symbolKind.toAddressSpace(): + p.addressSpace = argId.symbolKind.toAddressSpace() + p.ident.symbolKind = argId.symbolKind + fn.pParams[i] = p # write back, not a ref type! + of gpuVar: + # first recurse on the `gpuVar` to get possible replacements + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + # update LHS with info from RHS by copying over its symbol kind. Different types are + # possible after replacements of `gpuDot` nodes above. + if n.vType.kind == gtPtr: + let rightId = n.vInit.determineIdent() + n.vName.symbolKind = rightId.symbolKind + n.vType.mutable = rightId.iTyp.mutable + n.vName.iTyp.mutable = rightId.iTyp.mutable + else: + for ch in mitems(n): + ctx.makeCodeValid(ch, inGlobal) + +proc updateSymsInGlobals(ctx: var GpuContext, n: GpuAst) = + ## Update symbols in global functions to have same mutability and symbolkind as + ## parameters + case n.kind + of gpuIdent: + if n.iSym in ctx.globals: + n.symbolKind = gsGlobalKernelParam + if n.iTyp.kind == gtPtr: + let g = ctx.globals[n.iSym] + n.iTyp.mutable = g.typ.kind == gtPtr # arguments as pointers == mutable + else: + for ch in n: + ctx.updateSymsInGlobals(ch) + +proc checkCodeValid(ctx: var GpuContext, n: GpuAst) = + ## Checks if the code is valid according to WGSL spec. + ## So far handles: + ## - variables (`var`) to pointer types are not allowed + ## + ## Some code is already rejected in earlier passes, if a compiler pass would transform + ## the code in such a way as making a detection of illegal code invalid. + case n.kind + of gpuVar: + if n.vType.kind == gtPtr and n.vMutable: # `vMutable == var` -> not allowed to store pointers + let code = ctx.genWebGpu(n) + raiseAssert "The node: `" & $code & "` constructs a variable (`var`) to a pointer type. This " & + "is invalid in WGSL. Use `let`." + else: + for ch in n: + ctx.checkCodeValid(ch) + +proc pullConstantPragmaVars(ctx: var GpuContext, blk: var GpuAst) = + ## Filters out all `var foo {.constant.}: dtype` from the `globalBlocks` and adds them to + ## the `globals` of the context. Such variables are *not* regular global constants, but rather + ## `storage` buffers, which are filled before the kernel is executed. + ## + ## XXX: Document current not ideal behavior that one needs to be careful to pass data into + ## `wgsl.fakeExecute` precisely in the order in which the `var foo {.constant.}` are defined + ## *AND* after all kernel parameters! + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + var i = 0 + while i < blk.len: + doAssert blk.kind == gpuBlock + let g = blk.statements[i] + if g.kind == gpuVar and atvConstant in g.vAttributes: + # remove this from `globalBlocks` and add to `globals` + doAssert g.vInit.kind == gpuVoid, "A variable annotated with `{.constant.}` must not have an initialization!" + # we construct a fake parameter from it + ## XXX: `storage` address space is probably what we want, but think more about it + let param = GpuParam(ident: g.vName, typ: g.vType, addressSpace: asStorage) + ctx.globals[param.ident.iSym] = param + blk.statements.delete(i) + # no need to increase `i` + else: + inc i + +proc removeStructPointerFields(blk: var GpuAst) = + ## Filters out `ptr` fields from all structs. + ## + ## If a type is used with `storage` pointer arguments, we will later perform replacement of field + ## access to the pointer field by the value we assign. + ## + ## If the user assigns a local (`function`) pointer, we raise a CT error. We _could_ in theory support + ## replacement for local pointer types too, but it requires a more careful analysis of which + ## local to replace by and the name in other scopes. I.e. passing a local pointer to a constructor + ## which has a different name than in the calling scope would require us to traverse the AST up to + ## the calling scope. + ## + ## Given the extreme limitations on `let` variables with pointers anyway, I don't think there is muc + ## purpose on supporting such features. + doAssert blk.kind == gpuBlock, "Argument must be a block, but is: " & $blk.kind + for typ in mitems(blk): + if typ.kind == gpuAlias: continue # don't need to mutate aliases! + doAssert typ.kind == gpuTypeDef + var i = 0 + while i < typ.tFields.len: + let f = typ.tFields[i] + if f.typ.kind == gtPtr: # delete + typ.tFields.delete(i) + else: + inc i + +proc preprocess*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = ## If `kernel` is a global function, we *only* generate code for that kernel. ## This is useful if your GPU code contains multiple kernels with differing ## parameters to avoid having to fill dummy buffers for all the unused parameters @@ -503,9 +732,18 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = # 1. Fill table with all *global* functions or *only* the specific `kernel` # if any given var varBlock = GpuAst(kind: gpuBlock) - var typBlock = GpuAst(kind: gpuBlock) - ctx.farmTopLevel(ast, kernel, varBlock, typBlock) + ctx.farmTopLevel(ast, kernel, varBlock) ctx.globalBlocks.add varBlock + ## XXX: `typBlock` should now always be empty, as we pass all + ## found types into `ctx.types` + + # Now add the generics to the `allFnTab` + for k, v in pairs(ctx.genericInsts): + ctx.allFnTab[k] = v + # And all the known types + var typBlock = GpuAst(kind: gpuBlock) + for k, typ in pairs(ctx.types): + typBlock.statements.add typ ctx.globalBlocks.add typBlock # 2. Remove all arguments from global functions, as none are allowed in WGSL @@ -515,9 +753,21 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = for p in fn.pParams: ctx.globals[p.ident.iSym] = p # copy all parameters over to globals fn.pParams.setLen(0) # delete function's parameters + # now update all appearances of the parameters, now globals, such that they reflect + # the correct symbol kind and mutability + ctx.updateSymsInGlobals(fn) else: discard + # 2.b filter out all `var foo {.constant.}: dtype` from the `globalBlocks` and add them to + # the `globals` + # `globalBlocks` has two entries: + # 0: variables + # 1: types + ctx.pullConstantPragmaVars(ctx.globalBlocks[0]) + # 2.c remove all fields of structs, which have pointer type + removeStructPointerFields(ctx.globalBlocks[1]) + # 3. Using all global functions, we traverse their AST for any `gpuCall` node. We inspect # the functions called and record them in `fnTab`. If they have pointer arguments we # generate a generic instantiation for the exact pointer types used. @@ -538,8 +788,15 @@ proc storagePass*(ctx: var GpuContext, ast: GpuAst, kernel: string = "") = if fn.isGlobal(): # non global functions don't need to be mutated ctx.injectAddressOf(fn) + # 5. (Actually finally) patch all additional things invalid in WGSL, e.g. `x += 5` -> `x = x + 5` + for (fnIdent, fn) in mpairs(ctx.fnTab): + ctx.makeCodeValid(fn, inGlobal = fn.isGlobal()) + + # 6. finally raise error if we find anything that is not allowed in WGSL after our transformations + for (fnIdent, fn) in pairs(ctx.fnTab): + ctx.checkCodeValid(fn) + -proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genWebGpu(a)) proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genWebGpu(a)) @@ -549,46 +806,37 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = case ast.kind of gpuVoid: return # nothing to emit of gpuProc: - - ## XXX: if a {.global.} / attGlobal proc, lift arguments - ## Store all arguments in the `GpuContext` - ## *AFTER* processing all of the code, generate header and place at beginning - ## Most difficult: - ## - track identifiers from {.global.} functions into arbitrary layers and remove - ## BUT, we can also have a full preprocessing pass. - let attrs = collect: for att in ast.pAttributes: $att - # Parameters var params: seq[string] for p in ast.pParams: params.add gpuTypeToString(p.typ, p.ident, allowEmptyIdent = false) var fnArgs = params.join(", ") if $attGlobal in attrs: doAssert fnArgs.len == 0, "Global function `" & $ast.pName.ident() & "` still has arguments!" - ## XXX: clean this up. Add the global id builtin - fnArgs = "@builtin(global_invocation_id) global_id: vec3" + ## XXX: make this more flexible. In theory can be any name + fnArgs = "@builtin(global_invocation_id) global_id: vec3, @builtin(num_workgroups) num_workgroups: vec3" let fnSig = genFunctionType(ast.pRetType, ast.pName.ident(), fnArgs) result = indentStr & "fn " & fnSig & " {\n" result &= ctx.genWebGpu(ast.pBody, indent + 1) - result &= "\n" & indentStr & "}" + result &= '\n' & indentStr & '}' of gpuBlock: result = "" if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "{ // " & ast.blockLabel & '\n' for i, el in ast.statements: result.add ctx.genWebGpu(el, indent) if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added - result.add ";" + result.add ';' if i < ast.statements.high: - result.add "\n" + result.add '\n' if ast.blockLabel.len > 0: - result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + result.add '\n' & indentStr & "} // " & ast.blockLabel & '\n' of gpuVar: let letOrVar = if ast.vMutable: "var" else: "let" @@ -619,36 +867,12 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight), ctx.size(ast.aLeft)) else: - proc determineIdent(arg: GpuAst): GpuAst = - ## Tries to determine the underlying ident that is contained in this node. - ## The issue is the argument to a `gpuCall` can be a complicated expression. - ## Depending on the node it may be possible to extract a simple identifier, - ## e.g. for `addr(foo)` (`gpuAddr` of `gpuIdent` node) we can get the ident. - ## If this fails, we return a `gpuVoid` node. - ## - ## TODO: Think about if it ever makes sense to extract the ident underlying - ## e.g. `deref` and use _that_ to determine mutability & address space. - template dfl(): untyped = GpuAst(kind: gpuVoid) - case arg.kind - of gpuIdent: arg - of gpuAddr: arg.aOf.determineIdent() - of gpuDeref: arg.dOf.determineIdent() - of gpuCall: dfl() - of gpuIndex: arg.iArr.determineIdent() - of gpuDot: arg.dParent.determineIdent() - of gpuLit: dfl() - of gpuBinOp: dfl() - of gpuBlock: arg.statements[^1].determineIdent() - of gpuPrefix: dfl() - of gpuConv: dfl() - else: - raiseAssert "Not implemented to determine mutability from node: " & $arg - let leftTyp = ast.aLeft.determineIdent().iTyp - if leftTyp.kind == gtPtr and leftTyp.to.kind == gtInt32: + let leftId = ast.aLeft.determineIdent() + if leftId.kind != gpuVoid and leftId.iTyp.kind == gtPtr and leftId.iTyp.to.kind == gtInt32: # If the LHS is `i32` then a conversion to `i32` is either a no-op, if the left always was # `i32` (and the Nim compiler type checked it for us) *OR* the RHS is a boolean expression and # we patched the `bool -> i32` and thus need to convert it. - result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ")" + result = indentStr & ctx.genWebGpu(ast.aLeft) & " = i32(" & ctx.genWebGpu(ast.aRight) & ')' else: result = indentStr & ctx.genWebGpu(ast.aLeft) & " = " & ctx.genWebGpu(ast.aRight) @@ -662,36 +886,36 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "if (false) {\n" else: result = indentStr & "if (" & ctx.genWebGpu(ast.ifCond) & ") {\n" - result &= ctx.genWebGpu(ast.ifThen, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifThen, indent + 1) & '\n' + result &= indentStr & '}' if ast.ifElse.kind != gpuVoid: result &= " else {\n" - result &= ctx.genWebGpu(ast.ifElse, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.ifElse, indent + 1) & '\n' + result &= indentStr & '}' of gpuFor: result = indentStr & "for(var " & ast.fVar.ident() & ": i32 = " & ctx.genWebGpu(ast.fStart) & "; " & ast.fVar.ident() & " < " & ctx.genWebGpu(ast.fEnd) & "; " & ast.fVar.ident() & "++) {\n" - result &= ctx.genWebGpu(ast.fBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.fBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuWhile: ctx.withoutSemicolon: result = indentStr & "while (" & ctx.genWebGpu(ast.wCond) & "){\n" - result &= ctx.genWebGpu(ast.wBody, indent + 1) & "\n" - result &= indentStr & "}" + result &= ctx.genWebGpu(ast.wBody, indent + 1) & '\n' + result &= indentStr & '}' of gpuDot: - result = ctx.genWebGpu(ast.dParent) & "." & ctx.genWebGpu(ast.dField) + result = ctx.genWebGpu(ast.dParent) & '.' & ctx.genWebGpu(ast.dField) of gpuIndex: - result = ctx.genWebGpu(ast.iArr) & "[" & ctx.genWebGpu(ast.iIndex) & "]" + result = ctx.genWebGpu(ast.iArr) & '[' & ctx.genWebGpu(ast.iIndex) & ']' of gpuCall: ctx.withoutSemicolon: - result = indentStr & ast.cName.ident() & "(" & - ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ")" + result = indentStr & ast.cName.ident() & '(' & + ast.cArgs.mapIt(ctx.genWebGpu(it)).join(", ") & ')' of gpuTemplateCall: when nimvm: @@ -711,15 +935,18 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ctx.genWebGpu(expandedBody, indent) of gpuBinOp: - result = indentStr & "(" & ctx.genWebGpu(ast.bLeft) & " " & - ast.bOp & " " & - ctx.genWebGpu(ast.bRight) & ")" + ctx.withoutSemicolon: + let l = ctx.genWebGpu(ast.bLeft) + let r = ctx.genWebGpu(ast.bRight) + result = indentStr & '(' & l & ' ' & + ctx.genWebGpu(ast.bOp) & ' ' & + r & ')' of gpuIdent: result = ast.ident() of gpuLit: - if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + if ast.lType.kind == gtString: result = '"' & ast.lValue & '"' elif ast.lValue == "DEFAULT": ## TODO: We could "manually" construct a zero version! ## NOTE: There *are* default initializations to zero. Just not for fields that @@ -732,10 +959,10 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = of gpuArrayLit: result = "array(" for i, el in ast.aValues: - result.add gpuTypeToString(ast.aLitType) & "(" & el & ")" + result.add gpuTypeToString(ast.aLitType) & '(' & ctx.genWebGpu(el) & ')' if i < ast.aValues.high: result.add ", " - result.add ")" + result.add ')' of gpuReturn: result = indentStr & "return " & ctx.genWebGpu(ast.rValue) @@ -744,18 +971,29 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = ast.pOp & ctx.genWebGpu(ast.pVal) of gpuTypeDef: - result = "struct " & ast.tName & "{\n" + result = "struct " & gpuTypeToString(ast.tTyp) & " {\n" for el in ast.tFields: result.add " " & gpuTypeToString(el.typ, newGpuIdent(el.name)) & ",\n" - result.add "}" + result.add '}' + + of gpuAlias: + # Aliases come from `ctx.types` and due to implementation details currently are _not_ wrapped + # in a `block` (as they are handled like regular `structs`). However, WebGPU requires semicolons + # after alias definitions, but not after `struct`. Hence we add `;` manually here + result = "alias " & gpuTypeToString(ast.aTyp) & " = " & ctx.genWebGpu(ast.aTo) & ';' of gpuObjConstr: - result = ast.ocName & "(" + result = gpuTypeToString(ast.ocType) & '(' for i, el in ast.ocFields: - result.add ctx.genWebGpu(el.value) + if el.value.kind == gpuLit and el.value.lValue == "DEFAULT": + # use type to construct a default value + let typStr = gpuTypeToString(el.typ, allowEmptyIdent = true) + result.add typStr & "()" + else: + result.add ctx.genWebGpu(el.value) if i < ast.ocFields.len - 1: result.add ", " - result.add ")" + result.add ')' of gpuInlineAsm: raiseAssert "Inline assembly not supported on the WebGPU target." @@ -764,16 +1002,16 @@ proc genWebGpu*(ctx: var GpuContext, ast: GpuAst, indent = 0): string = result = indentStr & "/* " & ast.comment & " */" of gpuConv: - result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & "(" & ctx.genWebGpu(ast.convExpr) & ")" + result = gpuTypeToString(ast.convTo, allowEmptyIdent = true) & '(' & ctx.genWebGpu(ast.convExpr) & ')' of gpuCast: - result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ")" + result = "bitcast<" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ">(" & ctx.genWebGpu(ast.cExpr) & ')' of gpuAddr: - result = "(&" & ctx.genWebGpu(ast.aOf) & ")" + result = "(&" & ctx.genWebGpu(ast.aOf) & ')' of gpuDeref: - result = "(*" & ctx.genWebGpu(ast.dOf) & ")" + result = "(*" & ctx.genWebGpu(ast.dOf) & ')' of gpuConstexpr: result = indentStr & "const " & ctx.genWebGpu(ast.cIdent) & ": " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " = " & ctx.genWebGpu(ast.cValue) @@ -812,7 +1050,7 @@ proc codegen*(ctx: var GpuContext): string = # 1. Generate the header for all global variables for id, g in ctx.globals: result.add genGlobal(g) - result.add "\n" + result.add '\n' # 2. generate code for the global blocks (types, global vars etc) for blk in ctx.globalBlocks: @@ -822,5 +1060,5 @@ proc codegen*(ctx: var GpuContext): string = for fnIdent, fn in ctx.fnTab: if fn.isGlobal(): ## XXX: make adjustable! - result.add "@compute @workgroup_size(NUM_WORKGROUPS)\n" + result.add "@compute @workgroup_size(WORKGROUP_SIZE)\n" result.add ctx.genWebGpu(fn) & "\n\n" diff --git a/constantine/math_compiler/experimental/builtins/builtins.nim b/constantine/math_compiler/experimental/builtins/builtins.nim new file mode 100644 index 00000000..da079e83 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/builtins.nim @@ -0,0 +1,19 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# NOTE: For the moment we import and export builtins here for all backends. +# Once we change the code to make single backends importable on their own, +# this will be changed and these builtins will be imported/exported in the +# corresponding CUDA/WGSL etc module the user needs to import. +import ./common_builtins +import ./cuda_builtins +import ./wgsl_builtins + +export common_builtins +export cuda_builtins +export wgsl_builtins diff --git a/constantine/math_compiler/experimental/builtins/common_builtins.nim b/constantine/math_compiler/experimental/builtins/common_builtins.nim new file mode 100644 index 00000000..b9a144a5 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/common_builtins.nim @@ -0,0 +1,46 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +#import std / [macros, strutils, sequtils, options, sugar, tables, strformat, hashes, sets] +# +#import ./gpu_types +#import ./backends/backends +#import ./nim_to_gpu +# +#export gpu_types + +template nimonly*(): untyped {.pragma.} +template cudaName*(s: string): untyped {.pragma.} + +## Dummy data for the typed nature of the `cuda` macro. These define commonly used +## CUDA specific names so that they produce valid Nim code in the context of a typed macro. +template global*() {.pragma.} +template device*() {.pragma.} +template forceinline*() {.pragma.} + +## If attached to a function, type or variable it will refer to a built in +## in the target backend. This is used for all the functions, types and variables +## defined below to indicate that we do not intend to generate code for them. +template builtin*() {.pragma.} +# If attached to a `var` it will be treated as a +# `__constant__`! Only useful if you want to define a +# constant without initializing it (and then use +# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it +# before executing the kernel) +template constant*() {.pragma.} + +## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its +## own `extern` pragma (due to requiring an argument it cannot be reused): +## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma +template cuExtern*(): untyped {.pragma.} +template shared*(): untyped {.pragma.} +template private*(): untyped {.pragma.} +## You would typically use `cuExtern` and `shared` together: +## `var x {.cuExtern, shared.}: array[N, Foo]` +## for example to declare a constant array that is filled by the +## host before kernel execution. diff --git a/constantine/math_compiler/experimental/builtins/cuda_builtins.nim b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim new file mode 100644 index 00000000..5d002f93 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/cuda_builtins.nim @@ -0,0 +1,48 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ./common_builtins + +type + Dim* = cint ## dummy to have access to math + NvBlockIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvBlockDim = object + x*: Dim + y*: Dim + z*: Dim + NvThreadIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvGridDim = object + x*: Dim + y*: Dim + z*: Dim + + +## These are dummy elements to make CUDA block / thread index / dim +## access possible in the *typed* `cuda` macro. It cannot be `const`, +## because then the typed code would evaluate the values before we +## can work with it from the typed macro. +let blockIdx* {.builtin.} = NvBlockIdx() +let blockDim* {.builtin.} = NvBlockDim() +let gridDim* {.builtin.} = NvGridDim() +let threadIdx* {.builtin.} = NvThreadIdx() + +## Similar for procs. They don't need any implementation, as they won't ever be actually called. +proc printf*(fmt: string) {.varargs, builtin.} = discard +proc memcpy*(dst, src: pointer, size: int) {.builtin.} = discard + +## While you can use `malloc` on device with small sizes, it is usually not +## recommended to do so. +proc malloc*(size: csize_t): pointer {.builtin.} = discard +proc free*(p: pointer) {.builtin.} = discard +proc syncthreads*() {.cudaName: "__syncthreads", builtin.} = discard diff --git a/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim new file mode 100644 index 00000000..fe7e11f0 --- /dev/null +++ b/constantine/math_compiler/experimental/builtins/wgsl_builtins.nim @@ -0,0 +1,26 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ./common_builtins + +type + DimWgsl = uint32 + WgslGridDim = object + x*: DimWgsl + y*: DimWgsl + z*: DimWgsl + +## WebGPU specific +let global_id* {.builtin.} = WgslGridDim() +let num_workgroups* {.builtin.} = WgslGridDim() + +## WebGPU select +proc select*[T](f, t: T, cond: bool): T {.builtin.} = + # Implementation to run WebGPU code on CPU + if cond: t + else: f diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim index 8f1aa879..ae5732ad 100644 --- a/constantine/math_compiler/experimental/cuda_execute_dsl.nim +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -41,13 +41,15 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = case n.typeKind of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats result = false - of ntyObject, ntyArray: + of ntyObject: if passStructByPointer: result = false # regular objects can just be copied! else: result = true # struct passing by pointer forbidden ## NOTE: strictly speaking this is not the case of course! If the object ## contains refs, it won't hold! + of ntyArray: # statically sized arrays are passed by pointer in CUDA / C++ / C! + result = true of ntyGenericInst: if passStructByPointer: let impl = n.getTypeImpl() @@ -60,6 +62,12 @@ proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = result = false else: result = true + of ntyAlias: + let impl = n.getTypeInst() + if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr": + result = false + else: + result = true else: result = true diff --git a/constantine/math_compiler/experimental/gpu_compiler.nim b/constantine/math_compiler/experimental/gpu_compiler.nim index 0c68e88e..0a8b7362 100644 --- a/constantine/math_compiler/experimental/gpu_compiler.nim +++ b/constantine/math_compiler/experimental/gpu_compiler.nim @@ -14,103 +14,25 @@ import ./nim_to_gpu export gpu_types -template nimonly*(): untyped {.pragma.} -template cudaName*(s: string): untyped {.pragma.} +import builtins/builtins # all the builtins for the backend to make the Nim compiler happy +export builtins -## Dummy data for the typed nature of the `cuda` macro. These define commonly used -## CUDA specific names so that they produce valid Nim code in the context of a typed macro. -template global*() {.pragma.} -template device*() {.pragma.} -template forceinline*() {.pragma.} - -# If attached to a `var` it will be treated as a -# `__constant__`! Only useful if you want to define a -# constant without initializing it (and then use -# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it -# before executing the kernel) -template constant*() {.pragma.} -type - Dim* = cint ## dummy to have access to math - NvBlockIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvBlockDim = object - x*: Dim - y*: Dim - z*: Dim - NvThreadIdx* = object - x*: Dim - y*: Dim - z*: Dim - NvGridDim = object - x*: Dim - y*: Dim - z*: Dim - - DimWgsl = uint32 - WgslGridDim = object - x*: DimWgsl - y*: DimWgsl - z*: DimWgsl - - -## These are dummy elements to make CUDA block / thread index / dim -## access possible in the *typed* `cuda` macro. It cannot be `const`, -## because then the typed code would evaluate the values before we -## can work with it from the typed macro. -let blockIdx* = NvBlockIdx() -let blockDim* = NvBlockDim() -let gridDim* = NvGridDim() -let threadIdx* = NvThreadIdx() - -## WebGPU specific -let global_id* = WgslGridDim() - -## Similar for procs. They don't need any implementation, as they won't ever be actually called. -proc printf*(fmt: string) {.varargs.} = discard -proc memcpy*(dst, src: pointer, size: int) = discard - -## WebGPU select -proc select*[T](f, t: T, cond: bool): T = - # Implementation to run WebGPU code on CPU - if cond: t - else: f - -## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its -## own `extern` pragma (due to requiring an argument it cannot be reused): -## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma -template cuExtern*(): untyped {.pragma.} -template shared*(): untyped {.pragma.} -template private*(): untyped {.pragma.} -## You would typically use `cuExtern` and `shared` together: -## `var x {.cuExtern, shared.}: array[N, Foo]` -## for example to declare a constant array that is filled by the -## host before kernel execution. - -## While you can use `malloc` on device with small sizes, it is usually not -## recommended to do so. -proc malloc*(size: csize_t): pointer = discard -proc free*(p: pointer) = discard -proc syncthreads*() {.cudaName: "__syncthreads".} = discard - -macro toGpuAst*(body: typed): GpuAst = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) - echo body.treerepr - echo body.repr +macro toGpuAst*(body: typed): (GpuGenericsInfo, GpuAst) = + ## Converts the body of this macro into a `GpuAst` from where it can be converted + ## into CUDA or WGSL code at runtime. var ctx = GpuContext() - newLit(ctx.toGpuAst(body)) + let ast = ctx.toGpuAst(body) + let genProcs = toSeq(ctx.genericInsts.values) + let genTypes = toSeq(ctx.types.values) + let g = GpuGenericsInfo(procs: genProcs, types: genTypes) + newLit((g, ast)) macro cuda*(body: typed): string = - ## WARNING: The following are *not* supported: - ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between - ## what is a field access and a function call. Hence we assume any `nnkDotExpr` - ## is actually a field access! - ## - most regular Nim features :) + ## Converts the body of this macro into a `GpuAst` and from there into a string of + ## CUDA or WGSL code. + ## + ## TODO: make `cuda` choose CUDA backend, `wgsl` WGSL etc. Need to change code + ## that chooses backend etc. #echo body.treerepr var ctx = GpuContext() let gpuAst = ctx.toGpuAst(body) @@ -119,13 +41,22 @@ macro cuda*(body: typed): string = let body = ctx.codegen(gpuAst) result = newLit(body) -proc codegen*(ast: GpuAst, kernel: string = ""): string = +proc codegen*(gen: GpuGenericsInfo, ast: GpuAst, kernel: string = ""): string = ## Generates the code based on the given AST (optionally at runtime) and restricts ## it to a single global kernel (WebGPU) if any given. - let ast = ast.clone() ## XXX: remove clone var ctx = GpuContext() + for fn in gen.procs: # assign generics info to correct table + ctx.genericInsts[fn.pName] = fn + for typ in gen.types: # assign generics info to correct table + case typ.kind + of gpuTypeDef: + ctx.types[typ.tTyp] = typ + of gpuAlias: + ctx.types[typ.aTyp] = typ + else: raiseAssert "Unexpected node kind assigning to `types`: " & $typ result = ctx.codegen(ast, kernel) + when isMainModule: # Mini example let kernel = cuda: diff --git a/constantine/math_compiler/experimental/gpu_field_ops.nim b/constantine/math_compiler/experimental/gpu_field_ops.nim index c16bc900..cc96abc3 100644 --- a/constantine/math_compiler/experimental/gpu_field_ops.nim +++ b/constantine/math_compiler/experimental/gpu_field_ops.nim @@ -189,52 +189,58 @@ template defWGSLHelpers*(): untyped {.dirty.} = ## Global variable to simulate carry flag. Private == one for each thread var carry_flag {.private.}: uint32 = 0'u32 - # Add with carry out (sets carry flag) proc add_co(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry out (sets carry flag) let result = a + b # Check for overflow: carry occurs if result < a (or result < b) carry_flag = select(0'u32, 1'u32, result < a) return result - # Add with carry in and carry out proc add_cio(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in and carry out let temp = a + b let result = temp + carry_flag # Carry out if: temp overflowed OR (temp + carry overflowed) carry_flag = select(0'u32, 1'u32, (temp < a) or (result < temp)) return result - # Add with carry in only proc add_ci(a: uint32, b: uint32): uint32 {.device.} = + # Add with carry in only. + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! let temp = a + b let result = temp + carry_flag # Don't update carry flag for this operation return result - # Subtract with borrow out (sets borrow flag) proc sub_bo(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow out (sets borrow flag) let result = a - b # Borrow occurs if a < b carry_flag = select(0'u32, 1'u32, a < b) return result - # Subtract with borrow in only - proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in and borrow out + # NOTE: `carry_flag` is not reset, because the next call after + # an `add_ci` *must* be `add_co` or `sub_bo`, but never + # `add/sub_cio/ci`! let temp = a - b let result = temp - carry_flag - # Don't update carry flag for this operation + # Borrow out if: a < b OR (temp - borrow underflowed) + carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) return result - # Subtract with borrow in and borrow out - proc sub_bio(a: uint32, b: uint32): uint32 {.device.} = + proc sub_bi(a: uint32, b: uint32): uint32 {.device.} = + # Subtract with borrow in only let temp = a - b let result = temp - carry_flag - # Borrow out if: a < b OR (temp - borrow underflowed) - carry_flag = select(0'u32, 1'u32, (a < b) or (temp < carry_flag)) + # Don't update carry flag for this operation return result - # Select based on condition (equivalent to PTX slct) proc slct(a: uint32, b: uint32, pred: int32): uint32 {.device.} = + # Select based on condition (equivalent to PTX slct) return select(b, a, pred >= 0) proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} = @@ -261,8 +267,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = return p3 + (p1 shr 16) + (p2 shr 16) + carry - # r <- a * b + c (multiply-add low) proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- a * b + c (multiply-add low) return mul_lo(a, b) + c proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -280,8 +286,8 @@ template defWGSLHelpers*(): untyped {.dirty.} = let product = mul_lo(a, b) return add_cio(product, c) - # r <- (a * b) >> 32 + c (multiply-add high) proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} = + # r <- (a * b) >> 32 + c (multiply-add high) return mul_hi(a, b) + c proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = @@ -300,6 +306,113 @@ template defWGSLHelpers*(): untyped {.dirty.} = return add_cio(hi_product, c) +template defBigIntCompare*(): untyped {.dirty.} = + ## This template adds a comparison operator for BigInts `<` (which is rewritten to + ## a function call `less`) as well as a `toCanonical` function to turn a Montgomery + ## representation into a canonical representation. + ## It is included in the `defCoreFieldOps` by default, so you need not manually use it. + + proc less(a, b: BigInt): bool {.device.} = + ## Returns true if a < b for two big ints in *canonical* + ## representation. + ## + ## NOTE: The inputs are compared *as is*. That means if they are + ## in Montgomery representation the result will not reflect the + ## ordering relation of their associated canonical values! + ## Call `toCanonical` on field elements in Montgomery order before + ## comparing them. + ## + ## Comparison is constant-time + var borrow: uint32 + # calculate sub with borrows for side effect. Use borrow flag + # at the end to determine if value was smaller + discard sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + discard sub_bio(a[i], b[i]) + borrow = sub_bi(0'u32, 0'u32) + return borrow.bool + + # template to rewrite `<` into a function call. Most backends don't allow custom operators + template `<`(b1, b2: BigInt): untyped = less(b1, b2) + + proc muladd1_gpu(hi, lo: var uint32, a, b, c: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd1` + lo = mulloadd_co(a, b, c) # low part of a*b + c with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + + proc muladd2_gpu(hi, lo: var uint32, a, b, c1, c2: uint32) {.device, forceinline.} = + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + ## + ## Note: `_gpu` prefix to not confuse Nim compiler with `precompute/muladd2` + lo = mulloadd_co(a, b, c1) # low part of a*b + c1 with carry out + hi = mulhiadd_ci(a, b, 0'u32) # high part of a*b with carry in + # Add c2 with carry propagation + lo = add_co(lo, c2) + hi = add_ci(hi, 0'u32) + + proc sub_no_mod(a, b: BigInt): BigInt {.device.} = + ## Generate an optimized substraction kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + ## I.e. this does _not_ perform modular reduction. + var t = BigInt() + t[0] = sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + t[i] = sub_bio(a[i], b[i]) + return t + + proc sub_no_mod(r: var BigInt, a, b: BigInt) {.device.} = + ## Subtraction of two finite field elements stored in `a` and `b` + ## *without* modular reduction. + ## The result is stored in `r`. + r = sub_no_mod(a, b) + + proc csub_no_mod(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally subtract `a` from `r` in place *without* modular + ## reduction. + ## + ## Note: This is constant-time + var t = BigInt() + t.sub_no_mod(r, a) + r.ccopy(t, condition) + + proc fromMont_CIOS(r: var BigInt, a, M: BigInt, m0ninv: uint32) {.device.} = + ## Convert from Montgomery form to canonical BigInt form + # for i in 0 .. n-1: + # m <- t[0] * m0ninv mod 2ʷ (i.e. simple multiplication) + # C, _ = t[0] + m * M[0] + # for j in 1 ..n-1: + # (C, t[j-1]) <- r[j] + m*M[j] + C + # t[n-1] = C + + var t = a # Ensure working in registers + + staticFor i, 0, N: + let m = t[0] * m0ninv + var C, lo: uint32 + muladd1_gpu(C, lo, m, M[0], t[0]) + staticFor j, 1, N: + muladd2_gpu(C, t[j-1], m, M[j], C, t[j]) + t[N-1] = C + + t.csub_no_mod(M, not (t < M)) + r = t + + proc toCanonical(b: BigInt): BigInt {.device.} = + var canon: BigInt + canon.fromMont_CIOS(b, M, M0NInv) + return canon + template defCoreFieldOps*(T: typed): untyped {.dirty.} = # Need to get the limbs & spare bits data in a static context template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32 @@ -314,7 +427,11 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2)) const M0NInv = getM0ninv().uint32 - proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = + # `ccopy` needed for BigInt comparison logic + proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} + defBigIntCompare() # contains `toCanonical` and `<` comparison for canonical BigInts + + proc finalSubMayOverflow(a, M: BigInt, overflowedLimbs: uint32): BigInt {.device.} = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -325,9 +442,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) var scratch: BigInt = BigInt() - # Contains 0x0001 (if overflowed limbs) or 0x0000 - let overflowedLimbs = add_ci(0'u32, 0'u32) - # Now substract the modulus, and test a < M with the last borrow scratch[0] = sub_bo(a[0], M[0]) staticFor i, 1, N: @@ -337,9 +451,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # 2. if a >= M, underflowedModulus >= 0 # if underflowedModulus >= 0: a-M else: a # TODO: predicated mov instead? - ## TODO: Fix this. `slct` needs a negative value for the else branch let underflowedModulus = sub_bi(overflowedLimbs, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -362,9 +474,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = scratch[i] = sub_bio(a[i], M[i]) # If it underflows here, `a` was smaller than the modulus, which is what we want - ## TODO: Fix this. `slct` needs a negative value for the else branch let underflowedModulus = sub_bi(0'u32, 0'u32) - var r: BigInt = BigInt() staticFor i, 0, N: r[i] = slct(scratch[i], a[i], cast[int32](underflowedModulus)) @@ -373,18 +483,21 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc modadd(a, b, M: BigInt): BigInt {.device.} = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - # try to add two bigints var t = BigInt() # temporary t[0] = add_co(a[0], b[0]) staticFor i, 1, N: t[i] = add_cio(a[i], b[i]) - # can use `when` of course! - when spareBits() >= 1: # if spareBits() >= 1: # would also work + when spareBits() >= 1: t = finalSubNoOverflow(t, M) else: - t = finalSubMayOverflow(t, M) + # Contains 0x0001 (if overflowed limbs) or 0x0000 + # This _must_ be computed here and not inside of `finalSubMayOverflow`. In a + # debug build on CUDA the carry flag would (potentially) be reset going into + # the function. + let overflowedLimbs = add_ci(0'u32, 0'u32) + t = finalSubMayOverflow(t, M, overflowedLimbs) return t @@ -398,7 +511,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = sub_bio(a[i], b[i]) let underflowMask = sub_bi(0'u32, 0'u32) - # If underflow # TODO: predicated mov instead? var maskedM: BigInt = BigInt() @@ -410,7 +522,6 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = t[i] = add_cio(t[i], maskedM[i]) when N > 1: t[N-1] = add_ci(t[N-1], maskedM[N-1]) - return t proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} = @@ -576,13 +687,8 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = ## The result is stored in `r`. r = modsub(a, b, M) - proc mul(r: var BigInt, a, b: BigInt) {.device.} = - ## Multiplication of two finite field elements stored in `a` and `b`. - ## The result is stored in `r`. - r = mtymul_CIOS_sparebit(a, b, M, true) - proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = - ## Conditional copy in CUDA + ## Conditional copy. ## If condition is true: b is copied into a ## If condition is false: a is left unmodified ## @@ -599,7 +705,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = a[i] = slct(b[i], a[i], cond) proc csetZero(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to zero in CUDA + ## Conditionally set `r` to zero. ## ## Note: This is constant-time var t = BigInt() @@ -607,14 +713,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csetOne(r: var BigInt, condition: bool) {.device.} = - ## Conditionally set `r` to one in CUDA + ## Conditionally set `r` to one. ## ## Note: This is constant-time template mOne: untyped = MontyOne r.ccopy(mOne, condition) proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally add `a` to `r` in place in CUDA. + ## Conditionally add `a` to `r` in place.. ## ## Note: This is constant-time var t = BigInt() @@ -622,7 +728,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} = - ## Conditionally subtract `a` from `r` in place in CUDA. + ## Conditionally subtract `a` from `r` in place. ## ## Note: This is constant-time var t = BigInt() @@ -630,14 +736,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r.ccopy(t, condition) proc doubleElement(r: var BigInt, a: BigInt) {.device.} = - ## Double `a` and store it in `r` in CUDA. + ## Double `a` and store it in `r`. ## ## Note: This is constant-time r.add(a, a) proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} = - ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r` - ## in CUDA. + ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`. ## ## Note: This is constant-time ## @@ -649,7 +754,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true) proc isZero(r: var bool, a: BigInt) {.device.} = - ## Checks if `a` is zero in CUDA. Result is written to `r`. + ## Checks if `a` is zero. Result is written to `r`. ## ## Note: This is constant-time #r = true @@ -660,8 +765,13 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = isZero = isZero or a[i] r = isZero == 0'u32 + proc isZero(a: BigInt): bool {.device, forceinline.} = + result.isZero(a) + proc isNonZero(a: BigInt): bool {.device, forceinline.} = + result = not isZero(a) + proc isOdd(r: var bool, a: BigInt) {.device.} = - ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`. + ## Checks if the Montgomery value of `a` is odd. Result is written to `r`. ## ## IMPORTANT: The canonical value may or may not be odd if the Montgomery ## representation is odd (and vice versa!). @@ -671,7 +781,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r = (a[0] and 1'u32).bool proc neg(r: var BigInt, a: BigInt) {.device.} = - ## Computes the negation of `a` and stores it in `r` in CUDA. + ## Computes the negation of `a` and stores it in `r`. ## ## Note: This is constant-time # Check if input is zero @@ -687,14 +797,14 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} = ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise - ## copy over `a` into `r` in CUDA. + ## copy over `a` into `r`. ## ## Note: This is constant-time r.neg(a) r.ccopy(a, not condition) proc shiftRight(r: var BigInt, k: uint32) {.device.} = - ## Shift `r` right by `k` bits in-nplace in CUDA. + ## Shift `r` right by `k` bits in-nplace. ## ## k MUST be less than the base word size (2^31) ## @@ -716,7 +826,7 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = r[lastIdx] = r[lastIdx] shr k proc div2(r: var BigInt) {.device.} = - ## Divide `r` by 2 in-place in CUDA. + ## Divide `r` by 2 in-place. ## ## Note: This is constant-time # check if the input is odd @@ -728,3 +838,71 @@ template defCoreFieldOps*(T: typed): untyped {.dirty.} = # if it was odd, add `M+1/2` to go 'half-way around' r.cadd(PP1D2, isO) + + proc mul_lohi(hi, lo: var uint32, a, b: uint32) {.device, forceinline.} = + lo = mul_lo(a, b) + hi = mul_hi(a, b) + + proc mulAcc(t, u, v: var uint32, a, b: uint32) {.device, forceinline.} = + ## (t, u, v) <- (t, u, v) + a * b + v = mulloadd_co(a, b, v) # v = (a*b).low + v, with carry out + u = mulhiadd_cio(a, b, u) # u = (a*b).high + u + carry, with carry out + t = add_ci(t, 0'u32) # t = t + carry + + proc mtymul_FIPS(a, b, M: BigInt, lazyReduce: static bool = false): BigInt {.device.} = + ## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS). + ## This implementation can be used for fields that do not have any spare bits. + ## + ## This maps + ## - [0, 2p) -> [0, 2p) with lazyReduce + ## - [0, 2p) -> [0, p) without + ## + ## lazyReduce skips the final substraction step. + # - Architectural Enhancements for Montgomery + # Multiplication on Embedded RISC Processors + # Johann Großschädl and Guy-Armand Kamendje, 2003 + # https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf + # + # - New Speed Records for Montgomery Modular + # Multiplication on 8-bit AVR Microcontrollers + # Zhe Liu and Johann Großschädl, 2013 + # https://eprint.iacr.org/2013/882.pdf + template m0ninv: untyped = M0NInv + var z = BigInt() # zero-init, ensure on stack and removes in-place problems in tower fields + const L = a.len + var t, u, v = 0'u32 + + staticFor i, 0, L: + staticFor j, 0, i: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + mulAcc(t, u, v, a[i], b[0]) + z[i] = v * m0ninv + mulAcc(t, u, v, z[i], M[0]) + v = u + u = t + t = 0'u32 + + staticFor i, L, 2*L: + staticFor j, i-L+1, L: + mulAcc(t, u, v, a[j], b[i-j]) + mulAcc(t, u, v, z[j], M[i-j]) + z[i-L] = v + v = u + u = t + t = 0'u32 + + when not lazyReduce: + let cond = v != 0 or not(z < M) + # conditionally subtract using *non modular subtraction*. If `cond == true`, + # we are in `M <= z <= 2M` and can safely subtract `M`. + z.csub_no_mod(M, cond) + return z + + proc mul(r: var BigInt, a, b: BigInt) {.device.} = + ## Multiplication of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + when spareBits() >= 1: + r = mtymul_CIOS_sparebit(a, b, M, true) + else: # e.g. Goldilocks + r = mtymul_FIPS(a, b, M, false) diff --git a/constantine/math_compiler/experimental/gpu_types.nim b/constantine/math_compiler/experimental/gpu_types.nim index 448a783e..834d8fa5 100644 --- a/constantine/math_compiler/experimental/gpu_types.nim +++ b/constantine/math_compiler/experimental/gpu_types.nim @@ -33,6 +33,7 @@ type gpuDot # Member access (a.b) gpuIndex # Array indexing (a[b]) gpuTypeDef # Type definition + gpuAlias # A type alias gpuObjConstr # Object (struct) constructor gpuInlineAsm # Inline assembly (PTX) gpuAddr # Address of an expression @@ -45,18 +46,23 @@ type GpuTypeKind* = enum gtVoid, gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics - gtArray, # Static array `array[N, dtype]` -> `dtype[N]` + gtArray, # Static array `array[N, dtype]` -> `dtype[N]` gtString, - gtObject, # Struct types - gtPtr, # Pointer type, carries inner type - gtUA, # UncheckedArray (UA) mapped to runtime sized arrays - gtVoidPtr # Opaque void pointer + gtObject, # Struct types + gtPtr, # Pointer type, carries inner type + gtUA, # UncheckedArray (UA) mapped to runtime sized arrays + gtGenericInst, # Instantiated generic type with one or more generic arguments (instantiated!) + gtVoidPtr # Opaque void pointer + gtInvalid # Can be returned to indicate a call to `nimToGpuType` failed to determine a type + ## XXX: make this the default value and replace all `gtVoid` placeholders by it + GpuTypeField* = object name*: string typ*: GpuType GpuType* = ref object + builtin*: bool ## Whether the type refers to a builtin type or not case kind*: GpuTypeKind of gtPtr: to*: GpuType # `ptr T` points to `to` @@ -74,6 +80,10 @@ type aLen*: int # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time # On both CUDA and WebGPU a length of `0` is also used to generate `int foo[]` (CUDA) # `array` (WebGPU) (runtime sized arrays), which are generated from `ptr UncheckedArray[float32]` for example. + of gtGenericInst: + gName*: string # name of the generic type + gArgs*: seq[GpuType] # list of the instantiated generic arguments e.g. `vec3` on WGSL backend + gFields*: seq[GpuTypeField] # same as `oFields` for `gtObject` else: discard GpuAttribute* = enum @@ -97,7 +107,9 @@ type pParams*: seq[GpuParam] pBody*: GpuAst pAttributes*: set[GpuAttribute] # order not important, hence set + forwardDeclare*: bool ## can be set to true to _only_ generate a forward declaration of gpuCall: + cIsExpr*: bool ## If the call returns a value cName*: GpuAst ## Will be a `GpuIdent` cArgs*: seq[GpuAst] of gpuTemplateCall: @@ -115,8 +127,10 @@ type wCond*: GpuAst wBody*: GpuAst of gpuBinOp: - bOp*: string + bOp*: GpuAst # `gpuIdent` of the binary operation bLeft*, bRight*: GpuAst + # types of left and right nodes. Determined from Nim symbol associated with `bOp` + bLeftTyp*, bRightTyp*: GpuType of gpuVar: vName*: GpuAst ## Will be a `GpuIdent` vType*: GpuType @@ -141,9 +155,10 @@ type cValue*: GpuAst # not just a string to support different types easily cType*: GpuType of gpuArrayLit: - aValues*: seq[string] ## XXX: make `GpuAst` for case where we store a symbol in an array + aValues*: seq[GpuAst] aLitType*: GpuType # type of first element of gpuBlock: + isExpr*: bool ## Whether this block represents an expression, i.e. it returns something blockLabel*: string # optional name of the block. If any given, will open a `{ }` scope in CUDA statements*: seq[GpuAst] ## XXX: we could add a `locals` argument here, which would refer to all local variables @@ -159,10 +174,14 @@ type pOp*: string pVal*: GpuAst of gpuTypeDef: - tName*: string ## XXX: could make GpuAst, but don't really need the types as symbols + tTyp*: GpuType ## the actual type. Used to generate the name tFields*: seq[GpuTypeField] + of gpuAlias: + aTyp*: GpuType ## Name of the type alias + aTo*: GpuAst ## Type the alias maps to + aDistinct*: bool ## If the alias is a distinct type in Nim. of gpuObjConstr: - ocName*: string # type we construct + ocType*: GpuType # type we construct ## XXX: it would be better if we already fill the fields with default values here ocFields*: seq[GpuFieldInit] # the fields we initialize of gpuInlineAsm: @@ -212,12 +231,17 @@ type GpuFieldInit* = object name*: string value*: GpuAst + typ*: GpuType ## XXX: UNUSED TemplateInfo* = object params*: seq[string] body*: GpuAst + GpuProcSignature* = object + params*: seq[GpuParam] + retType*: GpuType + GpuContext* = object ## XXX: need table for generic invocations. Then when we encounter a type, need to map to ## the specific version @@ -245,6 +269,47 @@ type # ## when we finish, we pop. Before we pop, we assign the variable definitions to the `gpuBlock` # ## `locals` genSymCount*: int ## increases for every generated identifier (currently only underscore `_`), hence the basic solution + ## Maps a struct type and field name, which is of pointer type to the value the user assigns + ## in the constructor. Allows us to later replace `foo.ptrField` by the assignment in the `Foo()` + ## constructor (WebGPU only). + structsWithPtrs*: Table[(GpuType, string), GpuAst] + ## Set of all generic proc names we have encountered in Nim -> GpuAst. When + ## we see an `nnkCall` we check if we call a generic function. If so, look up + ## the instantiated generic, parse it and store in `genericInsts` below. + generics*: HashSet[string] + + ## Stores the unique identifiers (keys) and the implementations of the + ## precise generic instantiations that are called. + genericInsts*: OrderedTable[GpuAst, GpuAst] + + ## Table of procs and their signature to avoid looping infinitely for recursive procs + ## Arguments are: + ## - Key: ident of the proc + ## - Value: signature of the (possibly generic) instantiation + processedProcs*: OrderedTable[GpuAst, GpuProcSignature] + + ## Storse all builtin / nimonly / importc / ... functions we encounter so that we can + ## check if they return a value when we encounter them in a `gpuCall` + builtins*: OrderedTable[GpuAst, GpuAst] + + ## Table of all known types. Filled during Nim -> GpuAst. Includes generic + ## instantiations, but also all other types. + ## Key: the raw type. Value: a full `gpuTypeDef` + types*: OrderedTable[GpuType, GpuAst] + + ## This is _effectively_ just a set of all already produced function symbols. + ## We use it to determine if when encountering another function with the same + ## name, but different arguments to instead of using `iName` to use `iSym` as + ## the function name. This is to avoid overload issues in backends that don't + ## allow overloading by function signatures. + symChoices*: HashSet[string] + + ## We rely on being able to compute a `newLit` from the result of `toGpuAst`. Currently we + ## only need the `genericInsts` field data (the values). Trying to `newLit` the full `GpuContext` + ## causes trouble. + GpuGenericsInfo* = object + procs*: seq[GpuAst] + types*: seq[GpuAst] GenericArg* = object addrSpace*: AddressSpace ## We store the address space, because that's what matters @@ -274,6 +339,12 @@ proc clone*(typ: GpuType): GpuType = of gtArray: result.aTyp = typ.aTyp.clone() result.aLen = typ.aLen + of gtGenericInst: + result.gName = typ.gName + for g in typ.gArgs: + result.gArgs.add g.clone() + for f in typ.gFields: + result.gFields.add GpuTypeField(name: f.name, typ: f.typ.clone()) else: discard proc clone*(ast: GpuAst): GpuAst = @@ -293,8 +364,10 @@ proc clone*(ast: GpuAst): GpuAst = result.pParams.add(clonedParam) result.pBody = ast.pBody.clone() result.pAttributes = ast.pAttributes + result.forwardDeclare = result.forwardDeclare of gpuCall: result = GpuAst(kind: gpuCall) + result.cIsExpr = ast.cIsExpr result.cName = ast.cName.clone() for arg in ast.cArgs: result.cArgs.add(arg.clone()) @@ -320,9 +393,11 @@ proc clone*(ast: GpuAst): GpuAst = result.wBody = ast.wBody.clone() of gpuBinOp: result = GpuAst(kind: gpuBinOp) - result.bOp = ast.bOp + result.bOp = ast.bOp.clone() result.bLeft = ast.bLeft.clone() result.bRight = ast.bRight.clone() + result.bLeftTyp = ast.bLeftTyp.clone() + result.bRightTyp = ast.bRightTyp.clone() of gpuVar: result = GpuAst(kind: gpuVar) result.vName = ast.vName.clone() @@ -354,7 +429,8 @@ proc clone*(ast: GpuAst): GpuAst = result.cType = ast.cType.clone() of gpuArrayLit: result = GpuAst(kind: gpuArrayLit) - result.aValues = ast.aValues + for a in ast.aValues: + result.aValues.add a.clone() result.aLitType = ast.aLitType.clone() of gpuPrefix: result = GpuAst(kind: gpuPrefix) @@ -362,6 +438,7 @@ proc clone*(ast: GpuAst): GpuAst = result.pVal = ast.pVal.clone() of gpuBlock: result = GpuAst(kind: gpuBlock) + result.isExpr = ast.isExpr result.blockLabel = ast.blockLabel for stmt in ast.statements: result.statements.add(stmt.clone()) @@ -378,14 +455,24 @@ proc clone*(ast: GpuAst): GpuAst = result.iIndex = ast.iIndex.clone() of gpuTypeDef: result = GpuAst(kind: gpuTypeDef) - result.tName = ast.tName + result.tTyp = ast.tTyp.clone() for f in ast.tFields: result.tFields.add(GpuTypeField(name: f.name, typ: f.typ.clone())) + of gpuAlias: + result = GpuAst(kind: gpuAlias) + result.aTyp = ast.aTyp.clone() + result.aTo = ast.aTo.clone() of gpuObjConstr: result = GpuAst(kind: gpuObjConstr) - result.ocName = ast.ocName + result.ocType = ast.ocType.clone() for f in ast.ocFields: - result.ocFields.add(GpuFieldInit(name: f.name, value: f.value.clone())) + result.ocFields.add( + GpuFieldInit( + name: f.name, + value: f.value.clone(), + typ: f.typ.clone() + ) + ) of gpuInlineAsm: result = GpuAst(kind: gpuInlineAsm) result.stmt = ast.stmt @@ -423,6 +510,12 @@ proc hash*(t: GpuType): Hash = of gtArray: h = h !& hash(t.aTyp) h = h !& hash(t.aLen) + of gtGenericInst: + h = h !& hash(t.gName) + for g in t.gArgs: + h = h !& hash(g) + for f in t.gFields: + h = h !& hash(f) else: discard result = !$ h @@ -451,6 +544,15 @@ proc `==`*(a, b: GpuType): bool = else: for i in 0 ..< a.oFields.len: result = result and (a.oFields[i] == b.oFields[i]) + of gtGenericInst: + result = a.gName == b.gName + if a.gArgs.len != b.gArgs.len: result = false + elif a.gFields.len != b.gFields.len: result = false + else: + for i in 0 ..< a.gArgs.len: + result = result and (a.gArgs[i] == b.gArgs[i]) + for i in 0 ..< a.gFields.len: + result = result and (a.gFields[i] == b.gFields[i]) of gtArray: result = a.aTyp == b.aTyp and a.aLen == b.aLen else: discard @@ -461,6 +563,17 @@ proc `==`*(a, b: GpuAst): bool = else: result = a.iSym == b.iSym and a.iTyp == b.iTyp and a.symbolKind == b.symbolKind +proc `==`*(a, b: GpuProcSignature): bool = + if a.retType != b.retType: result = false + elif a.params.len != b.params.len: + result = false + else: + result = true + for i in 0 ..< a.params.len: + let ap = a.params[i] + let bp = b.params[i] + result = result and (ap == bp) + proc len*(ast: GpuAst): int = case ast.kind of gpuProc: 1 @@ -496,6 +609,31 @@ proc removePrefix(s, p: string): string = result = s result.removePrefix(p) +proc pretty*(t: GpuType): string = + ## returns a flat (but lossy) string representation of the type + if t == nil: + result = "GpuType(nil)" + else: + case t.kind + of gtPtr: + result = if t.implicit: "var " else: "ptr " + result.add pretty(t.to) + of gtUA: + result = "UncheckedArray[" & t.uaTo.pretty() & "]" + of gtObject: + result = t.name # just the name + of gtArray: + result = "array[" & $t.aLen & ", " & t.aTyp.pretty() & "]" + of gtGenericInst: + result = t.gName & "[" + for i, g in t.gArgs: + result.add pretty(g) + if i < t.gArgs.high: + result.add ", " + result.add "]" + else: + result = ($t.kind).removePrefix("gt") + proc pretty*(n: GpuAst, indent: int = 0): string = template id(): untyped = repeat(" ", indent) template idn(x): untyped = repeat(" ", indent) & $x @@ -545,17 +683,17 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.wCond, indent + 2) result.add pretty(n.wBody, indent + 2) of gpuBinOp: - result.add idd("Ident", n.bOp) + result.add pretty(n.bOp, indent + 2) result.add pretty(n.bLeft, indent + 2) result.add pretty(n.bRight, indent + 2) of gpuVar: result.add pretty(n.vName, indent + 2) result.add pretty(n.vInit, indent + 2) if n.vAttributes.len > 0: - result.add id("Attributes") + result.add idd("Attributes") for attr in n.vAttributes: let indent = indent + 2 - result.add id(attr) + result.add idd(attr) of gpuAssign: result.add pretty(n.aLeft, indent + 2) result.add pretty(n.aRight, indent + 2) @@ -568,7 +706,7 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add pretty(n.cValue, indent + 2) of gpuArrayLit: for el in n.aValues: - result.add id(el) + result.add pretty(el, indent + 2) of gpuBlock: if n.blockLabel.len > 0: result.add id("Label", n.blockLabel) @@ -586,17 +724,22 @@ proc pretty*(n: GpuAst, indent: int = 0): string = result.add id("Op", n.pOp) result.add pretty(n.pVal, indent + 2) of gpuTypeDef: - result.add id("Type", n.tName) + result.add id("Type", pretty(n.tTyp)) result.add id("Fields") for t in n.tFields: let indent = indent + 2 result.add id(t.name) + of gpuAlias: + result.add id("Alias", pretty(n.aTyp)) + result.add pretty(n.aTo, indent + 2) of gpuObjConstr: - result.add id("Ident", n.ocName) - result.add id("Fields") + result.add idd("Ident", pretty(n.ocType)) + result.add idd("Fields") for f in n.ocFields: - let indent = indent + 2 - result.add id("Name", f.name) + var indent = indent + 2 + result.add idd("Field") + indent = indent + 2 + result.add idd("Name", f.name) result.add pretty(f.value, indent + 2) of gpuInlineAsm: result.add id(n.stmt) @@ -694,6 +837,19 @@ iterator mitems*(ast: var GpuAst): var GpuAst = iterator items*(ast: GpuAst): GpuAst = iterImpl(ast, mutable = false) +iterator mpairs*(ast: var GpuAst): (int, var GpuAst) = + ## Iterate over all child nodes of the given AST and the index + var i = 0 + for el in mitems(ast): + yield (i, el) + inc i + +iterator pairs*(ast: GpuAst): (int, GpuAst) = + var i = 0 + for el in items(ast): + yield (i, el) + inc i + ## General utility helpers @@ -704,9 +860,12 @@ proc ident*(n: GpuAst): string = result = n.iName template withoutSemicolon*(ctx: var GpuContext, body: untyped): untyped = - ctx.skipSemicolon = true - body - ctx.skipSemicolon = false + if not ctx.skipSemicolon: # if we are already skipping, leave true + ctx.skipSemicolon = true + body + ctx.skipSemicolon = false + else: + body proc getInnerArrayLengths*(t: GpuType): string = ## Returns the lengths of the inner array types for a nested array. diff --git a/constantine/math_compiler/experimental/nim_to_gpu.nim b/constantine/math_compiler/experimental/nim_to_gpu.nim index 45af3782..5373581b 100644 --- a/constantine/math_compiler/experimental/nim_to_gpu.nim +++ b/constantine/math_compiler/experimental/nim_to_gpu.nim @@ -11,7 +11,7 @@ import std / [macros, strutils, sequtils, options, sugar, tables, strformat, has import ./gpu_types import ./backends/backends -proc nimToGpuType(n: NimNode): GpuType +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType proc initGpuType(kind: GpuTypeKind): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to @@ -20,11 +20,17 @@ proc initGpuType(kind: GpuTypeKind): GpuType = proc initGpuPtrType(to: GpuType, implicitPtr: bool): GpuType = ## If `kind` is `gtPtr` `to` must be the type we point to - result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtPtr, to: to, implicit: implicitPtr) proc initGpuUAType(to: GpuType): GpuType = ## Initializes a GPU type for an unchecked array (ptr wraps this) - result = GpuType(kind: gtUA, uaTo: to) + if to.kind == gtInvalid: # this is not a valid type + result = GpuType(kind: gtInvalid) + else: + result = GpuType(kind: gtUA, uaTo: to) proc initGpuVoidPtr(): GpuType = result = GpuType(kind: gtVoidPtr) @@ -37,6 +43,17 @@ proc initGpuArrayType(aTyp: NimNode, len: int): GpuType = ## Construct an statically sized array type result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len) +proc toTypeDef(typ: GpuType): GpuAst = + ## Converts a given object or generic instantiation type into an AST of a + ## corresponding type def. + # store the type instantiation + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + case typ.kind + of gtObject: result.tFields = typ.oFields + of gtGenericInst: result.tFields = typ.gFields + else: + raiseAssert "Type: " & $pretty(typ) & " is neither object type nor generic instantiation." + proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = case t #of ntyBool, ntyChar: @@ -57,9 +74,62 @@ proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = of ntyUInt16: gtUint16 of ntyUInt32: gtUint32 of ntyUInt64: gtUint64 + of ntyString: gtString else: raiseAssert "Not supported yet: " & $t +proc parseTypeFields(node: NimNode): seq[GpuTypeField] + +proc getGenericTypeName(t: NimNode): string = + ## Returns the base name of the generic type, i.e. for + ## `Foo[Bar, Baz]` returns `Foo`. + case t.kind + of nnkSym: result = t.strVal + of nnkBracketExpr: result = t[0].getGenericTypeName() + else: raiseAssert "Unexpected node kind for generic instantiation type: " & $t.treerepr + +proc parseGenericArgs(t: NimNode): seq[GpuType] = + case t.kind + of nnkSym: return # no generic arguments + of nnkBracketExpr: + for i in 1 ..< t.len: + result.add nimToGpuType(t[i]) + else: + raiseAssert "Unexpected node kind in parseGenericArgs: " & $t.treerepr + +proc initGpuGenericInst(t: NimNode): GpuType = + doAssert t.typeKind == ntyGenericInst, "Input is not a generic instantiation: " & $t.treerepr & " of typeKind: " & $t.typeKind + case t.kind + of nnkBracketExpr: # regular generic instantiation + result = GpuType(kind: gtGenericInst, gName: getGenericTypeName(t)) + result.gArgs = parseGenericArgs(t) + # now parse the object fields + let impl = t.getTypeImpl() # impl for the `gFields` + result.gFields = parseTypeFields(impl) + of nnkObjConstr: + if t.len == 1: # Generic instantiation without arguments + result = initGpuGenericInst(t[0]) + elif t.len == 2: # ...and with arguments + doAssert t[1].kind == nnkExprColonExpr, "ObjConstr does not contain initialization as [1], but: " & $t.treerepr + result = initGpuGenericInst(t[0]) + else: + raiseAssert "Unexpected number of elements in `nnkObjConstr` node for generic instantiation: " & $t.treerepr + of nnkSym: + let impl = getTypeImpl(t) + case impl.kind + of nnkDistinctTy: + ## XXX: assumes distinct of inbuilt type, not object! + result = nimToGpuType(impl[0]) + of nnkObjectTy: + doAssert impl.kind == nnkObjectTy, "Unexpected node kind for generic inst: " & $impl.treerepr + ## XXX: use signature hash for type name? Otherwise will produce duplicates + result = GpuType(kind: gtGenericInst, gName: t.repr) + result.gFields = parseTypeFields(impl) + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr + else: + raiseAssert "Unexpected node kind in for genericInst: " & $t.treerepr + proc unpackGenericInst(t: NimNode): NimNode = let tKind = t.typeKind if tKind == ntyGenericInst: @@ -75,96 +145,201 @@ proc unpackGenericInst(t: NimNode): NimNode = proc toGpuTypeKind(t: NimNode): GpuTypeKind = result = t.unpackGenericInst().typeKind.toGpuTypeKind() -proc getInnerPointerType(n: NimNode): GpuType = +proc getInnerPointerType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind if n.typeKind in {ntyPointer, ntyUncheckedArray}: let typ = n.getTypeInst() doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr doAssert typ[0].kind in {nnkIdent, nnkSym} doAssert typ[0].strVal in ["ptr", "UncheckedArray"] - result = nimToGpuType(typ[1]) + result = nimToGpuType(typ[1], allowToFail, allowArrayIdent) elif n.kind == nnkPtrTy: - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) elif n.kind == nnkAddr: let typ = n.getTypeInst() - result = getInnerPointerType(typ) + result = getInnerPointerType(typ, allowToFail, allowArrayIdent) elif n.kind == nnkVarTy: # VarTy # Sym "BigInt" - result = nimToGpuType(n[0]) + result = nimToGpuType(n[0], allowToFail, allowArrayIdent) + elif n.kind == nnkSym: # symbol of e.g. `ntyVar` + result = nimToGpuType(n.getTypeInst(), allowToFail, allowArrayIdent) else: raiseAssert "Found what: " & $n.treerepr -proc determineArrayLength(n: NimNode): int = +proc determineArrayLength(n: NimNode, allowArrayIdent: bool): int = + ## If `allowArrayIdent` is true, we do not emit the error message when + ## encountering an ident. This is the case for procs taking arrays + ## with a static array where the constant comes from outside the + ## macro. In that case we return `-1` indicating + ## `proc mdsRowShfNaive(r: int, v: array[SPONGE_WIDTH, BigInt]): BigInt {.device.} =` case n[1].kind of nnkSym: # likely a constant, try to get its value result = n[1].getImpl.intVal of nnkIdent: - let msg = """Found array with length given by identifier: $#! + if not allowArrayIdent: + let msg = """Found array with length given by identifier: $#! You might want to create a typed template taking a typed parameter for this -constant to force the Nim compiler to bind the symbol. +constant to force the Nim compiler to bind the symbol. In theory though this +error should not appear anymore though, as we don't try to parse generic +functions. """ % n[1].strVal - raiseAssert msg + raiseAssert msg + else: + result = -1 # return -1 to indicate caller should look at symbol else: case n[1].kind of nnkIntLit: result = n[1].intVal else: + # E.g. + # BracketExpr + # Sym "array" + # Infix + # Ident ".." + # IntLit 0 + # IntLit 11 + # Sym "BigInt" #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr result = n[1][2].intVal + 1 -proc getTypeName(n: NimNode): string = +proc getTypeName(n: NimNode, recursedSym: bool = false): string +proc constructTupleTypeName(n: NimNode): string = + ## XXX: overthink if this should really be here and not somewhere else + ## + ## Given a tuple, generate a name from the field names and types, e.g. + ## `Tuple_lo_BaseType_hi_BaseType` + ## + ## XXX: `getTypeImpl.repr` is a hacky way to get a string name of the underlying + ## type, e.g. for `BaseType`. Aliases would lead to duplicate tuple types. + ## UPDATE: I changed the implementation to recurse into `getTypeName` + ## TODO: verify that this did not break the tuple test & specifically check for aliases + result = "Tuple_" + doAssert n.kind in [nnkTupleTy, nnkTupleConstr] + for i, ch in n: + case ch.kind + of nnkIdentDefs: + let typName = ch[ch.len - 2].getTypeName() # second to last is type name of field(s) + for j in 0 ..< ch.len - 2: + # Example: + # IdentDefs + # Ident "hi" + # Ident "lo" `..< ch.len - 2 ` + # Sym "BaseType" `..< ch.len - 1` + # Empty `..< ch.len` + result.add ch[j].strVal & "_" & typName + if j < ch.len - 3: + result.add "_" + if i < n.len - 1: + result.add "_" + of nnkExprColonExpr: + # ExprColonExpr + # Sym "hi" + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # -> these are tuple types that are constructed in place using `(foo: bar, ar: br)` + # give them a slightly different name + let typName = ch[0].getTypeName() ## XXX + doAssert ch[0].kind == nnkSym, "Not a symbol, but: " & $ch.treerepr + result.add ch[0].strVal & "_" & typName + if i < n.len - 1: + result.add "_" + of nnkSym: + # TupleConstr + # Sym "BaseType" <-- e.g. here + # Sym "BaseType" + let typName = ch.getTypeName() + result.add "Field" & $i & "_" & typName + if i < n.len - 1: + result.add "_" + else: + # TupleConstr e.g. a tuple constr like this + # Infix + # Sym "shr" + # Sym "n" + # IntLit 16 + # Infix + # Sym "and" + # Sym "n" + # UInt32Lit 65535 + # -> Try again with type impl + return constructTupleTypeName(getTypeImpl(n)) + +proc getTypeName(n: NimNode, recursedSym: bool = false): string = ## Returns the name of the type case n.kind - of nnkIdent, nnkSym: result = n.strVal + of nnkIdent: result = n.strVal + of nnkSym: + if recursedSym: + result = n.strVal + else: + result = n.getTypeInst.getTypeName(true) of nnkObjConstr: if n[0].kind == nnkEmpty: result = n.getTypeInst.strVal else: result = n[0].strVal # type is the first node + of nnkTupleTy, nnkTupleConstr: + result = constructTupleTypeName(n) + of nnkBracketExpr: + # construct a type name `Foo_Bar_Baz` + for i, ch in n: + result.add ch.getTypeName() + if i < n.len - 1: + result.add "_" else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr -proc parseTypeFields(node: NimNode): seq[GpuTypeField] -proc nimToGpuType(n: NimNode): GpuType = +proc nimToGpuType(n: NimNode, allowToFail: bool = false, allowArrayIdent: bool = false): GpuType = ## Maps a Nim type to a type on the GPU + ## + ## If `allowToFail` is `true`, we return `GpuType(kind: gtVoid)` in cases + ## where we would otherwise raise. This is so that in some cases where + ## we only _attempt_ to determine a type, we can do so safely. case n.kind of nnkIdentDefs: # extract type for let / var based on explicit or implicit type if n[n.len - 2].kind != nnkEmpty: # explicit type - result = nimToGpuType(n[n.len - 2]) + result = nimToGpuType(n[n.len - 2], allowToFail, allowArrayIdent) else: # take from last element - result = nimToGpuType(n[n.len - 1].getTypeInst()) + result = nimToGpuType(n[n.len - 1].getTypeInst(), allowToFail, allowArrayIdent) of nnkConstDef: if n[1].kind != nnkEmpty: # has an explicit type - result = nimToGpuType(n[1]) + result = nimToGpuType(n[1], allowToFail, allowArrayIdent) else: - result = nimToGpuType(n[2]) # derive from the RHS literal + result = nimToGpuType(n[2], allowToFail, allowArrayIdent) # derive from the RHS literal else: if n.kind == nnkEmpty: return initGpuType(gtVoid) case n.typeKind of ntyBool, ntyInt .. ntyUint64: # includes all float types result = initGpuType(toGpuTypeKind n.typeKind) + of ntyString: # only supported on some backends! + result = initGpuType(toGpuTypeKind n.typeKind) of ntyPtr: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = false) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = false) of ntyVar: - result = initGpuPtrType(getInnerPointerType(n), implicitPtr = true) + result = initGpuPtrType(getInnerPointerType(n, allowToFail, allowArrayIdent), implicitPtr = true) of ntyPointer: result = initGpuVoidPtr() of ntyUncheckedArray: ## Note: this is just the internal type of the array. It is only a pointer due to ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. - result = initGpuUAType(getInnerPointerType(n)) - of ntyObject: - let impl = n.getTypeImpl + result = initGpuUAType(getInnerPointerType(n, allowToFail, allowArrayIdent)) + of ntyObject, ntyAlias, ntyTuple: + # for aliases, treat them identical to regular object types, but + # `getTypeName` returns the alias! + let impl = if n.kind == nnkTupleConstr: n # might actually _lose_ information if used getTypeImpl + else: n.getTypeImpl let flds = impl.parseTypeFields() let typName = getTypeName(n) # might be an object construction result = initGpuObjectType(typName, flds) of ntyArray: # For a generic, static array type, e.g.: if n.kind == nnkSym: - return nimToGpuType(getTypeImpl(n)) + return nimToGpuType(getTypeImpl(n), allowToFail, allowArrayIdent) if n.len == 3: # BracketExpr # Sym "array" @@ -172,8 +347,16 @@ proc nimToGpuType(n: NimNode): GpuType = # Sym "uint32" doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr doAssert n[0].strVal == "array" - let len = determineArrayLength(n) - result = initGpuArrayType(n[2], len) + let len = determineArrayLength(n, allowArrayIdent) + if len < 0: + # indicates we found an array with an ident, e.g. + # BracketExpr + # Sym "array" + # Ident "SPONGE_WIDTH" + # Sym "BigInt" + return GpuType(kind: gtInvalid) + else: + result = initGpuArrayType(n[2], len) else: # just an array literal # Bracket @@ -184,11 +367,27 @@ proc nimToGpuType(n: NimNode): GpuType = # echo n.getTypeImpl.treerepr # error("o") of ntyGenericInvocation: - result = initGpuType(gtVoid) - error("Generics are not supported in the CUDA DSL so far.") + result = initGpuType(gtInvalid) + error("Generics are not supported in the CUDA DSL so far.") # Note: this should not appear nowadays of ntyGenericInst: - result = n.unpackGenericInst().nimToGpuType() - else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr + result = initGpuGenericInst(n) + of ntyTypeDesc: + # `getType` returns a `BracketExpr` of eg: + # BracketExpr + # Sym "typeDesc" + # Sym "float32" + result = n.getType[1].nimToGpuType(allowToFail, allowArrayIdent) # for a type desc we need to recurse using the type of it + of ntyUnused2: + # BracketExpr + # Sym "lent" + # Sym "BigInt" + doAssert n.kind == nnkBracketExpr and n[0].strVal == "lent", "ntyUnused2: " & $n.treerepr + result = initGpuPtrType(nimToGpuType(n[1]), implicitPtr = false) + else: + if allowToFail: + result = GpuType(kind: gtVoid) + else: + raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr proc assignOp(op: string, isBoolean: bool): string = ## Returns the correct CUDA operation given the Nim operator. @@ -210,12 +409,34 @@ proc assignPrefixOp(op: string): string = else: result = op proc parseTypeFields(node: NimNode): seq[GpuTypeField] = - doAssert node.kind == nnkObjectTy - doAssert node[2].kind == nnkRecList - for ch in node[2]: - doAssert ch.kind == nnkIdentDefs and ch.len == 3 - result.add GpuTypeField(name: ch[0].strVal, - typ: nimToGpuType(ch[1])) + case node.kind + of nnkObjectTy: + doAssert node[2].kind == nnkRecList + for ch in node[2]: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleTy: + for ch in node: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + of nnkTupleConstr: + # TupleConstr + # Sym "BaseType" + # Sym "BaseType" + for i, ch in node: + case ch.kind + of nnkSym: + result.add GpuTypeField(name: "Field" & $i, + typ: nimToGpuType(ch)) + of nnkExprColonExpr: + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + else: + return parseTypeFields(node.getTypeImpl) # will likely fall back to constr with `nnkSym` + else: + raiseAssert "Unsupported type to parse fields from: " & $node.kind template findIdx(col, el): untyped = var res = -1 @@ -235,17 +456,37 @@ proc requiresMemcpy(n: NimNode): bool = ## At the moment we only emit a `memcpy` statement for array types result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy +proc isBuiltIn(n: NimNode): bool = + ## Checks if the given proc is a `{.builtin.}` (or if it is a Nim "built in" + ## proc that uses `importc`, as we cannot emit those; they _need_ to have a + ## WGSL / CUDA equivalent built in) + doAssert n.kind in [nnkProcDef, nnkFuncDef], "Argument is not a proc: " & $n.treerepr + for pragma in n.pragma: + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma + if pragma.strVal in ["builtin", "importc"]: + return true + proc collectProcAttributes(n: NimNode): set[GpuAttribute] = - doAssert n.kind == nnkPragma + doAssert n.kind in [nnkPragma, nnkEmpty] + if n.kind == nnkEmpty: return # no pragmas for pragma in n: - doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr + doAssert pragma.kind in [nnkIdent, nnkSym, nnkCall, nnkExprColonExpr], "Unexpected node kind: " & $pragma.treerepr + let pragma = if pragma.kind in [nnkCall, nnkExprColonExpr]: pragma[0] else: pragma case pragma.strVal of "device": result.incl attDevice of "global": result.incl attGlobal - of "forceinline": result.incl attForceInline - of "nimonly": + of "inline", "forceinline": result.incl attForceInline + of "nimonly", "builtin": # used to fully ignore functions! return + of "importc": # encountered if we analyze a proc from outside `cuda` scope + return # this _should_ be a builtin function that has a counterpart in Nim, e.g. `math.ceil` + of "varargs": # attached to some builtins, e.g. `printf` on CUDA backend + continue + of "magic": + return + of "raises": discard # result.incl attDevice #discard # XXX else: raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr @@ -270,17 +511,40 @@ proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes # do not match the CUDA string we need to emit, which is what the string value of # the `GpuVarAttribute` enum stores - case pragma.strVal - of "cuExtern", "extern": result.add atvExtern + case pragma.strVal.normalize + of "cuextern", "extern": result.add atvExtern of "shared": result.add atvShared of "private": result.add atvPrivate of "volatile": result.add atvVolatile of "constant": result.add atvConstant + of "noinit": discard # XXX: ignore for now else: raiseAssert "Unexpected pragma: " & $pragma.treerepr proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst +proc maybePatchFnName(n: var GpuAst) = + ## Patches the function name for names that are not allowed on most backends, but appear + ## commonly in Nim (custom operators). + ## + ## NOTE: I think that the binary operators don't actually appear as a `gpuCall`, but still + ## as an infix node, even after sem checking by the Nim compiler. + doAssert n.kind == gpuIdent + template patch(arg, by: untyped): untyped = + arg.iSym = arg.iSym.replace(arg.iName, by) + arg.iName = by + let name = n.iName + case name + of "[]": patch(n, "get") + of "[]=": patch(n, "set") + of "+": patch(n, "add") + of "-": patch(n, "sub") + of "*": patch(n, "mul") + of "/": patch(n, "div") + else: + # leave as is + discard + proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = ## Returns the name for the function. Either the symbol name _or_ ## the `{.cudaName.}` pragma argument. @@ -320,6 +584,20 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = result = ctx.toGpuAst(n) # if _no_ pragma else: result = ctx.toGpuAst(n) # if not proc or func + + # possibly patch function names, e.g. custom `[]`, `[]=`, `+` etc operators + # (inbuilt won't show up as a function name, but rather as a specific node kind, eg `nnkIndex` + result.maybePatchFnName() + + # handle overloads with different signatures + if n.strVal in ctx.symChoices: + # this is an overload of another function with different signature (not a generic, but + # overloads are not allowed in CUDA/WGSL/...). Update `sigTab` entry by using `iSym` + # for `iName` field for unique name + let id = ctx.sigTab[sig] + id.iName = id.iSym + else: + ctx.symChoices.incl result.iName # store this name in `symChoices` else: # else we use the str representation (repr for open / closed sym choice nodes) result = toAst n.repr @@ -328,6 +606,206 @@ proc getFnName(ctx: var GpuContext, n: NimNode): GpuAst = # ctx.sigTab[sig] = result result.symbolKind = gsProc # make sure it's a proc +proc gpuTypeMaybeFromSymbol(t: NimNode, n: NimNode): GpuType = + ## Returns the type from a given Nim node `t` representing a type. + ## If that fails due to an identifier in the type, we instead try + ## to look up the type from the associated symbol, `n`. + result = nimToGpuType(t, allowArrayIdent = true) + if result.kind == gtInvalid: + # an existing symbol cannot be `void` by definition, then it wouldn't be a symbol. Means + # `allowArrayIdent` triggered due to an ident in the type. Use symbol for type instead + result = n.getTypeInst.nimToGpuType() + +proc stripPtrOrArrayType(t: GpuType): GpuType = + ## Strips any pointer or array type to return any struct / generic instantiation + ## it might contain + case t.kind + of gtPtr: result = stripPtrOrArrayType t.to + of gtUA: result = stripPtrOrArrayType t.uaTo + of gtArray: result = stripPtrOrArrayType t.aTyp + else: result = t + +proc maybeAddType*(ctx: var GpuContext, typ: GpuType) = + ## Adds the given type to the table of known types, if it is some kind of + ## object type. + ## + ## XXX: What about aliases and distincts? + let typ = typ.stripPtrOrArrayType() # get any underlying type + if typ.kind in [gtObject, gtGenericInst] and typ notin ctx.types: + ctx.types[typ] = toTypeDef(typ) + +proc parseProcParameters(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): seq[GpuParam] = + ## Returns all parameters of the given procedure from the `params` node + ## of type `nnkFormalParams`. + doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + for i in 1 ..< params.len: + let param = params[i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = gpuTypeMaybeFromSymbol(param[typIdx], param[typIdx-1]) + ctx.maybeAddType(paramType) + for i in 0 ..< numParams: + var p = ctx.toGpuAst(param[i]) + let symKind = if attGlobal in attrs: gsGlobalKernelParam + else: gsDeviceKernelParam + p.iTyp = paramType ## Update the type of the symbol + p.symbolKind = symKind ## and the symbol kind + let param = GpuParam(ident: p, typ: paramType) + result.add(param) + +proc parseProcReturnType(ctx: var GpuContext, params: NimNode): GpuType = + ## Returns the return type of the given procedure from the `params` node + ## of type `nnkFormalParams`. + doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + let retType = params[0] # arg 0 is return type + if retType.kind == nnkEmpty: + result = GpuType(kind: gtVoid) # actual void return + else: + # attempt to get type. If fails, we need to wait for a caller to this function to get types + # (e.g. returns something like `array[FOO, BigInt]` where `FOO` is a constant defined outside + # the macro. We then rely on our generics logic to later look this up when called + result = nimToGpuType(retType, allowArrayIdent = true) + if result.kind == gtVoid: # stop parsing this function + result = GpuType(kind: gtInvalid) + ctx.maybeAddType(result) + +proc toGpuProcSignature(ctx: var GpuContext, params: NimNode, attrs: set[GpuAttribute]): GpuProcSignature = + ## Creates a `GpuProcSignature` from the given `params` node of type `nnkFormalParams` + + ## + ## NOTE: This procedure is only called from generically instantiated procs. Therefore, + ## we shouldn't need to worry about getting `gtInvalid` return types here. + doAssert params.kind == nnkFormalParams, "Argument is not FormalParams, but: " & $params.treerepr + result = GpuProcSignature(params: ctx.parseProcParameters(params, attrs), + retType: ctx.parseProcReturnType(params)) + +proc addProcToGenericInsts(ctx: var GpuContext, node: NimNode, name: GpuAst) = + ## Looks up the implementation of the given function and stores it in our table + ## of generic instantiations. + ## + ## For any looked up procedure, we attach the `{.device.}` pragma. + ## + ## Mutates the `name` of the given function to match its generic name. + # We need both `getImpl` for the *body* and `getTypeInst` for the actual signature + # Only the latter contains e.g. correct instantiation of static array sizes + let inst = node[0].getImpl() + let sig = node[0].getTypeInst() + inst.params = sig.params # copy over the parameters + + # turn the signature into a `GpuProcSignature` + let attrs = collectProcAttributes(inst.pragma) + let procSig = ctx.toGpuProcSignature(sig.params, attrs) + if name in ctx.processedProcs: + return + else: + # Need to add isym here so that if we have recursive calls, we don't end up + # calling `toGpuAst` recursively forever + ctx.processedProcs[name] = procSig + + let fn = ctx.toGpuAst(inst) + if fn.kind == gpuVoid: + # Should be an inbuilt proc, i.e. annotated with `{.builtin.}`. However, + # functions that are available otherwise (e.g. in Nim's system like `abs`) + # in Nim _and_ backends will also show up here. Unless we wanted to manually + # wrap all of these, we can just skip the `isBuiltin` check here. + # If the user uses something not available in the backend, they'll get a + # compiler error from that compiler. + # It's mostly a matter of usability: For common procs like `abs` we cannot + # so easily define a custom overload `proc abs(...): ... {.builtin.}`, because + # that would overwrite the Nim version. + # doAssert inst.isBuiltIn() + return + else: + fn.pAttributes.incl attDevice # make sure this is interpreted as a device function + doAssert fn.pName.iSym == name.iSym, "Not matching" + # now overwrite the identifier's `iName` field by its `iSym` so that different + # generic insts have different + fn.pName.iName = fn.pName.iSym + name.iName = fn.pName.iSym ## update the name of the called function + ctx.genericInsts[fn.pName] = fn + +proc isExpression(n: GpuAst): bool = + ## Returns whether the given AST node is an expression + case n.kind + of gpuCall: # only if it returns something! + result = n.cIsExpr + of gpuBinOp, gpuIdent, gpuLit, gpuArrayLit, gpuPrefix, gpuDot, gpuIndex, gpuObjConstr, + gpuAddr, gpuDeref, gpuConv, gpuCast, gpuConstExpr: + result = true + else: + result = false + +proc maybeInsertResult(ast: var GpuAst, retType: GpuType, fnName: string) = + ## Will insert a `gpuVar` for the implicit `result` variable, unless there + ## is a user defined `var result` that shadows it at the top level of the proc + ## body. + ## + ## Finally adds a `return result` statement if + ## - we add a `result` variable + ## - there is no `return` statement as the _last_ statement in the proc + if retType.kind == gtVoid: return # nothing to do if the proc returns nothing + + proc hasCustomResult(n: GpuAst): bool = + doAssert n.kind == gpuBlock + for ch in n: # iterate all top level statements in the proc body + case ch.kind + of gpuVar: + if ch.vName.ident() == "result": + ## XXX: could maybe consider to emit a CT warning that `result` shadows the implicit + ## result variable + echo "[WARNING] ", fnName, " has a custom `result` variable, which shadows the implicit `result`." + return true + of gpuBlock: # need to look at `gpuBlock` from top level, because variables are defined in a block + result = result or hasCustomResult(ch) + else: + discard + + proc lastIsReturn(n: GpuAst): bool = + doAssert n.kind == gpuBlock + if n.statements[^1].kind == gpuReturn: return true + + if not hasCustomResult(ast) and not lastIsReturn(ast): + # insert `gpuVar` as the *first* statement + let resId = GpuAst(kind: gpuIdent, iName: "result", + iSym: "result", + iTyp: retType, + symbolKind: gsLocal) + let res = GpuAst(kind: gpuVar, vName: resId, + vType: retType, + vInit: GpuAst(kind: gpuVoid), # no initialization + vRequiresMemcpy: false, + vMutable: true) + ast.statements.insert(res, 0) + # NOTE: The compiler rewrites expressions at the end of a `proc` into + # an assignment to `block: result = ` for us. + if not lastIsReturn(ast): + # insert `return result` + ast.statements.add GpuAst(kind: gpuReturn, rValue: resId) + +proc fnReturnsValue(ctx: GpuContext, fn: GpuAst): bool = + ## Returns true if the given `fn` (gpuIdent) returns a value. + ## The function can either be: + ## - an inbuilt function + ## - a generic instantiation + ## - contained in `allFnTab` + if fn in ctx.allFnTab: + result = ctx.allFnTab[fn].pRetType.kind != gtVoid + elif fn in ctx.genericInsts: + result = ctx.genericInsts[fn].pRetType.kind != gtVoid + elif fn in ctx.builtins: + result = ctx.builtins[fn].pRetType.kind != gtVoid + elif fn in ctx.processedProcs: + result = ctx.processedProcs[fn].retType.kind != gtVoid + else: + raiseAssert "The function: " & $fn & " is not known anywhere." + proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## XXX: things still left to do: ## - support `result` variable? Currently not supported. Maybe we will won't @@ -353,9 +831,18 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = blockLabel: blockLabel) for i in 1 ..< node.len: # index 0 is the block label result.statements.add ctx.toGpuAst(node[i]) + of nnkBlockExpr: + ## XXX: For CUDA just a block? + let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal + elif node[0].kind == nnkEmpty: "" + else: raiseAssert "Unexpected node in block label field: " & $node.treerepr + result = GpuAst(kind: gpuBlock, blockLabel: blockLabel, isExpr: true) + for el in node: + if el.kind != nnkEmpty: + result.statements.add ctx.toGpuAst(el) of nnkStmtListExpr: # for statements that return a value. ## XXX: For CUDA just a block? - result = GpuAst(kind: gpuBlock) + result = GpuAst(kind: gpuBlock, isExpr: true) for el in node: if el.kind != nnkEmpty: result.statements.add ctx.toGpuAst(el) @@ -364,42 +851,43 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = ctx.toGpuAst(node[0]) of nnkProcDef, nnkFuncDef: - result = GpuAst(kind: gpuProc) - result.pName = ctx.toGpuAst(node.name) - result.pName.symbolKind = gsProc ## This is a procedure identifier - doAssert node[3].kind == nnkFormalParams - result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type - # Process pragmas - if node.pragma.kind != nnkEmpty: - doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" - result.pAttributes = collectProcAttributes(node.pragma) - if result.pAttributes.len == 0: # means `nimonly` was applied + # if it is a _generic_ function, we don't actually process it here. instead we add it to + # the `generics` set. When we encounter a `gpuCall` we will then check if the function + # being called is part of the generic set and look up its _instantiated_ implementation + # to parse it. The parsed generics are stored in the `genericInsts` table. + let name = ctx.getFnName(node.name) + if node[2].kind == nnkGenericParams: # is a generic + ctx.generics.incl name.iName # need to use raw name, *not* symbol + result = GpuAst(kind: gpuVoid) + elif node.body.kind == nnkEmpty: # just a forward declaration + result = GpuAst(kind: gpuVoid) + else: + result = GpuAst(kind: gpuProc) + result.pName = name + result.pName.symbolKind = gsProc ## This is a procedure identifier + let params = node[3] + doAssert params.kind == nnkFormalParams + result.pRetType = ctx.parseProcReturnType(params) + if result.pRetType.kind == gtInvalid: + ctx.generics.incl name.iName # need to use raw name, *not* symbol return GpuAst(kind: gpuVoid) - # Process parameters - for i in 1 ..< node[3].len: - let param = node[3][i] - let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: - let typIdx = param.len - 2 # second to last is the type - # IdentDefs - # Ident "x" - # Ident "y" - # Ident "res" - # PtrTy - # Ident "float32" # `param.len - 2` - # Empty # `param.len - 1` - let paramType = nimToGpuType(param[typIdx]) - #echo "Argument: ", param.treerepr, " has tpye: ", paramType - for i in 0 ..< numParams: - var p = ctx.toGpuAst(param[i]) - let symKind = if attGlobal in result.pAttributes: gsGlobalKernelParam - else: gsDeviceKernelParam - p.iTyp = paramType ## Update the type of the symbol - p.symbolKind = symKind ## and the symbol kind - let param = GpuParam(ident: p, typ: paramType) - result.pParams.add(param) - - result.pBody = ctx.toGpuAst(node.body) - .ensureBlock() # single line procs should be a block to generate `;` + + # Process pragmas + if node.pragma.kind != nnkEmpty: + doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" + result.pAttributes = collectProcAttributes(node.pragma) + if result.pAttributes.len == 0: # means `nimonly` was applied / is a `builtin` + ctx.builtins[name] = result # store in builtins, so that we know if it returns a value when called + return GpuAst(kind: gpuVoid) + # Process parameters + result.pParams = ctx.parseProcParameters(params, result.pAttributes) + result.pBody = ctx.toGpuAst(node.body) + .ensureBlock() # single line procs should be a block to generate `;` + result.pBody.maybeInsertResult(result.pRetType, result.pName.ident()) + + # Add to table of known functions + if result.pName notin ctx.allFnTab: + ctx.allFnTab[result.pName] = result of nnkLetSection, nnkVarSection: # For a section with multiple declarations, create a block @@ -426,7 +914,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert declaration[0][1].kind == nnkPragma varNode.vAttributes = collectAttributes(declaration[0][1]) else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr - varNode.vType = nimToGpuType(declaration) + varNode.vType = gpuTypeMaybeFromSymbol(declaration, declaration[0]) + ctx.maybeAddType(varNode.vType) varNode.vName.iTyp = varNode.vType # also store the type in the symbol, for easier lookup later # This is a *local* variable (i.e. `function` address space on WGSL) unless it is # annotated with `{.shared.}` (-> `workspace` in WGSL) @@ -489,6 +978,7 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = ## NOTE: Currently we process templates, but we expect them to be already ## expanded by the Nim compiler. Thus we could in theory expand them manually ## but fortunately we don't need to. + return GpuAst(kind: gpuVoid) let tName = node[0].strVal # Extract parameters @@ -508,8 +998,14 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result = GpuAst(kind: gpuVoid) of nnkCall, nnkCommand: - # Check if this is a template call + # `name` below is name + signature hash. Check if this is a generic based on node repr let name = ctx.getFnName(node[0]) # cannot use `strVal`, might be a symchoice + if node[0].repr in ctx.generics or name notin ctx.allFnTab: + # process the generic instantiaton and store *or* pull in a proc defined outside + # the `cuda` macro by its implementation. + ## XXX: for CUDA backend need to annotate all pulled in procs with `{.device.}`! + ctx.addProcToGenericInsts(node, name) + let args = node[1..^1].mapIt(ctx.toGpuAst(it)) # Producing a template call something like this (but problematic due to overloads etc) # we could then perform manual replacement of the template in the CUDA generation pass. @@ -518,26 +1014,60 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.tcName = name result.tcArgs = args else: - result = GpuAst(kind: gpuCall) + let fnIsExpr = ctx.fnReturnsValue(name) + result = GpuAst(kind: gpuCall, cIsExpr: fnIsExpr) result.cName = name result.cArgs = args of nnkInfix: result = GpuAst(kind: gpuBinOp) - # if left/right is boolean we need logical AND/OR, otherwise - # bitwise - let isBoolean = node[1].typeKind == ntyBool - result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name - result.bLeft = ctx.toGpuAst(node[1]) - result.bRight = ctx.toGpuAst(node[2]) - # We patch the types of int / float literals. WGSL does not automatically convert literals - # to the target type. - if result.bLeft.kind == gpuLit and result.bRight.kind != gpuLit: - # determine literal type based on `bRight` - result.bLeft.lType = nimToGpuType(node[2]) - elif result.bRight.kind == gpuLit and result.bLeft.kind != gpuLit: - # determine literal type based on `bLeft` - result.bRight.lType = nimToGpuType(node[1]) + # Using `getType` to get the types of the arguuments + let typ = node[0].getTypeImpl() # e.g. + doAssert typ.kind == nnkProcTy, "Infix node is not a proc but: " & $typ.treerepr + # BracketExpr + # Sym "proc" + # Sym "int" <- return type + # Sym "int" <- left op type + # Sym "int" <- right op type + result.bLeftTyp = nimToGpuType(typ[0][1]) + result.bRightTyp = nimToGpuType(typ[0][2]) + # if either is not a base type (`gtBool .. gtSize_t`) we actually deal with a _function call_ + # instead of an binary operation. Will thus rewrite. + proc ofBasicType(t: GpuType, allowPtrLhs: bool): bool = + ## Determines if the given type is a basic POD type *or* a simple pointer to it. + ## This is because some infix nodes, e.g. `x += y` will have LHS arguments that are + ## `var T`, which appear as an implicit pointer here. + ## + ## TODO: Handle the case of backend inbuilt special types (like `vec3`), which may indeed + ## have inbuilt infix operators. Either by checking if the type has a `{.builtin.}` pragma + ## _or_ if there is a wrapped proc for this operator and if so do not rewrite as `gpuCall` + ## if that exists. + result = (t.kind in gtBool .. gtSize_t) + if allowPtrLhs: + result = result or ((t.kind == gtPtr) and t.implicit and t.to.kind in gtBool .. gtSize_t) + + if not result.bLeftTyp.ofBasicType(true) or not result.bRightTyp.ofBasicType(false): + result = GpuAst(kind: gpuCall) + result.cName = ctx.getFnName(node[0]) + result.cArgs = @[ctx.toGpuAst(node[1]), ctx.toGpuAst(node[2])] + else: + # if left/right is boolean we need logical AND/OR, otherwise bitwise + let isBoolean = result.bLeftTyp.kind == gtBool + var op = GpuAst(kind: gpuIdent, iName: assignOp(node[0].repr, isBoolean)) # repr so that open sym choice gets correct name + op.iSym = op.iName + result.bOp = op + result.bLeft = ctx.toGpuAst(node[1]) + result.bRight = ctx.toGpuAst(node[2]) + + # We patch the types of int / float literals. WGSL does not automatically convert literals + # to the target type. Determining the type here _can_ fail. In that case the + # `lType` field will just be `gtVoid`, like the default. + if result.bLeft.kind == gpuLit: # and result.bRight.kind != gpuLit: + # determine literal type based on `bRight` + result.bLeft.lType = result.bLeftTyp # nimToGpuType(node[2], allowToFail = true) + elif result.bRight.kind == gpuLit: # and result.bLeft.kind != gpuLit: + # determine literal type based on `bLeft` + result.bRight.lType = result.bRightTyp #nimToGpuType(node[1], allowToFail = true) of nnkDotExpr: ## NOTE: As we use a typed macro, we only encounter `DotExpr` for *actual* field accesses and NOT @@ -547,9 +1077,22 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = result.dField = ctx.toGpuAst(node[1]) of nnkBracketExpr: - result = GpuAst(kind: gpuIndex) - result.iArr = ctx.toGpuAst(node[0]) - result.iIndex = ctx.toGpuAst(node[1]) + case node[0].typeKind + of ntyTuple: + # need to replace `[idx]` by field access + let typ = nimToGpuType(node[0].getTypeImpl) + ctx.maybeAddType(typ) + #doAssert typ in ctx.types + doAssert node[1].kind == nnkIntLit + let idx = node[1].intVal + let field = typ.oFields[idx].name + result = GpuAst(kind: gpuDot, + dParent: ctx.toGpuAst(node[0]), + dField: ctx.toGpuAst(ident(field))) + else: + result = GpuAst(kind: gpuIndex) + result.iArr = ctx.toGpuAst(node[0]) + result.iIndex = ctx.toGpuAst(node[1]) of nnkIdent, nnkOpenSymChoice: result = newGpuIdent() @@ -560,7 +1103,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkSym: let s = node.repr & "_" & node.signatureHash() # NOTE: The reason we have a tab of known symbols is not to keep the same _reference_ to each - # symbol, but rather to allow having the same symbol kind (set in the caller of this call). + # symbol, but rather to allow having the same symbol kind and appropriate type for each + # symbol `GpuAst` (of kind `gpuIdent`), which is set in the caller of this call. # For example in `nnkCall` nodes returning the value from the table automatically means the # `symbolKind` is local / function argument etc. if s notin ctx.sigTab: @@ -570,7 +1114,11 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = if result.iName == "_": result.iName = "tmp_" & $ctx.genSymCount inc ctx.genSymCount - #ctx.sigTab[s] = result + elif result.iName.startsWith("tmpTuple_"): # will have a Nim gensym'd suffix, replace by custom counter + result.iName = "tmpTuple_" & $ctx.genSymCount + result.iSym = result.iName & "_" & node.signatureHash() # and update the iSym to not be based on Nim's value either + inc ctx.genSymCount + ctx.sigTab[s] = result else: result = ctx.sigTab[s] @@ -635,28 +1183,90 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkTypeDef result.statements.add ctx.toGpuAst(el) of nnkTypeDef: - result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) - result.tFields = parseTypeFields(node[2]) + doAssert node.len == 3, "TypeDef node does not have 3 children: " & $node.len + let name = ctx.toGpuAst(node[0]) + if node[1].kind == nnkGenericParams: # if this is a generic, only store existence of it + # will store the instantiatons in `nnkObjConstr` + result = GpuAst(kind: gpuVoid) + else: + let typ = nimToGpuType(node[0]) + case node[2].kind + of nnkObjectTy: # regular `type foo = object` + result = GpuAst(kind: gpuTypeDef, tTyp: typ) + result.tFields = parseTypeFields(node[2]) + of nnkSym: # a type alias `type foo = bar` + result = GpuAst(kind: gpuAlias, aTyp: typ, + aTo: ctx.toGpuAst(node[2])) + else: + raiseAssert "Unexpected node kind in TypeDef: " & $node[2].kind + + # include this the set of known types to not generate duplicates + ctx.types[typ] = result + # Reset the type we return to void. We now generate _all_ types from the + # `types`. + result = GpuAst(kind: gpuVoid) of nnkObjConstr: - let typName = getTypeName(node) - result = GpuAst(kind: gpuObjConstr, ocName: typName) + ## this should never see `genericParam` I think + let typ = nimToGpuType(node) + ctx.maybeAddType(typ) + result = GpuAst(kind: gpuObjConstr, ocType: typ) # get all fields of the type - let flds = node[0].getTypeImpl.parseTypeFields() # sym + let flds = if typ.kind == gtObject: typ.oFields + elif typ.kind == gtGenericInst: typ.gFields + else: raiseAssert "ObjConstr must have an object type: " & $typ # find all fields that have been defined by the user var ocFields: seq[GpuFieldInit] for i in 1 ..< node.len: # all fields to be init'd doAssert node[i].kind == nnkExprColonExpr ocFields.add GpuFieldInit(name: node[i][0].strVal, - value: ctx.toGpuAst(node[i][1])) + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + # now add fields in order of the type declaration for i in 0 ..< flds.len: let idx = findIdx(ocFields, flds[i].name) if idx >= 0: - result.ocFields.add ocFields[idx] + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f else: let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) result.ocFields.add GpuFieldInit(name: flds[i].name, - value: dfl) + value: dfl, + typ: flds[i].typ) + of nnkTupleConstr: + let typ = nimToGpuType(node) + ctx.maybeAddType(typ) + + result = GpuAst(kind: gpuObjConstr, ocType: typ) + # get all fields of the type + let flds = typ.oFields + # find all fields that have been defined by the user + var ocFields: seq[GpuFieldInit] + for i in 0 ..< node.len: # all fields to be init'd + case node[i].kind + of nnkExprColonExpr: + ocFields.add GpuFieldInit(name: node[i][0].strVal, + value: ctx.toGpuAst(node[i][1]), + typ: GpuType(kind: gtVoid)) + else: + ocFields.add GpuFieldInit(name: "Field" & $i, + value: ctx.toGpuAst(node[i]), + typ: GpuType(kind: gtVoid)) + + # now add fields in order of the type declaration + for i in 0 ..< flds.len: + let idx = findIdx(ocFields, flds[i].name) + if idx >= 0: + var f = ocFields[idx] + f.typ = flds[i].typ + result.ocFields.add f + else: + let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) + result.ocFields.add GpuFieldInit(name: flds[i].name, + value: dfl, + typ: flds[i].typ) + of nnkAsmStmt: doAssert node.len == 2 @@ -666,10 +1276,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = of nnkBracket: let aLitTyp = nimToGpuType(node[0]) - var aValues = newSeq[string]() + var aValues = newSeq[GpuAst]() for el in node: - ## XXX: Support not just int literals - aValues.add $el.intVal + aValues.add ctx.toGpuAst(el) result = GpuAst(kind: gpuArrayLit, aValues: aValues, aLitType: aLitTyp) @@ -691,27 +1300,9 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = # `HiddenAddr` appears for accesses to `var` passed arguments result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0])) - of nnkHiddenDeref: - case node.typeKind - of ntyUncheckedArray: - # `getTypeInst(node)` would yield: - # BracketExpr - # Sym "UncheckedArray" - # Sym "uint32" - # i.e. it is a `ptr UncheckedArray[T]` - # In this case we just ignore the deref, because on the CUDA - # side it is just a plain pointer array we index into using - # `foo[i]`. - result = ctx.toGpuAst(node[0]) - else: - # Otherwise we treat it like a regular deref - # HiddenDeref - # Sym "x" - # With e.g. `getTypeInst(node) = Sym "BigInt"` - # and `node.typeKind = ntyObject` - # due to a `var` parameter - result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) - of nnkDerefExpr: #, nnkHiddenDeref: + of nnkDerefExpr, nnkHiddenDeref: + # treat hidden and regular deref the same nowadays. On some backends may strip derefs, if + # they appear e.g. in an `gpuIndex` (CUDA) result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) of nnkConstDef: @@ -730,6 +1321,8 @@ proc toGpuAst*(ctx: var GpuContext, node: NimNode): GpuAst = doAssert el.kind == nnkConstDef result.statements.add ctx.toGpuAst(el) + of nnkWhenStmt: + raiseAssert "We shouldn't be seeing a `when` statement after sem check of the Nim code." else: echo "Unhandled node kind in toGpuAst: ", node.kind raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim index dcfde9c1..95928e2b 100644 --- a/constantine/math_compiler/experimental/runtime_compile.nim +++ b/constantine/math_compiler/experimental/runtime_compile.nim @@ -86,15 +86,17 @@ proc log*(nvrtc: var NVRTC) = proc compile*(nvrtc: var NVRTC) = # Compile the program with fmad disabled. # Note: Can specify GPU target architecture explicitly with '-arch' flag. - const - Options = [ - cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is - # "--fmad=false", # and whatever other options for example - ] - - NumberOfOptions = cint Options.len - let compileResult = nvrtcCompileProgram(nvrtc.prog, NumberOfOptions, - cast[cstringArray](addr Options[0])) + var options = @[ + cstring "--gpu-architecture=compute_75", # or whatever your GPU arch is + # "--fmad=false", # and whatever other options for example + ] + when defined(debugCuda): + options.add cstring "--device-debug" # Equivalent to -g + options.add cstring "--generate-line-info" # Equivalent to -lineinfo + + let numberOfOptions = cint options.len + let compileResult = nvrtcCompileProgram(nvrtc.prog, numberOfOptions, + cast[cstringArray](addr options[0])) nvrtc.log() ## XXX: only in `DebugCuda`? diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim index 8e22f6ef..cfc8d2c9 100644 --- a/constantine/platforms/abis/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -855,11 +855,11 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi {.pop.} proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult {.v2.} +proc cuCtxDestroy*(ctx: CUcontext): CUresult {.v2.} proc cuCtxSynchronize*(ctx: CUcontext): CUresult {.v2.} {.push noconv, importc, dynlib: libCuda.} -proc cuCtxDestroy*(ctx: CUcontext): CUresult proc cuCtxSynchronize*(): CUresult proc cuCtxGetCurrent*(ctx: var CUcontext): CUresult proc cuCtxSetCurrent*(ctx: CUcontext): CUresult @@ -875,7 +875,6 @@ proc cuModuleUnload*(module: CUmodule): CUresult proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.} proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult -proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult proc cuLaunchKernel*( kernel: CUfunction, @@ -889,6 +888,8 @@ proc cuLaunchKernel*( {.pop.} # {.push noconv, importc, dynlib: "libcuda.so"..} +proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult {.v2.} + proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult {.v2.} proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult {.v1.} proc cuMemFree*(devptr: CUdeviceptr): CUresult {.v2.}