diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 797bd44c47..4c647a3ab4 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -336,14 +336,14 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list - live = self._live + live = self._live or self.is_input or self.is_output queue = set(self.aliases) visited = set(self.name) while len(queue) > 0: next = queue.pop() buffNext = ctxt.lookup(next) assert isinstance(buffNext, VariableBuffer) - live |= buffNext._live + live |= buffNext._live or buffNext.is_input or buffNext.is_output visited.add(next) queue |= buffNext.aliases - visited return live diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..7ead6556b7 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -492,6 +492,22 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) +class InPlaceAccumulatorV2Layer(ONNXLayer): + """Layer for ORT InPlaceAccumulatorV2 operator (com.microsoft). + + Gradient accumulation with optional reset: + if lazy_reset_grad: out = gradient + else: out = buffer + gradient + """ + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # One conditional check + one element-wise op (copy or add) per element + return self.mapper.parser.operatorRepresentation['size'] + + class LinearAttentionLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..385eb03dff 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -2611,15 +2611,18 @@ def parseNodeCtxt(self, class SoftmaxCrossEntropyLossParser(NodeParser): + """SoftmaxCrossEntropyLoss parser. 
+ + The canonical form has two outputs: a scalar mean cross-entropy loss and + a per-sample log_prob tensor, matching the signature emitted by ONNX + Runtime when exporting training graphs. + """ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - - return ret + return all([len(node.inputs) == 2, len(node.outputs) == 2]) def parseNodeCtxt(self, ctxt: NetworkContext, @@ -2628,9 +2631,13 @@ def parseNodeCtxt(self, logits = ctxt.lookup(node.inputs[0].name) labels = ctxt.lookup(node.inputs[1].name) - log_prob = ctxt.lookup(node.outputs[0].name) + # outputs[0] = loss (0-d scalar, shape [1] after Deeploy normalisation) + # outputs[1] = log_prob tensor + loss = ctxt.lookup(node.outputs[0].name) + log_prob = ctxt.lookup(node.outputs[1].name) self.operatorRepresentation['logits'] = logits.name self.operatorRepresentation['labels'] = labels.name + self.operatorRepresentation['loss'] = loss.name self.operatorRepresentation['log_prob'] = log_prob.name self.operatorRepresentation['batch'] = logits.shape[0] self.operatorRepresentation['num_classes'] = logits.shape[1] @@ -2697,6 +2704,48 @@ def parseNodeCtxt(self, return ctxt, True +class InPlaceAccumulatorV2Parser(NodeParser): + """Parser for ORT InPlaceAccumulatorV2 operator (com.microsoft). 
+ + Semantics: + if lazy_reset_grad: out = gradient (reset) + else: out = buffer + gradient (accumulate) + + Inputs: + 0: buffer - current accumulation buffer (float tensor) + 1: gradient - new gradient to accumulate (float tensor, same shape) + 2: lazy_reset_grad - reset flag; if true, overwrite; else add (bool[1]) + + Output: + 0: output_buffer - updated accumulation buffer (float tensor) + """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # Require exactly 3 inputs (buffer, gradient, lazy_reset_grad) and 1 output + return len(node.inputs) == 3 and len(node.outputs) == 1 + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + buffer = ctxt.lookup(node.inputs[0].name) + gradient = ctxt.lookup(node.inputs[1].name) + lazy_reset_grad = ctxt.lookup(node.inputs[2].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['accum_buffer'] = buffer.name + self.operatorRepresentation['gradient'] = gradient.name + self.operatorRepresentation['lazy_reset_grad'] = lazy_reset_grad.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['size'] = int(np.prod(buffer.shape)) + + return ctxt, True + + class BatchNormParser(NodeParser): def __init__(self): diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..85453563c3 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -577,11 +577,11 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[ def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return [2**(self.input_types[0].referencedType.typeWidth)] + return [2**(self.input_types[0].referencedType.typeWidth)] * len(self.output_types) def _inferSignedness(self, inputs: 
List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: - return [False] + return [False] * len(self.output_types) class SGDChecker(SignPropTypeChecker): @@ -598,6 +598,32 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [True] +class InPlaceAccumulatorV2Checker(SignPropTypeChecker): + """Type checker for ORT InPlaceAccumulatorV2 operator (com.microsoft). + + Inputs: + 0: buffer (float32*) + 1: gradient (float32*) + 2: lazy_reset_grad (uint8_t* or bool* - 1 element) + + Output: + 0: output_buffer (float32*) + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # Output has same precision as the buffer input (float32) + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # Float32 output is signed + return [True] + + class BatchNormChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 5d7b02ae62..06674a7498 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -18,9 +18,9 @@ from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ - GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, 
RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker + GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, InPlaceAccumulatorV2Checker, LayerNormChecker, \ + MatMulChecker, MulChecker, QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, \ + RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -29,11 +29,12 @@ from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ - MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ - RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate + FloatGELUTemplate, FloatGemmTemplate, FloatInPlaceAccumulatorV2Template, FloatLayernormTemplate, \ + FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ + FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ + SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, 
TransposeTemplate, UniformRequantShiftTemplate, \ + iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -353,7 +354,8 @@ PULPSoftmaxCrossEntropyLossBindings = [ NodeBinding( - SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), + SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t), PointerClass(float32_t)]), SoftmaxCrossEntropyLossTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes ] @@ -368,6 +370,14 @@ SGDTemplate.referenceTemplate, ForkTransformer) ] +PULPInPlaceAccumulatorV2Bindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(uint8_t)], [PointerClass(float32_t)]), FloatInPlaceAccumulatorV2Template.referenceTemplate, + ForkTransformer) +] + PULPTransposeBindings = [ NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 7456dd9e1b..2413942869 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -14,17 +14,17 @@ from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ - GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ - ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ - RQSiHardswishLayer, 
SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer + GEMMLayer, InPlaceAccumulatorV2Layer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, \ + PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, \ + RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, \ + SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \ - MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \ - SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \ - SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \ - iSoftmaxParser + GELUGradParser, GELUParser, GEMMParser, InPlaceAccumulatorV2Parser, LayerNormGradParser, LayerNormParser, \ + MatMulParser, MaxPool1DParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, \ + ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \ + RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \ + SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \ + iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from 
Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -39,17 +39,17 @@ from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ - PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ - PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ - PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ - PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \ - PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ - PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ - PULPUniformRQSTilingReadyBindings + PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPInPlaceAccumulatorV2TilingReadyBindings, \ + PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, \ + PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, \ + PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, 
PULPReduceMeanTilingReadyBindings, \ + PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ + PULPRQSConv1DTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, \ + PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \ + PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ + PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -108,6 +108,7 @@ SoftmaxCrossEntropyLossGradMapper = NodeMapper(SoftmaxCrossEntropyLossGradParser(), PULPSoftmaxCrossEntropyGradTilingReadyBindings) SGDMapper = NodeMapper(SGDParser(), PULPSGDTilingReadyBindings) +InPlaceAccumulatorV2Mapper = NodeMapper(InPlaceAccumulatorV2Parser(), PULPInPlaceAccumulatorV2TilingReadyBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) @@ -151,7 +152,8 @@ 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), - 'SGD': SGDLayer([SGDMapper]) + 'SGD': SGDLayer([SGDMapper]), + 'InPlaceAccumulatorV2': InPlaceAccumulatorV2Layer([InPlaceAccumulatorV2Mapper]), } diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..ef046f191d 100644 
--- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,7 +4,8 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation @@ -19,7 +20,7 @@ def alignToContext(self, ctxt: NetworkContext, if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None: # No bias case - set C to NULL and provide a default type operatorRepresentation['C'] = None - operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type + operatorRepresentation['C_type'] = PointerClass(float32_t) # Default to fp32 type operatorRepresentation['C_batched'] = False return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py new file mode 100644 index 0000000000..f7864c7261 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PULPInPlaceAccumulatorV2Template(NodeTemplate): + """True in-place InPlaceAccumulatorV2 template for PULP. + + Writes the accumulation result into ``accum_buffer`` (the graph input). + ``data_out`` is registered as an alias of ``accum_buffer`` so the memory + allocator knows they share memory and will not free ``accum_buffer`` + prematurely. 
+ + ``data_out`` is intentionally *not* written by the emitted C code: + + - InPlaceAccumulatorV2 is terminal in the training graph — no downstream + kernel consumes ``data_out``; it only exists as a symbolic output so + the graph stays well-formed. + - In the tiled path, emitting a write to ``data_out`` would also make + Deeploy generate an L2 egress DMA for it, and ``data_out``'s L2 slot + may overlap with other live buffers, corrupting L2. + + Semantics: + if lazy_reset_grad: accum_buffer = gradient (reset) + else: accum_buffer += gradient (accumulate) + """ + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + accum_buffer = ctxt.lookup(operatorRepresentation['accum_buffer']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + accum_buffer.aliases.add(data_out.name) + data_out.aliases.add(accum_buffer.name) + data_out._alias = accum_buffer.name + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPInPlaceAccumulatorV2Template(""" +// InPlaceAccumulatorV2 (Name: ${nodeName}, Op: ${nodeOp}) +// Writes result into accum_buffer (in-place). data_out is an alias of +// accum_buffer and is deliberately not written — it has no downstream +// consumer, and emitting a write would trigger an L2 egress DMA whose +// destination may overlap with live buffers in the tiled path. 
+// Reset (lazy_reset_grad=1): accum_buffer = gradient +// Accum (lazy_reset_grad=0): accum_buffer += gradient +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (int32_t)${size}); +int32_t ${nodeName}_stop = MIN(${nodeName}_start + ${nodeName}_chunk, (int32_t)${size}); + +if (${lazy_reset_grad}[0]) { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] = ${gradient}[i]; + } +} else { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] += ${gradient}[i]; + } +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py index 1592fe30c4..d31d2c2797 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py @@ -2,9 +2,43 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from typing import List, Tuple -referenceTemplate = NodeTemplate(""" +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PULPSGDTemplate(NodeTemplate): + """In-place SGD template for PULP. + + weight_updated is aliased to weight so the memory allocator places them + at the same address in whichever memory level weight lives in (L2 or L3). + This ensures the tiled egress DMA writes the updated weight back to + weight's buffer — the same buffer the training network reads from on the + next forward pass. 
+ """ + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + weight = ctxt.lookup(operatorRepresentation['weight']) + weight_updated = ctxt.lookup(operatorRepresentation['weight_updated']) + + weight.aliases.add(weight_updated.name) + weight_updated.aliases.add(weight.name) + weight_updated._alias = weight.name + + # Make weight_updated share weight's allocation (no separate malloc), + # regardless of which memory level (L2 or L3) weight is placed in. + # The egress DMA then writes updated weights back to weight's address. + weight_updated.allocTemplate = NodeTemplate(" ${name} = (${type.typeName}) " + str(weight._instance) + ";") + weight_updated.deallocTemplate = NodeTemplate("") + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPSGDTemplate(""" // SGD Weight Update with Separated Multiplication and Subtraction Unrolling // (Name: ${nodeName}, Op: ${nodeOp}) int8_t ${nodeName}_core_id = pi_core_id(); @@ -46,4 +80,4 @@ float32_t temp_grad = learning_rate * ref_${grad}[i]; ref_${weight_updated}[i] = ref_${weight}[i] - temp_grad; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py index c1aefe01a3..914a18c3ed 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py @@ -4,27 +4,29 @@ from Deeploy.DeeployTypes import NodeTemplate +# Canonical SoftmaxCrossEntropyLoss: emits both a scalar mean loss and the +# per-sample log_prob tensor. 
referenceTemplate = NodeTemplate(""" BEGIN_SINGLE_CORE // SoftmaxCrossEntropyLoss (Name: ${nodeName}, Op: ${nodeOp}) + float32_t sce_total_loss = 0.0f; for (uint32_t i = 0; i < ${batch}; i++) { - float max_logit = ${logits}[i * ${num_classes} + 0]; + float32_t sce_max_logit = ${logits}[i * ${num_classes}]; for (uint32_t j = 1; j < ${num_classes}; j++) { - if (${logits}[i * ${num_classes} + j] > max_logit) { - max_logit = ${logits}[i * ${num_classes} + j]; - } - } - - float32_t sum_exp = 0.0f; - for (uint32_t j = 0; j < ${num_classes}; j++) { - sum_exp += expf(${logits}[i * ${num_classes} + j] - max_logit); - } - - for (uint32_t j = 0; j < ${num_classes}; j++) { - // log_prob = logit - max_logit - log(sum_exp) - ${log_prob}[i * ${num_classes} + j] = ${logits}[i * ${num_classes} + j] - max_logit - logf(sum_exp); + if (${logits}[i * ${num_classes} + j] > sce_max_logit) + sce_max_logit = ${logits}[i * ${num_classes} + j]; } + float32_t sce_sum_exp = 0.0f; + for (uint32_t j = 0; j < ${num_classes}; j++) + sce_sum_exp += expf(${logits}[i * ${num_classes} + j] - sce_max_logit); + float32_t sce_log_sum_exp = logf(sce_sum_exp); + for (uint32_t j = 0; j < ${num_classes}; j++) + ${log_prob}[i * ${num_classes} + j] = + ${logits}[i * ${num_classes} + j] - sce_max_logit - sce_log_sum_exp; + sce_total_loss += -(${logits}[i * ${num_classes} + (uint32_t)(${labels}[i])] + - sce_max_logit - sce_log_sum_exp); } + ${loss}[0] = sce_total_loss / (float32_t)${batch}; END_SINGLE_CORE """) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py new file mode 100644 index 0000000000..fb2b4bde78 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as 
np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class InPlaceAccumulatorV2TileConstraint(BOPTileConstraint): + """Tile constraint for InPlaceAccumulatorV2. + + Tiles accum_buffer and gradient together (same shape); lazy_reset_grad + is a scalar (1 element) and is not tiled. + """ + + dataIn1Name = 'accum_buffer' + dataIn2Name = 'gradient' + dataOutName = 'data_out' + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = super().addGeometricalConstraint(tilerModel, parseDict, ctxt) + + # lazy_reset_grad is a scalar flag — pin full size so it is not tiled. 
+ lazyResetName = parseDict['lazy_reset_grad'] + tilerModel.addTensorDimToModel(ctxt, lazyResetName) + shape = ctxt.lookup(lazyResetName).shape + dims = [shape] if isinstance(shape, int) else shape + for idx, dim in enumerate(dims): + dimVar = tilerModel.getTensorDimVar(lazyResetName, idx) + tilerModel.addConstraint(dimVar == dim) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName, 'lazy_reset_grad'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + outputBaseOffsets[cls.dataOutName] = inputBaseOffsets[cls.dataIn1Name] + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + lazyResetShape = ctxt.lookup(operatorRepresentation['lazy_reset_grad']).shape + lazyResetDims = (lazyResetShape,) if isinstance(lazyResetShape, int) else tuple(lazyResetShape) + lazyResetCube = HyperRectangle((0,) * len(lazyResetDims), lazyResetDims) + + inputLoadSchedule = [{ + cls.dataIn1Name: cube, + cls.dataIn2Name: cube, + 'lazy_reset_grad': lazyResetCube, + } for cube in outputCubes] + outputLoadSchedule = [{cls.dataOutName: out} for out in outputCubes] + + for cube in outputCubes: + replacements["size"].append(int(np.prod(cube.dims))) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py 
b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py index 38c984de63..78957136e5 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy from typing import Dict, List, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar @@ -17,10 +18,18 @@ class SoftmaxCrossEntropyTileConstraint(TileConstraint): + """TileConstraint for SoftmaxCrossEntropyLoss (2 outputs: loss + log_prob). + + Both batch and num_classes are pinned to their full size by + addPolicyConstraint, so SCE itself is never tiled — the sole purpose of + the wrapTilingSolution override is to bypass the base-class single-output + assertion and carry the scalar loss buffer through the DMA schedule. + """ dataIn1Name = 'logits' dataIn2Name = 'labels' dataOutName = 'log_prob' + dataLossName = 'loss' @classmethod def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -108,8 +117,53 @@ def serializeTilingSolution( return variableReplacementSchedule, tilingSchedule + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + """Override the base-class single-output wrapper. + + SoftmaxCrossEntropyLoss emits two outputs (loss + log_prob) but the + base-class wrapTilingSolution asserts exactly one. We run the base + wrapper on a log_prob-only slice of the tiling solution and then patch + the scalar loss address / rectangle back into each resulting schedule. + + Grad subclasses that do not have a scalar loss output fall straight + through to the base-class behaviour. 
+ """ + lossVar = operatorRepresentation.get(cls.dataLossName, '') + + # No scalar loss output (e.g. Grad subclass) — plain base-class path. + if not lossVar or lossVar not in tilingSolution.outputTensorMemoryConstraints: + return super().wrapTilingSolution(tilingSolution, targetMemLevel, ctxt, operatorRepresentation) + + # Log_prob-only slice of the tiling solution so the single-output + # assertion in the base class passes. + logProbVar = operatorRepresentation[cls.dataOutName] + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + logProbVar: tilingSolution.outputTensorMemoryConstraints[logProbVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Patch the scalar loss into each schedule's output list. + lossAddr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, lossVar) + if lossAddr == [None]: + return varReplacement, tilingSchedules + + lossRect = HyperRectangle((0,), (1,)) + for schedule in tilingSchedules: + schedule.outputBaseOffsets[cls.dataLossName] = lossAddr + for step in schedule.outputLoadSchedule: + step[cls.dataLossName] = lossRect + + return varReplacement, tilingSchedules + class SoftmaxCrossEntropyGradTileConstraint(SoftmaxCrossEntropyTileConstraint): dataIn1Name = 'log_prob' dataIn2Name = 'labels' dataOutName = 'grad' + dataLossName = '' # no scalar loss output — fall through to base wrapper diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 901106459e..cc9b4e0ca4 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -16,13 +16,14 @@ from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, 
PULPFloatGEMMBindings, \ - PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ - PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \ - PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \ - PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \ - PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \ - PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, \ - PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings + PULPGatherBindings, PULPiHardswishBindings, PULPInPlaceAccumulatorV2Bindings, PULPiRMSNormBindings, \ + PULPiRQSGELUBindings, PULPLayernormBinding, PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, \ + PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, \ + PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, \ + PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \ + PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ + PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \ RQConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ @@ -30,6 +31,8 @@ from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GeluTileConstraint import GeluGradTileConstraint 
from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.InPlaceAccumulatorV2TileConstraint import \ + InPlaceAccumulatorV2TileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import SoftmaxGradTileConstraint, \ iSoftmaxTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormGradTileConstraint, \ @@ -155,6 +158,9 @@ PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, tileConstraint = SGDTileConstraint()) +PULPInPlaceAccumulatorV2TilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPInPlaceAccumulatorV2Bindings, tileConstraint = InPlaceAccumulatorV2TileConstraint()) + PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, tileConstraint = SliceTileConstraint()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 2186d4d4c4..3a583fd452 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -333,7 +333,8 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, if _buffer._memoryLevel != memoryLevel: continue - if hasattr(_buffer, "_alias") and ctxt.is_global(_buffer._alias): + if hasattr(_buffer, "_alias") and ctxt.is_global( + _buffer._alias) and _buffer._alias not in blockNames: continue if hasattr(_buffer, "_alias") and _buffer._alias in blockNames: @@ -398,11 +399,24 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor environment variable to be set to the installation directory. """ + blockNames = {block.name for block in memoryMap} + + # In-place alias outputs whose target is in the same memoryMap share + # storage with the target — skip them from the MiniMalloc CSV (it + # rejects size-0 entries) and copy their addrSpace from the target + # after the solver runs. 
+ aliasBlocks = { + block.name for block in memoryMap if getattr(ctxt.lookup(block.name), "_alias", None) in blockNames + } + with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") writer.writerow(["id", "lower", "upper", "size"]) for memoryBlock in memoryMap: + if memoryBlock.name in aliasBlocks: + continue + _buffer = ctxt.lookup(memoryBlock.name) if nodeMemoryConstraint is None: _bufferSize = _buffer.size if isinstance( @@ -452,6 +466,14 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor if memoryBlock.name == row[0]: memoryBlock._addrSpace = (int(row[-1]), int(row[-1]) + int(row[-2])) + # Resolve skipped alias blocks: copy addrSpace from the alias target. + targetBlocks = {block.name: block for block in memoryMap} + for memoryBlock in memoryMap: + if memoryBlock.name not in aliasBlocks: + continue + target = targetBlocks.get(ctxt.dealiasBuffer(memoryBlock.name)) + memoryBlock._addrSpace = target._addrSpace if target is not None else (0, 0) + return memoryMap def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..3d6480d5f9 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -6,8 +6,16 @@ include_directories(${GENERATED_SOURCE}) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) -target_link_libraries(network PUBLIC deeploylib) +if(TRAINING) + add_library(training_network OBJECT ${GENERATED_SOURCE}/TrainingNetwork.c) + target_link_libraries(training_network PUBLIC deeploylib) + # Optimizer network (SGD kernel, compiled separately to allow different prefix) + add_library(optimizer_network OBJECT ${GENERATED_SOURCE}/OptimizerNetwork.c) + target_link_libraries(optimizer_network PUBLIC deeploylib) +else() + add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) + 
target_link_libraries(network PUBLIC deeploylib) +endif() if(platform STREQUAL MemPool) add_subdirectory(Platforms/MemPool) @@ -29,7 +37,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) ) if (NOT HEXLIST) - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() else() gvsoc_flags_add_files_to_hyperflash(GVSOC_HEX_HYPERFLASH_FLAGS HEXLIST) list(APPEND GVSOC_EXTRA_FLAGS ${GVSOC_HEX_HYPERFLASH_FLAGS}) @@ -37,9 +50,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) # SCHEREMO: Waive warnings # Pointer sign warnings are caused by the data width abstraction used in Deeploy. Signedness is not explicitly modelled, as this is handled by kernels - target_compile_options(network PRIVATE - -Wno-pointer-sign - ) + if(TRAINING) + target_compile_options(training_network PRIVATE -Wno-pointer-sign) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign) + endif() if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) add_subdirectory(Platforms/Siracusa) @@ -61,7 +77,12 @@ elseif(platform STREQUAL GAP9) if (NOT HEXLIST) # L2 mode: No flash/readfs files # Data lives in L2 memory only - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() message(STATUS "[Deeploy GAP9] L2 mode: No hex files found, -DNOFLASH set") message(STATUS "[Deeploy GAP9] If you expect L3 mode, ensure Python codegen created hex files in ${GENERATED_SOURCE}/hex/") else() @@ -77,5 +98,13 @@ elseif(platform STREQUAL GAP9) message(STATUS "GAPY_RUNNER_ARGS: ${GAPY_RUNNER_ARGS}") endif() + # Waive warnings in generated code + if(TRAINING) + 
target_compile_options(training_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + endif() + add_subdirectory(Platforms/GAP9) endif() diff --git a/DeeployTest/Platforms/Siracusa/CMakeLists.txt b/DeeployTest/Platforms/Siracusa/CMakeLists.txt index 45e6191490..28ac5131f2 100644 --- a/DeeployTest/Platforms/Siracusa/CMakeLists.txt +++ b/DeeployTest/Platforms/Siracusa/CMakeLists.txt @@ -1,19 +1,46 @@ # SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna -# # SPDX-License-Identifier: Apache-2.0 set(ProjectId ${TESTNAME}) -file(GLOB_RECURSE SOURCES - src/CycleCounter.c - src/deeploytest.c -) +option(TRAINING "Use training harness instead of inference harness" OFF) + +# Compile-time training parameters (override via -D on cmake command line) +set(N_TRAIN_STEPS "1" CACHE STRING "Number of optimizer steps") +set(N_ACCUM_STEPS "1" CACHE STRING "Number of mini-batches per optimizer step") +set(TRAINING_NUM_DATA_INPUTS "2" CACHE STRING "Number of data inputs per mini-batch") + +if(TRAINING) + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytraintest.c + ) + set(NETWORK_LIB training_network) +else() + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c + ) + set(NETWORK_LIB network) +endif() add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) -target_link_libraries(${ProjectId} PRIVATE network deeploylib) -target_compile_options(${ProjectId} INTERFACE network) -add_gvsoc_emulation(${ProjectId} "siracusa") +if(TRAINING) + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} optimizer_network deeploylib) +else() + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} deeploylib) +endif() +target_compile_options(${ProjectId} INTERFACE ${NETWORK_LIB}) +if(TRAINING) + 
target_compile_definitions(${ProjectId} PRIVATE + N_TRAIN_STEPS=${N_TRAIN_STEPS} + N_ACCUM_STEPS=${N_ACCUM_STEPS} + TRAINING_NUM_DATA_INPUTS=${TRAINING_NUM_DATA_INPUTS} + ) +endif() + +add_gvsoc_emulation(${ProjectId} "siracusa") link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c new file mode 100644 index 0000000000..50eb34d748 --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c @@ -0,0 +1,385 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Training harness for Siracusa — Phase 2 (with Deeploy-compiled + * OptimizerNetwork) + * + * Loop structure: + * + * InitTrainingNetwork() + * InitOptimizerNetwork() + * Connect optimizer buffers → training network's weight/grad buffers + * + * for update_step in [0, N_TRAIN_STEPS): // optimizer steps + * for accum_step in [0, N_ACCUM_STEPS): // mini-batches per update + * lazy_reset_grad = (accum_step == 0) // reset on first, + * accumulate on rest load data for this mini-batch RunTrainingNetwork() // fwd + * + bwd + InPlaceAccumulatorV2 store loss value + * // SGD weight update via Deeploy-compiled optimizer kernel: + * copy weights + grad_acc → optimizer input buffers + * RunOptimizerNetwork() + * copy weight_updated ← optimizer output buffers → training weight + * buffers + * + * Numerical verification: + * - Compare stored loss values against testLossRef[] (from testoutputs.h) + * + * Buffer layout in DeeployNetwork_inputs[] (must match ONNX input order): + * [0 .. TRAINING_NUM_DATA_INPUTS-1] data + labels (per + * mini-batch) [TRAINING_NUM_DATA_INPUTS .. + * .. TRAINING_GRAD_BUF_START_IDX-1] weights (persistent) + * [TRAINING_GRAD_BUF_START_IDX .. + * .. 
+TRAINING_NUM_GRAD_INPUTS-1] grad accumulation bufs
+ * (persistent) [DeeployNetwork_num_inputs-1] lazy_reset_grad
+ * uint8
+ *
+ * Optimizer buffer layout in DeeployOptNetwork_inputs[] (interleaved pairs):
+ * [2*i] weight_i (copied from
+ * DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i]) [2*i+1] grad_acc_i (copied
+ * from DeeployNetwork_inputs[TRAINING_GRAD_BUF_START_IDX+i])
+ * DeeployOptNetwork_outputs[i] = weight_i_updated
+ * → copied back to DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i]
+ *
+ * Compile-time constants (emitted by code generator into testinputs.h):
+ * N_TRAIN_STEPS number of optimizer (weight-update) steps
+ * N_ACCUM_STEPS number of mini-batches accumulated per update
+ * TRAINING_NUM_DATA_INPUTS inputs that change each mini-batch (data +
+ * labels) TRAINING_GRAD_BUF_START_IDX first grad acc buffer index in
+ * DeeployNetwork_inputs[] TRAINING_NUM_GRAD_INPUTS number of grad
+ * accumulation buffers (== number of weights) TRAINING_NUM_WEIGHT_INPUTS number
+ * of trainable weight buffers TRAINING_LEARNING_RATE SGD learning rate (for
+ * reference — embedded in optimizer ONNX)
+ *
+ * Reference comparison constants (emitted into testoutputs.h):
+ * N_LOSS_REFS number of reference loss values
+ * NUM_WEIGHT_REFS number of reference weight tensors
+ * TRAINING_TOLERANCE_ABS absolute comparison tolerance
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CycleCounter.h"
+#include "OptimizerNetwork.h"
+#include "TrainingNetwork.h"
+#include "dory_mem.h"
+#include "pmsis.h"
+#include "testinputs.h"
+#include "testoutputs.h"
+
+/* Helper: true when ptr is in L2 (CPU-accessible); false when in L3 (external
+ * RAM) */
+#define IS_L2(ptr) ((uint32_t)(ptr) >= 0x10000000u)
+
+/* -------------------------------------------------------------------------
+ * Compile-time defaults — override via CMake target_compile_definitions
+ * ---------------------------------------------------------------------- */
+
+#ifndef N_TRAIN_STEPS
+#define 
N_TRAIN_STEPS 1 +#endif + +#ifndef N_ACCUM_STEPS +#define N_ACCUM_STEPS 1 +#endif + +#ifndef TRAINING_NUM_DATA_INPUTS +#define TRAINING_NUM_DATA_INPUTS 2 +#endif + +#define MAINSTACKSIZE 12000 +#define SLAVESTACKSIZE 3800 + +/* ------------------------------------------------------------------------- + * Cluster device + * ---------------------------------------------------------------------- */ + +struct pi_device cluster_dev; + +/* ------------------------------------------------------------------------- + * Loss storage (one value per forward pass) + * ---------------------------------------------------------------------- */ + +#define TOTAL_FWD_PASSES (N_TRAIN_STEPS * N_ACCUM_STEPS) +static float stored_losses[TOTAL_FWD_PASSES]; + +/* ------------------------------------------------------------------------- + * L3-aware memory transfer: handles all combinations of L2/L3 src and dst + * ---------------------------------------------------------------------- */ + +static void l3_aware_copy(void *dst, const void *src, uint32_t bytes) { + if (IS_L2(dst) && IS_L2(src)) { + memcpy(dst, src, bytes); + } else if (IS_L2(dst)) { + /* L3 → L2 */ + ram_read(dst, (void *)src, bytes); + } else if (IS_L2(src)) { + /* L2 → L3 */ + ram_write(dst, (void *)src, bytes); + } else { + /* L3 → L3: stage through a temporary L2 buffer */ + void *tmp = pi_l2_malloc(bytes); + ram_read(tmp, (void *)src, bytes); + ram_write(dst, tmp, bytes); + pi_l2_free(tmp, bytes); + } +} + +static void run_optimizer_step(void) { +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + /* --- Step A: copy current weights + grad acc → optimizer input buffers --- + * Skipped when codegen has shared the buffers (pointer equality test). 
*/ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t train_g_idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + wi; + uint32_t opt_w_in = 2u * wi; + uint32_t opt_g_in = 2u * wi + 1u; + + if (DeeployOptNetwork_inputs[opt_w_in] != + DeeployNetwork_inputs[train_w_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_w_in], + DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_inputs_bytes[opt_w_in]); + } + if (DeeployOptNetwork_inputs[opt_g_in] != + DeeployNetwork_inputs[train_g_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_g_in], + DeeployNetwork_inputs[train_g_idx], + DeeployOptNetwork_inputs_bytes[opt_g_in]); + } + } + + /* --- Step B: run optimizer kernel on cluster --- */ + struct pi_cluster_task opt_task; + pi_cluster_task(&opt_task, RunOptimizerNetwork, NULL); + opt_task.stack_size = MAINSTACKSIZE; + opt_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &opt_task); + + /* --- Step C: copy weight_updated back to training network's weight buffers + * --- Skipped when codegen has shared the output buffer with the training + * input. */ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t opt_w_out = wi; + + if (DeeployOptNetwork_outputs[opt_w_out] == + DeeployNetwork_inputs[train_w_idx]) { + continue; /* in-place: training buffer already updated */ + } + + uint32_t opt_bytes = DeeployOptNetwork_outputs_bytes[opt_w_out]; + uint32_t train_bytes = DeeployNetwork_inputs_bytes[train_w_idx]; + if (opt_bytes == train_bytes) { + l3_aware_copy(DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_outputs[opt_w_out], opt_bytes); + } else { + /* Broadcasted bias: fill every tile with updated value. */ + for (uint32_t off = 0; off < train_bytes; off += opt_bytes) { + uint32_t chunk = + (off + opt_bytes <= train_bytes) ? 
opt_bytes : (train_bytes - off); + l3_aware_copy((char *)DeeployNetwork_inputs[train_w_idx] + off, + DeeployOptNetwork_outputs[opt_w_out], chunk); + } + } + } +#endif /* TRAINING_NUM_WEIGHT_INPUTS */ +} + +/* ------------------------------------------------------------------------- + * Numerical comparison helpers — run on cluster (FC has no FPU) + * ---------------------------------------------------------------------- */ + +typedef struct { + float *computed; + float *reference; + uint32_t n; + uint32_t *err_count; +} LossCompareArgs; + +static void CompareLossesOnCluster(void *args) { + if (pi_core_id() != 0) + return; + LossCompareArgs *a = (LossCompareArgs *)args; + float tol = TRAINING_TOLERANCE_ABS; /* read on cluster — has FPU */ + uint32_t errors = 0; + for (uint32_t i = 0; i < a->n; i++) { + float diff = a->computed[i] - a->reference[i]; + if (diff < 0.0f) + diff = -diff; + printf(" [loss %u] computed=%.6f ref=%.6f diff=%.6f TOL=%.6f\r\n", i, + (double)a->computed[i], (double)a->reference[i], (double)diff, + (double)tol); + if (diff > tol) { + errors++; + } + } + *a->err_count = errors; +} + +/* ------------------------------------------------------------------------- + * main + * ---------------------------------------------------------------------- */ + +int main(void) { + + printf("=== Siracusa Training Harness (Phase 2 — with OptimizerNetwork) " + "===\r\n"); + printf("N_TRAIN_STEPS=%u N_ACCUM_STEPS=%u DATA_INPUTS=%u\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS, + (unsigned)TRAINING_NUM_DATA_INPUTS); + + struct pi_cluster_conf conf; + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return -1; + +#ifndef NOFLASH + mem_init(); + open_fs(); +#endif + + struct pi_cluster_task cluster_task; + + /* ------------------------------------------------------------------ + * Init training network + * ------------------------------------------------------------------ */ + + 
printf("Initializing TrainingNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ------------------------------------------------------------------ + * Zero-initialise gradient accumulation buffers. + * ------------------------------------------------------------------ */ + + for (uint32_t _gi = 0; _gi < (uint32_t)TRAINING_NUM_GRAD_INPUTS; _gi++) { + uint32_t _idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + _gi; + uint32_t bytes = DeeployNetwork_inputs_bytes[_idx]; + void *buf = DeeployNetwork_inputs[_idx]; + if (IS_L2(buf)) { + memset(buf, 0, bytes); + } else { + /* Write zeros into L3 via DMA using a temporary L2 zero page */ + uint8_t *zero_page = pi_l2_malloc(512); + memset(zero_page, 0, 512); + for (uint32_t off = 0; off < bytes; off += 512) { + uint32_t chunk = (off + 512 <= bytes) ? 512 : (bytes - off); + ram_write((char *)buf + off, zero_page, chunk); + } + pi_l2_free(zero_page, 512); + } + } + + /* ------------------------------------------------------------------ + * Init optimizer network + * ------------------------------------------------------------------ */ + + printf("Initializing OptimizerNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitOptimizerNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + uint32_t reset_idx = DeeployNetwork_num_inputs - 1; + + /* ------------------------------------------------------------------ + * Copy initial weights into network input buffers. + * (InitTrainingNetwork only malloc's them; testInitWeights[] holds + * the actual starting values from inputs.npz.) 
+ * ------------------------------------------------------------------ */ + +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + l3_aware_copy(DeeployNetwork_inputs[idx], testInitWeights[wi], + DeeployNetwork_inputs_bytes[idx]); + } +#endif + + printf("Starting training (%u optimizer steps x %u accum steps)...\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS); + + for (uint32_t update_step = 0; update_step < N_TRAIN_STEPS; update_step++) { + + for (uint32_t accum_step = 0; accum_step < N_ACCUM_STEPS; accum_step++) { + + uint32_t mb = update_step * N_ACCUM_STEPS + accum_step; + + printf(" update %u/%u accum %u/%u (mini-batch %u)\r\n", + update_step + 1, (unsigned)N_TRAIN_STEPS, accum_step + 1, + (unsigned)N_ACCUM_STEPS, mb); + + /* ① Set lazy_reset_grad. */ + { + void *reset_ptr = DeeployNetwork_inputs[reset_idx]; + uint8_t reset_val = (accum_step == 0) ? 1u : 0u; + if (IS_L2(reset_ptr)) { + *((uint8_t *)reset_ptr) = reset_val; + } else { + ram_write(reset_ptr, &reset_val, sizeof(uint8_t)); + } + } + + /* ② Load this mini-batch's data + labels (cycle through unique samples). + */ + for (uint32_t buf = 0; buf < TRAINING_NUM_DATA_INPUTS; buf++) { + l3_aware_copy(DeeployNetwork_inputs[buf], + testDataVector[mb % TRAINING_DATA_SIZE][buf], + DeeployNetwork_inputs_bytes[buf]); + } + + /* ③ Forward + backward + InPlaceAccumulatorV2. */ + pi_cluster_task(&cluster_task, RunTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ④ Store loss — use memcpy to avoid float registers on FC (no FPU). 
*/ + { + void *loss_ptr = DeeployNetwork_outputs[0]; + if (IS_L2(loss_ptr)) { + memcpy(&stored_losses[mb], loss_ptr, sizeof(float)); + } else { + ram_read(&stored_losses[mb], loss_ptr, sizeof(float)); + } + } + + } /* end accum_step loop */ + + /* ⑤ SGD weight update via Deeploy-compiled OptimizerNetwork. */ + run_optimizer_step(); + + } /* end update_step loop */ + + /* ------------------------------------------------------------------ + * Numerical verification — run on cluster (FC has no FPU) + * ------------------------------------------------------------------ */ + + uint32_t loss_err_count = 0; + uint32_t total_loss_checks = + (TOTAL_FWD_PASSES < N_LOSS_REFS) ? TOTAL_FWD_PASSES : N_LOSS_REFS; + LossCompareArgs loss_cmp_args = { + .computed = stored_losses, + .reference = (float *)testLossRef, + .n = total_loss_checks, + .err_count = &loss_err_count, + }; + pi_cluster_task(&cluster_task, CompareLossesOnCluster, &loss_cmp_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + printf("Errors: %u out of %u\r\n", (unsigned)loss_err_count, + (unsigned)total_loss_checks); + + return loss_err_count == 0 ? 
0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz deleted file mode 100644 index b51a843019..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx deleted file mode 100644 index 4e132a326b..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz deleted file mode 100644 index fede142f83..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz and /dev/null differ diff --git a/DeeployTest/deeployTrainingRunner_siracusa.py b/DeeployTest/deeployTrainingRunner_siracusa.py new file mode 100644 index 0000000000..c13cc31411 --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = False)) diff --git a/DeeployTest/deeployTrainingRunner_tiled_siracusa.py b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py new file mode 100644 index 0000000000..3509fc04fe --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = True)) diff --git a/DeeployTest/generateOptimizerNetwork.py 
b/DeeployTest/generateOptimizerNetwork.py
new file mode 100644
index 0000000000..d13b29505e
--- /dev/null
+++ b/DeeployTest/generateOptimizerNetwork.py
@@ -0,0 +1,148 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+"""
+Optimizer network code-generation entry point.
+
+Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits
+OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory.
+
+The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the
+default ``DeeployNetwork_``) so that it can be linked together with the
+training network without symbol conflicts.
+
+Usage
+-----
+    /usr/bin/python generateOptimizerNetwork.py \\
+        -t <test_dir> \\      # directory containing network.onnx
+        -d <dump_dir> \\      # where to write OptimizerNetwork.c/h
+        -p Siracusa \\
+        --cores 8 \\
+        --lr 0.001
+"""
+
+import os
+import sys
+from pathlib import Path
+
+import onnx
+import onnx_graphsurgeon as gs
+from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork
+from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
+from testUtils.testRunner import TestGeneratorArgumentParser
+from testUtils.trainingUtils import add_optimizer_training_dir_arg
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
+from Deeploy.DeeployTypes import _NoVerbosity
+from Deeploy.Logging import DEFAULT_LOGGER as log
+from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
+from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper
+from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel
+from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine
+
+
+def generateOptimizerNetwork(args):
+    log.debug("Arguments: %s", args)
+
+    # 1. 
Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). + graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create and prepare deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployOptimizerNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + # Set up memory hierarchy so AnnotateDefaultMemoryLevel assigns the correct + # memory level to ConstantBuffers (weights). The optimizer graph is NOT + # tiled, but it must share the same memory-level view as the training graph + # so that weights end up in the same physical location (L2 when L3 is the + # training default, see AnnotateDefaultMemoryLevel). 
+ L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + defaultTargetMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel) + deployer = MemoryDeployerWrapper(deployer, [AnnotateDefaultMemoryLevel(memoryHierarchy)]) + + verbosityCfg = _NoVerbosity + _ = deployer.prepare(verbosityCfg) + + # 5. Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[OptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[OptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 6. 
Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok = True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Optimizer network code generated in: {args.dumpdir}") + print(f"[OptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Optimizer Network Code Generation.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + parser.add_argument( + "--lr", + type = float, + default = 0.001, + help = "Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. 
Default: L2.") + add_optimizer_training_dir_arg(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateOptimizerNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mOptimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/generateTrainingNetwork.py b/DeeployTest/generateTrainingNetwork.py new file mode 100644 index 0000000000..febd95afdb --- /dev/null +++ b/DeeployTest/generateTrainingNetwork.py @@ -0,0 +1,238 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import sys + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ + _infer_total_mb, _load_reference_losses, add_training_inference_args +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine + + +def generateTrainingNetwork(args): + log.debug("Arguments: %s", args) + + # 1. Load network.onnx (training graph) + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # 1a. Handle UNDEFINED-typed outputs in training ONNX graphs. + # Backward pass ONNX often doesn't propagate types for gradient outputs. 
+ # (i) Strip UNDEFINED-typed outputs that have no consumers. + # (ii) Patch UNDEFINED-typed outputs WITH consumers to float32 (training default). + _stripped = False + _patched = False + for node in graph.nodes: + filtered = [out for out in node.outputs if not (out.dtype == 0 and len(out.outputs) == 0)] + if len(filtered) < len(node.outputs): + node.outputs = filtered + _stripped = True + for out in node.outputs: + if out.dtype == 0 and len(out.outputs) > 0: + out.dtype = np.dtype(np.float32) + _patched = True + if _stripped: + graph.cleanup() + log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes") + if _patched: + log.debug("Patched UNDEFINED-typed outputs with consumers to float32") + + # 2. Load inputs.npz (new format: no grad acc buf entries) + inputs_path = f'{args.dir}/inputs.npz' + inputs = np.load(inputs_path) + + # 3. Platform setup + platform, signProp = mapPlatform(args.platform) + + log.debug(f"Platform: {platform} (sign: {signProp})") + + # Set cores on cluster engines (same pattern as generateNetwork.py) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 4. Identify grad acc buf positions in the ONNX graph. + graph_input_names = [inp.name for inp in onnx_graph.graph.input] + grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n} + non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set] + + # Base npz arrays: keys that are neither per-mb entries (mb*) nor metadata (meta_*) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + npz_base = [inputs[k] for k in base_keys] + + if len(npz_base) != len(non_grad_indices): + raise ValueError(f"inputs.npz has {len(npz_base)} base entries but network.onnx has " + f"{len(non_grad_indices)} non-grad-buf inputs. 
" + f"Re-generate inputs.npz with the updated exporter.") + + # Build inputTypes / inputOffsets for ALL graph input positions. + inputTypes = {} + inputOffsets = {} + + npz_idx = 0 + for graph_idx in range(len(graph_input_names)): + if graph_idx in grad_acc_set: + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + arr = npz_base[npz_idx] + npz_idx += 1 + + if arr.dtype == bool or arr.dtype == np.bool_: + inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif arr.dtype in (np.float32, np.float64): + # Float32 training parameters always stay float32. + # inferTypeAndOffset would misclassify integer-valued floats + # (e.g. LayerNorm gamma=1.0 / beta=0.0) as int8_t. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif np.prod(arr.shape) == 0: + # Zero-sized input (ONNX allows shape (0, ...) for optional + # placeholders). No data to infer from, but downstream still + # looks up input_{idx} by key, so populate with a trivial default. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + values = arr.reshape(-1).astype(np.float32) + _type, offset = inferTypeAndOffset(values, signProp = False) + inputTypes[f"input_{graph_idx}"] = _type + inputOffsets[f"input_{graph_idx}"] = offset + + # 5. Create deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployTrainingNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + log.debug(f"Deployer: {deployer}") + + # 6. Prepare deployer + verbosityCfg = _NoVerbosity + + _ = deployer.prepare(verbosityCfg) + + # 7. Resolve num_data_inputs, n_steps, n_accum (auto-detect when not given). 
+ + # num_data_inputs: detect from npz mb1 variants if not specified + num_data = args.num_data_inputs + if num_data is None: + num_data = _infer_num_data_inputs(inputs_path) + log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz") + + # n_steps / n_accum: derive from inputs.npz mini-batch count if not specified + n_steps = args.n_steps + n_accum = args.n_accum + if n_steps is None or n_accum is None: + total_mb = _infer_total_mb(inputs_path) + log.info(f"Auto-detected total_mb={total_mb} from inputs.npz") + if n_steps is None and n_accum is None: + n_accum = _infer_n_accum(inputs_path) + n_steps = max(1, total_mb // n_accum) + elif n_steps is None: + n_steps = max(1, total_mb // n_accum) + else: + n_accum = max(1, total_mb // n_steps) + + log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}") + + # 8. Build unique_mb_data from npz (only data_size unique samples). + # The C harness cycles through them via mb % TRAINING_DATA_SIZE. + total_mb = n_steps * n_accum + data_size = _infer_data_size(inputs_path) + log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}") + mb0_data = list(npz_base[:num_data]) + + unique_mb_data = [] + for mb in range(data_size): + if mb == 0: + unique_mb_data.append(mb0_data) + else: + mb_row = [] + for buf_idx in range(num_data): + key = f"mb{mb}_arr_{buf_idx:04d}" + mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx]) + unique_mb_data.append(mb_row) + + # Grad acc buf info for testinputs.h. + if grad_acc_set: + sorted_grad = sorted(grad_acc_set) + grad_buf_start_idx = sorted_grad[0] + else: + grad_buf_start_idx = -1 + num_grad_inputs = len(grad_acc_set) + + # Initial weight arrays: npz_base[num_data .. grad_buf_start_idx-1] + if grad_buf_start_idx > num_data: + init_weights = list(npz_base[num_data:grad_buf_start_idx]) + else: + init_weights = [] + + # 9. Load reference loss from outputs.npz. + reference_losses = _load_reference_losses(args.dir) + + # 10. 
Generate all output files + os.makedirs(args.dumpdir, exist_ok = True) + + generateTrainingTestNetwork(deployer, + unique_mb_data, + args.dumpdir, + verbosityCfg, + n_steps = n_steps, + n_accum = n_accum, + num_data_inputs = num_data, + grad_buf_start_idx = grad_buf_start_idx, + num_grad_inputs = num_grad_inputs, + learning_rate = args.learning_rate, + reference_losses = reference_losses, + init_weights = init_weights, + data_size = data_size, + tolerance_abs = args.tolerance_abs) + + # 11. Write resolved config for execution.py to pick up after subprocess call. + meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent = 2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Training Code Generation Utility.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + add_training_inference_args(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTrainingNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTraining network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPOptimizer.py b/DeeployTest/testMVPOptimizer.py new file mode 100644 index 0000000000..3a94bf8e48 --- /dev/null +++ b/DeeployTest/testMVPOptimizer.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Tiled optimizer network code-generation entry point. 
+ +Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits +OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory, +using the SB-Tiler to tile SGD kernels through L1. + +The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the +default ``DeeployNetwork_``) so that it can be linked together with the +training network without symbol conflicts. + +Usage +----- + /usr/bin/python testMVPOptimizer.py \\ + -t \\ # directory containing network.onnx + -d \\ # where to write OptimizerNetwork.c/h + -p Siracusa \\ + --cores 8 \\ + --l1 64000 \\ + --l2 1024000 \\ + --defaultMemLevel L2 +""" + +import hashlib +import os +import sys +from pathlib import Path + +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler +from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import CodeGenVerbosity, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def generateTiledOptimizerNetwork(args) -> None: + log.debug("Arguments: %s", args) + + # 1. 
Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). + graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create deployer with _mockScheduler (required for TilerDeployerWrapper). + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployOptimizerNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + # 5. Set up memory hierarchy. + # Tiles execute in L1; optimizer I/O (weights, grads) live in L2 (or L3). + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64_000_000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + defaultTargetMemLevel = L1 + defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + # 6. Wrap with memory-level annotation. 
+ deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel) + deployer = MemoryDeployerWrapper(deployer, [ + AnnotateIOMemoryLevel(defaultIoMemLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy), + ]) + + # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed). + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + + # TrainingSBTiler extends all input buffer lifetimes to the end of the + # schedule (via TrainingMemoryScheduler). This prevents the allocator from + # reusing the space of a consumed input (e.g. fc1 weight) for a later + # output (e.g. fc2 updated weight), which would corrupt the weight buffer. + deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc + deployer.tiler.memoryAllocStrategy = args.memAllocStrategy + deployer.tiler.searchStrategy = args.searchStrategy + + # 8. Prepare deployer. + verbosityCfg = _NoVerbosity + if args.profileTiling: + verbosityCfg = CodeGenVerbosity(tilingProfiling = True) + _ = deployer.prepare(verbosityCfg) + + # 9. 
Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[TiledOptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[TiledOptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 10. Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok = True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Tiled optimizer network code generated in: {args.dumpdir}") + print(f"[TiledOptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Tiled Optimizer Network Code Generation.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + parser.add_argument( + "--lr", + type = float, + default = 0.001, + help = "Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. 
Default: L2.") + parser.add_argument("--memAllocStrategy", + type = str, + default = "MiniMalloc", + help = "Memory allocation strategy. Default: MiniMalloc.") + parser.add_argument("--searchStrategy", + type = str, + default = "random-max", + help = "CP solver search strategy. Default: random-max.") + parser.add_argument("--plotMemAlloc", + action = "store_true", + help = "Save memory allocation plots in the deeployStates folder.") + parser.add_argument("--profileTiling", + action = "store_true", + help = "Enable tiling profiling (inserts cycle counters around each tiled kernel).") + add_optimizer_training_dir_arg(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTiledOptimizerNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTiled optimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py new file mode 100644 index 0000000000..c0e4e7c2d8 --- /dev/null +++ b/DeeployTest/testMVPTraining.py @@ -0,0 +1,274 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import json +import os +import sys + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler +from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ + _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args +from testUtils.typeMapping import inferTypeAndOffset + +from 
Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import CodeGenVerbosity, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def generateTiledTrainingNetwork(args) -> None: + log.debug("Arguments: %s", args) + + # 1. Load network.onnx (training graph with forward + backward ops). + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # 1a. Strip UNDEFINED-typed unused optional outputs (e.g. MaxPool mask indices). + _stripped = False + for node in graph.nodes: + filtered = [out for out in node.outputs if not (out.dtype == 0 and len(out.outputs) == 0)] + if len(filtered) < len(node.outputs): + node.outputs = filtered + _stripped = True + if _stripped: + graph.cleanup() + log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes") + + # 2. Load inputs.npz. + inputs_path = f'{args.dir}/inputs.npz' + inputs = np.load(inputs_path) + + # 3. Platform setup. + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 4. Identify grad acc buf positions in the ONNX graph. 
+ graph_input_names = [inp.name for inp in onnx_graph.graph.input] + grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n} + non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set] + + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + npz_base = [inputs[k] for k in base_keys] + + if len(npz_base) != len(non_grad_indices): + raise ValueError(f"inputs.npz has {len(npz_base)} base entries but network.onnx has " + f"{len(non_grad_indices)} non-grad-buf inputs. " + f"Re-generate inputs.npz with the updated exporter.") + + # 5. Build inputTypes / inputOffsets for ALL graph input positions. + inputTypes = {} + inputOffsets = {} + + npz_idx = 0 + for graph_idx in range(len(graph_input_names)): + if graph_idx in grad_acc_set: + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + arr = npz_base[npz_idx] + npz_idx += 1 + if arr.dtype == bool or arr.dtype == np.bool_: + inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif arr.dtype in (np.float32, np.float64): + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif np.prod(arr.shape) == 0: + # Zero-sized input (ONNX allows shape (0, ...) for optional + # placeholders). No data to infer from, but downstream still + # looks up input_{idx} by key, so populate with a trivial default. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + values = arr.reshape(-1).astype(np.float32) + _type, offset = inferTypeAndOffset(values, signProp = False) + inputTypes[f"input_{graph_idx}"] = _type + inputOffsets[f"input_{graph_idx}"] = offset + + # 6. Create deployer with _mockScheduler (required for TilerDeployerWrapper). 
+ _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployTrainingNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + # 7. Set up memory hierarchy. + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64_000_000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + defaultTargetMemLevel = L1 + defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + # 8. Wrap with memory-level annotation. + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel) + + deployer = MemoryDeployerWrapper(deployer, [ + AnnotateIOMemoryLevel(defaultIoMemLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy), + ]) + + # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass). + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + + deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc + deployer.tiler.memoryAllocStrategy = args.memAllocStrategy + deployer.tiler.searchStrategy = args.searchStrategy + + # 10. Prepare deployer. + verbosityCfg = _NoVerbosity + if args.profileTiling: + verbosityCfg = CodeGenVerbosity(tilingProfiling = True) + _ = deployer.prepare(verbosityCfg) + + # 11. Resolve num_data_inputs, n_steps, n_accum. 
+ num_data = args.num_data_inputs + if num_data is None: + num_data = _infer_num_data_inputs(inputs_path) + log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz") + + n_steps = args.n_steps + n_accum = args.n_accum + if n_steps is None or n_accum is None: + total_mb = _infer_total_mb(inputs_path) + log.info(f"Auto-detected total_mb={total_mb} from inputs.npz") + if n_steps is None and n_accum is None: + n_accum = _infer_n_accum(inputs_path) + n_steps = max(1, total_mb // n_accum) + elif n_steps is None: + n_steps = max(1, total_mb // n_accum) + else: + n_accum = max(1, total_mb // n_steps) + + log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}") + + # 12. Build unique_mb_data from npz. + total_mb = n_steps * n_accum + data_size = _infer_data_size(inputs_path) + log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}") + mb0_data = list(npz_base[:num_data]) + + unique_mb_data = [] + for mb in range(data_size): + if mb == 0: + unique_mb_data.append(mb0_data) + else: + mb_row = [] + for buf_idx in range(num_data): + key = f"mb{mb}_arr_{buf_idx:04d}" + mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx]) + unique_mb_data.append(mb_row) + + # Grad acc buf info for testinputs.h. + if grad_acc_set: + sorted_grad = sorted(grad_acc_set) + grad_buf_start_idx = sorted_grad[0] + else: + grad_buf_start_idx = -1 + num_grad_inputs = len(grad_acc_set) + + if grad_buf_start_idx > num_data: + init_weights = list(npz_base[num_data:grad_buf_start_idx]) + else: + init_weights = [] + + # 13. Load reference losses. + reference_losses = _load_reference_losses(args.dir) + + # 14. Generate output files. 
+ os.makedirs(args.dumpdir, exist_ok = True) + + generateTrainingTestNetwork(deployer, + unique_mb_data, + args.dumpdir, + verbosityCfg, + n_steps = n_steps, + n_accum = n_accum, + num_data_inputs = num_data, + grad_buf_start_idx = grad_buf_start_idx, + num_grad_inputs = num_grad_inputs, + learning_rate = args.learning_rate, + reference_losses = reference_losses, + init_weights = init_weights, + data_size = data_size, + tolerance_abs = args.tolerance_abs) + + # 15. Write resolved config for execution.py to pick up. + meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent = 2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Tiled Training Code Generation Utility.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + add_training_inference_args(parser) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. Default: L2.") + parser.add_argument("--memAllocStrategy", + type = str, + default = "MiniMalloc", + help = "Memory allocation strategy. Default: MiniMalloc.") + parser.add_argument("--searchStrategy", + type = str, + default = "random-max", + help = "CP solver search strategy. 
Default: random-max.") + parser.add_argument("--plotMemAlloc", + action = "store_true", + help = "Save memory allocation plots in the deeployStates folder.") + parser.add_argument("--profileTiling", + action = "store_true", + help = "Enable tiling profiling (inserts cycle counters around each tiled kernel).") + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTiledTrainingNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTiled training network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testUtils/codeGenerateTraining.py b/DeeployTest/testUtils/codeGenerateTraining.py new file mode 100644 index 0000000000..4ef9a9fd8a --- /dev/null +++ b/DeeployTest/testUtils/codeGenerateTraining.py @@ -0,0 +1,892 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Code-generation helpers for the training / optimizer test harness. + +These functions emit the C source, header and data files for training tests +that drive both a TrainingNetwork (forward + backward + gradient accumulation) +and an OptimizerNetwork (SGD weight update) on the target platform. + +Kept as a separate module from testUtils.codeGenerate (which handles plain +inference codegen) so this PR's training-side additions touch the inference +helpers only through imports, not by interleaving with inference definitions. 
+""" + +import os +import re +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkDeployer +from Deeploy.Targets.MemPool.Platform import MemPoolPlatform +from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform + +from .codeGenerate import generateL3HexDump + + +def generateTrainingTestInputsHeader(deployer: NetworkDeployer, + all_mb_data: List[List[np.ndarray]], + n_steps: int, + n_accum: int, + grad_buf_start_idx: int = 0, + num_grad_inputs: int = 0, + learning_rate: float = 0.001, + init_weights: List[np.ndarray] = None, + data_size: int = None) -> str: + """Generate testinputs.h for training tests. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer (used to look up buffer types). + all_mb_data : list of list of np.ndarray + Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for + mini-batch *mb* and DATA buffer *buf*. All mini-batches must have the + same number of buffers. + n_steps : int + N_TRAIN_STEPS macro value. + n_accum : int + N_ACCUM_STEPS macro value. + grad_buf_start_idx : int + Index of the first grad accumulation buffer in DeeployNetwork_inputs[]. + Used to emit TRAINING_GRAD_BUF_START_IDX. Pass 0 (and num_grad_inputs=0) + to suppress the define (e.g. when no grad bufs exist). + num_grad_inputs : int + Number of grad accumulation buffers. Used to emit TRAINING_NUM_GRAD_INPUTS. + + Returns + ------- + str + C header string. + """ + total_mb = n_steps * n_accum + num_data = len(all_mb_data[0]) if all_mb_data else 0 + # data_size: number of unique samples stored in C arrays. + # C harness cycles: testDataVector[mb % TRAINING_DATA_SIZE]. + # Defaults to total_mb (no cycling) for backward compatibility. 
+ effective_data_size = data_size if (data_size is not None and data_size < total_mb) else total_mb + + retStr = "" + retStr += f"#define N_TRAIN_STEPS {n_steps}\n" + retStr += f"#define N_ACCUM_STEPS {n_accum}\n" + retStr += f"#define TRAINING_DATA_SIZE {effective_data_size}\n" + retStr += f"#define TRAINING_NUM_DATA_INPUTS {num_data}\n" + if num_grad_inputs > 0: + retStr += f"#define TRAINING_GRAD_BUF_START_IDX {grad_buf_start_idx}\n" + retStr += f"#define TRAINING_NUM_GRAD_INPUTS {num_grad_inputs}\n" + num_weight_inputs = grad_buf_start_idx - num_data + retStr += f"#define TRAINING_NUM_WEIGHT_INPUTS {num_weight_inputs}\n" + retStr += f"#define TRAINING_LEARNING_RATE {learning_rate:.10g}f\n" + retStr += "\n" + + # Emit per-mini-batch buffer arrays — only effective_data_size unique rows. + # all_mb_data must contain exactly effective_data_size rows. + for mb in range(effective_data_size): + mb_data = all_mb_data[mb] if mb < len(all_mb_data) else all_mb_data[-1] + row_entries = [] + for buf_idx, arr in enumerate(mb_data): + values = arr.reshape(-1) + + # Determine C type from deployer context (buffer "input_N"). 
+ input_key = f"input_{buf_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + # Fallback: infer from numpy dtype + if arr.dtype == np.float32 or arr.dtype == np.float64: + typeName = "float32_t" + typeWidth = 32 + elif arr.dtype == np.int64: + typeName = "int64_t" + typeWidth = 64 + elif arr.dtype == np.bool_ or arr.dtype == bool: + typeName = "uint8_t" + typeWidth = 8 + else: + typeName = "int32_t" + typeWidth = 32 + + buf_name = f"testData_mb{mb}_buf{buf_idx}" + row_entries.append(buf_name) + + # Format values + if typeName == 'float32_t': + list_str = ", ".join( + [f'{float(x)}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values.astype(np.float32)]) + else: + list_str = ", ".join([str(x) for x in values]) + + # 4-byte alignment padding + total_bytes = (values.size * typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth + list_str += ", " + ", ".join("0" for _ in range(paddingElements)) + + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + + # Emit the row pointer array for this mini-batch + row_name = f"testDataRow{mb}" + retStr += f"void* {row_name}[] = {{{', '.join(f'(void*){e}' for e in row_entries)}}};\n" + retStr += "\n" + + # Emit the top-level vector of row pointers (only unique samples; C harness cycles via modulo). + retStr += f"void** testDataVector[{effective_data_size}] = {{{', '.join(f'testDataRow{mb}' for mb in range(effective_data_size))}}};\n" + + # Emit initial weight arrays (one per weight input, indices num_data..grad_buf_start_idx-1). 
+ if init_weights: + retStr += "\n" + weight_entries = [] + num_data = len(all_mb_data[0]) if all_mb_data else 0 + for wi, arr in enumerate(init_weights): + buf_global_idx = num_data + wi + input_key = f"input_{buf_global_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + typeName = "float32_t" + typeWidth = 32 + values = arr.reshape(-1).astype(np.float32) + # Tile values to match Deeploy's internal (possibly sequence-length-tiled) shape. + if deployer.ctxt.is_buffer(input_key): + expected_nelems = int(np.prod(deployer.ctxt.lookup(input_key).shape)) + if expected_nelems > len(values) and expected_nelems % len(values) == 0: + values = np.tile(values, expected_nelems // len(values)) + list_str = ", ".join([f'{float(x)}f' for x in values]) + buf_name = f"testInitWeight_{wi}" + weight_entries.append(buf_name) + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + retStr += f"void* testInitWeights[{len(weight_entries)}] = {{{', '.join(f'(void*){e}' for e in weight_entries)}}};\n" + + return retStr + + +def generateTrainingTestOutputsHeader( + reference_losses: List = None, + tolerance_abs: float = 1e-3, +) -> str: + """Generate testoutputs.h for training tests — loss comparison only. + + Parameters + ---------- + reference_losses : list of float, optional + Reference loss value for each forward pass (one per mini-batch step). + If None, loss comparison is skipped. + tolerance_abs : float + Absolute comparison tolerance emitted as TRAINING_TOLERANCE_ABS. + + Returns + ------- + str + C header string. 
+ """ + has_loss = reference_losses is not None and len(reference_losses) > 0 + + retStr = "// testoutputs.h — Phase 2: loss verification\n" + retStr += f"#define TRAINING_TOLERANCE_ABS {tolerance_abs:.10g}f\n\n" + + if has_loss: + n = len(reference_losses) + retStr += "// Expected loss for each forward pass (one per mini-batch)\n" + retStr += f"#define N_LOSS_REFS {n}\n" + vals = ", ".join(f"{float(v):.10g}f" for v in reference_losses) + retStr += f"float32_t testLossRef[{n}] = {{{vals}}};\n\n" + else: + retStr += "// No loss reference available — loss comparison skipped.\n" + retStr += "#define N_LOSS_REFS 0\n\n" + + return retStr + + +def generateTrainingNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate TrainingNetwork.h — same as generateTestNetworkHeader but with + RunTrainingNetwork / InitTrainingNetwork function names and a distinct header guard. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer. + + Returns + ------- + str + C header string. + """ + retStr = "" + + retStr += """ +#ifndef __DEEPLOY_TRAINING_HEADER__ +#define __DEEPLOY_TRAINING_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunTrainingNetwork(); +void InitTrainingNetwork(); + +""" + else: + retStr += """ +void RunTrainingNetwork(uint32_t core_id, uint32_t numThreads); +void InitTrainingNetwork(uint32_t core_id, uint32_t numThread); + +""" + + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + + return retStr + + +def generateTrainingNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str: + """Generate TrainingNetwork.c — same as generateTestNetworkImplementation but with + RunTrainingNetwork / InitTrainingNetwork function names and including TrainingNetwork.h. 
+ + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + + Returns + ------- + str + C implementation string. + """ + retStr = "" + + retStr += """#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +""" + retStr += deployer.generateIncludeString() + retStr += """ + +#include "TrainingNetwork.h" + +""" + + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + if isinstance(deployer.Platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ +void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunTrainingNetwork(){ +""" + retStr += deployer.generateInferenceInitializationCode() + else: + retStr += """ +void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbosityCfg) + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +} + +void InitTrainingNetwork(){ +""" + else: + retStr += """ +} + +void InitTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ +} +""" + + return retStr + + +def generateTrainingTestNetwork(deployer: NetworkDeployer, + all_mb_data: List[List[np.ndarray]], + dumpdir: str, + verbosityCfg: CodeGenVerbosity, + n_steps: int = 1, + n_accum: int = 1, + num_data_inputs: int = 2, + grad_buf_start_idx: int = 0, + num_grad_inputs: int = 0, + learning_rate: float = 0.001, + reference_losses: List = None, + init_weights: 
List = None, + data_size: int = None, + tolerance_abs: float = 1e-3) -> None: + """Generate all training test files: testinputs.h, testoutputs.h, TrainingNetwork.h, TrainingNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer (ctxt.name must already be set to "DeeployTrainingNetwork"). + all_mb_data : list of list of np.ndarray + Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for + mini-batch *mb* and DATA buffer *buf*. + dumpdir : str + Output directory for generated files. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + n_steps : int + N_TRAIN_STEPS value. + n_accum : int + N_ACCUM_STEPS value. + num_data_inputs : int + Number of data inputs (TRAINING_NUM_DATA_INPUTS). + grad_buf_start_idx : int + Index of the first grad accumulation buffer in DeeployNetwork_inputs[]. + num_grad_inputs : int + Number of grad accumulation buffers (TRAINING_NUM_GRAD_INPUTS). + """ + assert deployer.prepared, "An unprepared deployer was given" + + os.makedirs(dumpdir, exist_ok = True) + + # testinputs.h + testInputStr = generateTrainingTestInputsHeader(deployer, + all_mb_data, + n_steps, + n_accum, + grad_buf_start_idx, + num_grad_inputs, + learning_rate, + init_weights = init_weights, + data_size = data_size) + with open(f'{dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # testoutputs.h + testOutputStr = generateTrainingTestOutputsHeader( + reference_losses = reference_losses, + tolerance_abs = tolerance_abs, + ) + with open(f'{dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # TrainingNetwork.h + headerStr = generateTrainingNetworkHeader(deployer) + with open(f'{dumpdir}/TrainingNetwork.h', 'w') as f: + f.write(headerStr) + + # TrainingNetwork.c + implStr = generateTrainingNetworkImplementation(deployer, verbosityCfg) + with open(f'{dumpdir}/TrainingNetwork.c', 'w') as f: + f.write(implStr) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + for fname in 
['TrainingNetwork.c', 'TrainingNetwork.h', 'testinputs.h', 'testoutputs.h']: + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}') + + # Build initial-value list for every input_N buffer so that L3 hex files + # can be written. The list must cover all N where "input_N" exists in the + # deployer context. Layout (must match DeeployNetwork_inputs[] order): + # [0 .. num_data_inputs-1] → first mini-batch data + # [num_data_inputs .. grad_start-1] → initial weights + # [grad_start .. grad_start+num_grad-1] → zeros (grad acc bufs) + # [last] → lazy_reset_grad = 1 (uint8) + l3_initial_inputs: List[np.ndarray] = [] + # Count how many input_N buffers exist in the deployer context + n_total_inputs = sum( + 1 for name in deployer.ctxt.globalObjects if name.startswith("input_") and name[len("input_"):].isdigit()) + for i in range(n_total_inputs): + if all_mb_data and i < len(all_mb_data[0]): + # Data / label input + l3_initial_inputs.append(all_mb_data[0][i]) + elif (init_weights is not None and grad_buf_start_idx > 0 and num_data_inputs <= i < grad_buf_start_idx): + # Weight input + wi = i - num_data_inputs + l3_initial_inputs.append(init_weights[wi] if wi < + len(init_weights) else np.array([0.0], dtype = np.float32)) + elif (grad_buf_start_idx > 0 and num_grad_inputs > 0 + and grad_buf_start_idx <= i < grad_buf_start_idx + num_grad_inputs): + # Gradient accumulation buffer — zero-initialised + buf = deployer.ctxt.globalObjects.get(f"input_{i}") + shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,) + l3_initial_inputs.append(np.zeros(shape, dtype = np.float32)) + else: + # lazy_reset_grad (last input) or any unknown slot — default 1 / uint8 + buf = deployer.ctxt.globalObjects.get(f"input_{i}") + shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,) + l3_initial_inputs.append(np.ones(shape, dtype = np.uint8)) + + generateL3HexDump(deployer, os.path.join(dumpdir, 'hex'), l3_initial_inputs, []) + + +# 
--------------------------------------------------------------------------- +# Optimizer network code-generation helpers +# --------------------------------------------------------------------------- + +_OPT_PREFIX = "DeeployOptNetwork_" +_TRAIN_PREFIX = "DeeployNetwork_" + + +def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict[int, int], Dict[int, int]]: + """Build optimizer→training index maps for tensors shared between the two graphs. + + The optimizer ONNX inputs are interleaved weight/grad pairs that have the + same tensor names as inputs in the training ONNX graph. We match by name + so that ``InitOptimizerNetwork`` can reference the already-allocated + ``DeeployNetwork_input_N`` pointers instead of allocating fresh buffers. + + Parameters + ---------- + train_onnx_path : str + Path to the training ``network.onnx``. + opt_onnx_model : + Already-loaded optimizer ONNX model (``onnx.ModelProto``). + + Returns + ------- + shared_input_map : Dict[int, int] + opt_input_idx → train_input_idx + shared_output_map : Dict[int, int] + opt_output_idx → train_input_idx (SGD outputs == updated weights, + same physical buffer as the weight input) + """ + import onnx as _onnx + train_model = _onnx.load_model(train_onnx_path) + train_names = [inp.name for inp in train_model.graph.input] + train_name_to_idx = {name: i for i, name in enumerate(train_names)} + + opt_input_names = [inp.name for inp in opt_onnx_model.graph.input] + opt_output_names = [out.name for out in opt_onnx_model.graph.output] + + shared_input_map: Dict[int, int] = {} + for opt_idx, name in enumerate(opt_input_names): + if name in train_name_to_idx: + shared_input_map[opt_idx] = train_name_to_idx[name] + + shared_output_map: Dict[int, int] = {} + for opt_idx, name in enumerate(opt_output_names): + # Try exact match first; then strip the '_updated' suffix that the SGD + # node appends to output tensor names (e.g. 'conv1_weight_updated' → 'conv1_weight'). 
+ lookup_name = name + if lookup_name not in train_name_to_idx and lookup_name.endswith('_updated'): + lookup_name = lookup_name[:-len('_updated')] + if lookup_name in train_name_to_idx: + shared_output_map[opt_idx] = train_name_to_idx[lookup_name] + + return shared_input_map, shared_output_map + + +def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str: + """Redirect optimizer I/O buffers to Training's already-allocated buffers. + + Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that + the generated symbols already carry the ``DeeployOptNetwork_`` prefix. + + Handles two allocation styles produced by Deeploy: + + *Non-tiled* (per-buffer malloc):: + + DeeployOptNetwork_input_N = (SomeType *)pi_l2_malloc(sizeof(...)); + + *Tiled* (single arena with offsets):: + + DeeployOptNetwork_input_N = (float32_t *)((char *)DeeployOptNetwork_MEMORYARENA_L2 + OFFSET); + + Both are replaced with direct pointers into the TrainingNetwork arenas:: + + DeeployOptNetwork_input_N = (float32_t *)DeeployNetwork_input_M; + + After all I/O pointers are redirected, if a ``MEMORYARENA_L2`` or + ``MEMORYARENA_L3`` allocation is no longer referenced anywhere in the Init + body (i.e., the shared buffers consumed the entire arena), the now-unused + malloc is also removed to reclaim the L2/L3 memory. + + Parameters + ---------- + retStr : str + The already-prefix-substituted C source string. + shared_input_map : Dict[int, int] + Optimizer input index → training input index. + shared_output_map : Dict[int, int] + Optimizer output index → training input index (in-place update). + + Returns + ------- + str + Patched C source string. 
+ """ + if not shared_input_map and not shared_output_map: + return retStr + + # ------------------------------------------------------------------ + # Pattern 1 (non-tiled): individual pi_*_malloc per buffer + # ------------------------------------------------------------------ + _malloc_pat = re.compile( + r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)\s*pi_\w+_malloc\([^;]+\);') + + # ------------------------------------------------------------------ + # Pattern 2 (tiled): arena-offset assignment + # DeeployOptNetwork_input_N = (Type *)((char *)DeeployOptNetwork_MEMORYARENA_Lx + OFFSET); + # ------------------------------------------------------------------ + _arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)' + r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;') + + def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]: + if kind == "input" and idx in shared_input_map: + train_idx = shared_input_map[idx] + return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */' + if kind == "output" and idx in shared_output_map: + train_idx = shared_output_map[idx] + return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */' + return None + + def _replace(m: re.Match) -> str: + replacement = _make_replacement(m.group(1), m.group(2), int(m.group(3))) + return replacement if replacement is not None else m.group(0) + + retStr = _malloc_pat.sub(_replace, retStr) + retStr = _arena_pat.sub(_replace, retStr) + + # ------------------------------------------------------------------ + # Arena elimination: if a MEMORYARENA_Lx is no longer used for any + # pointer arithmetic after the redirects, its malloc is dead and can + # be removed to reclaim L2/L3. The global declaration is left in + # place (harmless; the variable will be NULL at runtime). 
+ # ------------------------------------------------------------------ + for level in ('L2', 'L3'): + arena_sym = f'DeeployOptNetwork_MEMORYARENA_{level}' + # Pattern for the malloc assignment line itself + malloc_line_pat = re.compile(rf'[^\n]*{re.escape(arena_sym)}\s*=\s*\([^)]+\)\s*pi_\w+_malloc\([^;]+\);\s*\n') + # Pattern for any use of the arena in pointer arithmetic: + # (char *)ARENA + OFFSET or (void *)ARENA etc. + arena_use_pat = re.compile(rf'\(\s*(?:char|void|int8_t)\s*\*\s*\)\s*{re.escape(arena_sym)}') + if not arena_use_pat.search(retStr): + # No remaining pointer arithmetic — the malloc is dead + retStr = malloc_line_pat.sub('', retStr) + + # ------------------------------------------------------------------ + # Inject TrainingNetwork header so DeeployNetwork_input_N symbols resolve + # ------------------------------------------------------------------ + retStr = retStr.replace( + '#include "OptimizerNetwork.h"', + '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"', + ) + return retStr + + +def _patch_shared_arenas(retStr: str, train_c_source: str) -> str: + """Redirect optimizer L1/L2 arena allocations to reuse training network's arenas. + + TrainingNetwork and OptimizerNetwork run strictly sequentially: RunTrainingNetwork() + completes before RunOptimizerNetwork() starts. Their L1/L2 tile-working arenas + therefore never overlap in time and can share the same physical memory. + + Only the L1 arena is shared: it is pure tile-compute scratch whose content is + dead after each kernel returns. The L2 arena is NOT shared because it may hold + persistent tensor data (weights, activations) at fixed offsets in non-tiled mode; + sharing it would let the optimizer's L2 staging buffers overwrite that data. + + Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution. + + Parameters + ---------- + retStr : str + The already-prefix-substituted C source string for the optimizer. 
+ train_c_source : str + The full text of TrainingNetwork.c (used to confirm the arena symbols exist). + + Returns + ------- + str + Patched C source string. + """ + for level in ('L1',): + train_sym = f'DeeployNetwork_MEMORYARENA_{level}' + # Only alias if the training network actually has this arena + if train_sym not in train_c_source: + continue + + opt_sym = f'DeeployOptNetwork_MEMORYARENA_{level}' + opt_malloc_pat = re.compile(rf'({re.escape(opt_sym)})\s*=\s*\([^)]+\)\s*\w+\(sizeof\([^)]+\)\s*\*\s*\d+\)\s*;') + if not opt_malloc_pat.search(retStr): + continue + + replacement = f'{opt_sym} = (int8_t *){train_sym}; /* shared with TrainingNetwork */' + retStr = opt_malloc_pat.sub(replacement, retStr) + + # Inject TrainingNetwork header if not already present + # (_patch_shared_buffers may have already added it; guard against duplicates) + if '#include "TrainingNetwork.h"' not in retStr: + retStr = retStr.replace( + '#include "OptimizerNetwork.h"', + '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"', + ) + + return retStr + + +def _ensure_training_l1_capacity(dumpdir: str, train_c_source: str, opt_alloc_code: str) -> str: + """Enlarge TrainingNetwork's L1 arena to cover the optimizer's L1 needs. + + Since the two networks share the same L1 arena, TrainingNetwork must allocate + at least max(train_L1, opt_L1) bytes. When the optimizer needs more L1 than + training (rare but possible, e.g. autoencoder), this function patches + TrainingNetwork.c and TrainingNetwork.h in-place and returns the updated + TrainingNetwork.c source string. + + Parameters + ---------- + dumpdir : str + Directory containing TrainingNetwork.c and TrainingNetwork.h. + train_c_source : str + Current content of TrainingNetwork.c. + opt_alloc_code : str + Optimizer buffer-allocation code after _TRAIN_PREFIX → _OPT_PREFIX + substitution (used to extract the optimizer's L1 size). + + Returns + ------- + str + (Possibly updated) TrainingNetwork.c source string. 
+ """ + m_opt = re.search( + r'DeeployOptNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*(\d+)\)', + opt_alloc_code, + ) + if not m_opt: + return train_c_source + + opt_l1 = int(m_opt.group(1)) + + m_train = re.search( + r'(DeeployNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*)(\d+)(\))', + train_c_source, + ) + if not m_train: + return train_c_source + + train_l1 = int(m_train.group(2)) + if opt_l1 <= train_l1: + return train_c_source # Already large enough + + new_l1 = opt_l1 + + # Patch TrainingNetwork.c malloc size + train_c_new = train_c_source.replace( + m_train.group(0), + f'{m_train.group(1)}{new_l1}{m_train.group(3)}', + 1, + ) + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + with open(train_c_path, 'w') as f: + f.write(train_c_new) + + # Patch TrainingNetwork.h _len constant + train_h_path = os.path.join(dumpdir, 'TrainingNetwork.h') + if os.path.exists(train_h_path): + train_h = open(train_h_path).read() + train_h_new = re.sub( + r'(DeeployNetwork_MEMORYARENA_L1_len\s*=\s*)\d+', + rf'\g<1>{new_l1}', + train_h, + ) + with open(train_h_path, 'w') as f: + f.write(train_h_new) + + return train_c_new + + +def generateOptimizerNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate OptimizerNetwork.h. + + Reuses the Deeploy deployer's output and applies two transformations: + 1. Replace the buffer prefix ``DeeployNetwork_`` → ``DeeployOptNetwork_`` + 2. Inject ``RunOptimizerNetwork`` / ``InitOptimizerNetwork`` function declarations. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + + Returns + ------- + str + C header string. 
+ """ + retStr = "" + retStr += """ +#ifndef __DEEPLOY_OPTIMIZER_HEADER__ +#define __DEEPLOY_OPTIMIZER_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(); +void InitOptimizerNetwork(); + +""" + else: + retStr += """ +void RunOptimizerNetwork(uint32_t core_id, uint32_t numThreads); +void InitOptimizerNetwork(uint32_t core_id, uint32_t numThreads); + +""" + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + # Prefix substitution: all Deeploy-generated DeeployNetwork_ → DeeployOptNetwork_ + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + return retStr + + +def generateOptimizerNetworkImplementation(deployer: NetworkDeployer, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = None, + shared_output_map: Optional[Dict[int, int]] = None, + train_c_source: Optional[str] = None) -> str: + """Generate OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + When provided, those malloc calls are replaced with references to the + already-allocated TrainingNetwork buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + train_c_source : str, optional + Full text of TrainingNetwork.c. When provided, the optimizer's L1/L2 arena + malloc calls are replaced with direct pointers to the training arenas, + saving one L1 and one L2 allocation (safe because the two networks run + strictly sequentially). + + Returns + ------- + str + C implementation string. 
+ """ + retStr = "" + retStr += """#include +#include +#include +""" + retStr += deployer.generateIncludeString() + retStr += """ +#include "OptimizerNetwork.h" + +""" + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + if isinstance(deployer.Platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(){ +""" + retStr += deployer.generateInferenceInitializationCode() + else: + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbosityCfg) + + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +} + +void InitOptimizerNetwork(){ +""" + else: + retStr += """ +} + +void InitOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ +} +""" + # Prefix substitution + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + # Replace malloc calls for shared weight/grad buffers with Training pointers + retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {}) + # Redirect optimizer L1/L2 arena mallocs to reuse training arenas + if train_c_source: + retStr = _patch_shared_arenas(retStr, train_c_source) + return retStr + + +def generateOptimizerTestNetwork(deployer: NetworkDeployer, + dumpdir: str, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = 
None, + shared_output_map: Optional[Dict[int, int]] = None) -> None: + """Generate OptimizerNetwork.h and OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + dumpdir : str + Output directory for generated files. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + """ + assert deployer.prepared, "An unprepared deployer was given" + + os.makedirs(dumpdir, exist_ok = True) + + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + train_c_source: Optional[str] = None + if os.path.exists(train_c_path): + with open(train_c_path, 'r') as f: + train_c_source = f.read() + + # Enlarge training L1 arena if optimizer needs more (so unconditional L1 sharing is safe) + if train_c_source: + opt_alloc_preview = deployer.generateBufferAllocationCode().replace(_TRAIN_PREFIX, _OPT_PREFIX) + train_c_source = _ensure_training_l1_capacity(dumpdir, train_c_source, opt_alloc_preview) + + headerStr = generateOptimizerNetworkHeader(deployer) + with open(f'{dumpdir}/OptimizerNetwork.h', 'w') as f: + f.write(headerStr) + + implStr = generateOptimizerNetworkImplementation(deployer, verbosityCfg, shared_input_map, shared_output_map, + train_c_source) + with open(f'{dumpdir}/OptimizerNetwork.c', 'w') as f: + f.write(implStr) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + for fname in ['OptimizerNetwork.c', 'OptimizerNetwork.h']: + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}') diff --git a/DeeployTest/testUtils/core/config.py b/DeeployTest/testUtils/core/config.py index e932c23962..0ecf45d467 100644 --- a/DeeployTest/testUtils/core/config.py +++ b/DeeployTest/testUtils/core/config.py @@ -24,6 +24,14 @@ 
class DeeployTestConfig: gen_args: List[str] = None verbose: int = 0 debug: bool = False + training: bool = False + # None means "auto-detect from ONNX graph / inputs.npz during codegen" + n_train_steps: Optional[int] = None + n_accum_steps: Optional[int] = None + training_num_data_inputs: Optional[int] = None + # Directory containing the optimizer ONNX (network.onnx with SGD nodes). + # If None, auto-derived as <test_dir>/../<name>_optimizer (test dir base name with '_train' replaced by '_optimizer') when training=True. + optimizer_dir: Optional[str] = None def __post_init__(self): if self.cmake_args is None: diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..2fb1224c92 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -10,6 +10,7 @@ from Deeploy.Logging import DEFAULT_LOGGER as log +from ..trainingUtils import add_training_cmake_flags, run_training_codegen from .config import DeeployTestConfig from .output_parser import TestResult, parse_test_output @@ -27,6 +28,10 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent + if config.training: + run_training_codegen(config, script_dir) + return + if config.tiling: generation_script = script_dir / "testMVP.py" else: @@ -102,6 +107,9 @@ def configure_cmake(config: DeeployTestConfig) -> None: else: cmd.append("-Dgvsoc_simulation=OFF") + add_training_cmake_flags(cmd, config.training, config.n_train_steps, config.n_accum_steps, + config.training_num_data_inputs) + # Last argument is the source directory script_dir = Path(__file__).parent.parent.parent cmd.append(str(script_dir.parent)) diff --git a/DeeployTest/testUtils/deeployTrainingRunner.py b/DeeployTest/testUtils/deeployTrainingRunner.py new file mode 100644 index 0000000000..8f523bf264 --- /dev/null +++ b/DeeployTest/testUtils/deeployTrainingRunner.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# 
SPDX-License-Identifier: Apache-2.0 +""" +Common entry point for Siracusa training test runners (non-tiled and tiled). + +Usage: + from testUtils.deeployTrainingRunner import main + sys.exit(main(tiling_enabled=False)) # non-tiled + sys.exit(main(tiling_enabled=True)) # tiled (SBTiler) +""" + +import os +from pathlib import Path + +# gapy (gvsoc launcher) uses `#!/usr/bin/env python3`. Put /usr/bin first so +# it resolves to /usr/bin/python3 which has all required packages (gapylib, +# prettytable, …) rather than the minimal venv python. +os.environ['PATH'] = '/usr/bin:' + os.environ.get('PATH', '') + +from .core import DeeployTestConfig, run_complete_test +from .core.paths import get_test_paths +from .deeployRunner import DeeployRunnerArgumentParser, print_colored_result, print_configuration + + +def main(tiling_enabled: bool = False, default_platform: str = 'Siracusa', default_simulator: str = 'gvsoc'): + """ + Build parser, parse args, create DeeployTestConfig, and run the training test. + + Parameters + ---------- + tiling_enabled: + True → passes tiling args (--l1, --l2, …) and sets tiling=True in config. + default_platform: + Platform used when -p is not given on the command line. + default_simulator: + Simulator used when -s is not given on the command line. 
+ """ + + parser = DeeployRunnerArgumentParser(tiling_arguments = tiling_enabled, platform_required = False) + + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cluster cores (default: 8)\n') + parser.add_argument('--n-steps', + metavar = '', + dest = 'n_steps', + type = int, + default = None, + help = 'N_TRAIN_STEPS: optimizer steps (auto-detected if not given)\n') + parser.add_argument('--n-accum', + metavar = '', + dest = 'n_accum', + type = int, + default = None, + help = 'N_ACCUM_STEPS: mini-batches per update step (auto-detected if not given)\n') + parser.add_argument('--num-data-inputs', + metavar = '', + dest = 'num_data_inputs', + type = int, + default = None, + help = 'Inputs that change each mini-batch (auto-detected if not given)\n') + parser.add_argument('--optimizer-dir', + metavar = '', + dest = 'optimizer_dir', + type = str, + default = None, + help = 'Directory containing the optimizer network.onnx ' + "(default: auto-derived by replacing '_train' with '_optimizer')\n") + parser.add_argument( + '--tolerance', + metavar = '', + dest = 'tolerance', + type = float, + default = None, + help = 'Absolute loss tolerance for pass/fail comparison (default: auto from generateTrainingNetwork.py)\n') + + args = parser.parse_args() + + platform = default_platform + simulator = args.simulator if args.simulator else default_simulator + + script_path = Path(__file__).resolve() + base_dir = script_path.parent.parent + + gen_dir, test_dir_abs, test_name = get_test_paths(args.dir, platform, base_dir = str(base_dir)) + + worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'master') + build_dir = str(base_dir / f'TEST_{platform.upper()}' / f'build_{worker_id}') + + cmake_args = [f'-DNUM_CORES={args.cores}'] + if args.cmake: + cmake_args.extend(args.cmake) + + gen_args = [f'--cores={args.cores}'] + if args.tolerance is not None: + gen_args.append(f'--tolerance={args.tolerance}') + if args.input_type_map: + gen_args.extend(['--input-type-map'] + 
list(args.input_type_map)) + if args.input_offset_map: + gen_args.extend(['--input-offset-map'] + list(args.input_offset_map)) + + if tiling_enabled: + if getattr(args, 'defaultMemLevel', None): + gen_args.append(f'--defaultMemLevel={args.defaultMemLevel}') + if getattr(args, 'l1', None): + gen_args.append(f'--l1={args.l1}') + if getattr(args, 'l2', None) and args.l2 != 1024000: + gen_args.append(f'--l2={args.l2}') + if getattr(args, 'memAllocStrategy', None): + gen_args.append(f'--memAllocStrategy={args.memAllocStrategy}') + if getattr(args, 'searchStrategy', None): + gen_args.append(f'--searchStrategy={args.searchStrategy}') + if getattr(args, 'profileTiling', False): + gen_args.append('--profileTiling') + if getattr(args, 'plotMemAlloc', False): + gen_args.append('--plotMemAlloc') + + config = DeeployTestConfig( + test_name = test_name, + test_dir = test_dir_abs, + platform = platform, + simulator = simulator, + tiling = tiling_enabled, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = args.toolchain, + toolchain_install_dir = args.toolchain_install_dir, + cmake_args = cmake_args, + gen_args = gen_args, + verbose = args.verbose, + debug = args.debug, + training = True, + n_train_steps = args.n_steps, + n_accum_steps = args.n_accum, + training_num_data_inputs = args.num_data_inputs, + optimizer_dir = args.optimizer_dir, + ) + + print_configuration(config) + + try: + result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + print_colored_result(result, config.test_name) + return 0 if result.success else 1 + except Exception as e: + RED = '\033[91m' + RESET = '\033[0m' + print(f'\n{RED}✗ Test {config.test_name} FAILED with exception: {e}{RESET}') + return 1 diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 0c3986cd6e..1dfb43bea4 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -2,11 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from 
typing import List, Union +from typing import Dict, List, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import PatternMemoryConstraints +from Deeploy.TilingExtension.MemoryScheduler import MemoryScheduler from Deeploy.TilingExtension.TilerExtension import Tiler from Deeploy.TilingExtension.TilerModel import TilerModel @@ -43,3 +45,27 @@ class SBTiler(Tiler): def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], hop: str, tensorName: str) -> Union[int, IntVar]: return 1 + + +class TrainingMemoryScheduler(MemoryScheduler): + """MemoryScheduler variant for training networks. + + Extends input tensor lifetimes to the end of the full tiling schedule so + that forward-pass inputs remain live during the backward pass. + """ + + def _calculateLifetimes(self, ctxt: NetworkContext, patternMemoryConstraint: PatternMemoryConstraints, + memoryLevel: str) -> Tuple[Dict[str, Tuple[int, int]], Dict]: + tensorLifetimeMap, tensorMap = super()._calculateLifetimes(ctxt, patternMemoryConstraint, memoryLevel) + + maxStepIdx = len(patternMemoryConstraint.nodeConstraints) + for tensorName, lifetime in tensorLifetimeMap.items(): + buffer = ctxt.lookup(tensorName) + if buffer.is_input: + tensorLifetimeMap[tensorName] = (0, maxStepIdx) + + return tensorLifetimeMap, tensorMap + + +class TrainingSBTiler(SBTiler): + memorySchedulerClass = TrainingMemoryScheduler diff --git a/DeeployTest/testUtils/trainingUtils.py b/DeeployTest/testUtils/trainingUtils.py new file mode 100644 index 0000000000..a3386cd7ca --- /dev/null +++ b/DeeployTest/testUtils/trainingUtils.py @@ -0,0 +1,334 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Shared helpers used by the training / optimizer code-generation entry points 
+(generateTrainingNetwork.py, testMVPTraining.py, generateOptimizerNetwork.py, +testMVPOptimizer.py). + +Four kinds of helpers live here, all strictly training-specific: + +1. inputs.npz / outputs.npz readers (``_load_reference_losses``, ``_infer_*``). +2. The singleton ``_mockScheduler`` the Tiler expects for per-node tiling. +3. Training-only argparse builders (``add_training_inference_args``, + ``add_optimizer_training_dir_arg``). +4. The core hooks invoked by ``testUtils.core.execution`` + (``resolve_optimizer_dir``, ``run_training_codegen``, + ``add_training_cmake_flags``). + +Generic helpers (``--cores`` / ``--l1`` / ``--l2`` / ``--defaultMemLevel`` / +``--memAllocStrategy`` / ``--searchStrategy`` / ``--plotMemAlloc`` / +``--profileTiling`` / ``--shouldFail`` arg definitions and the ``shouldFail`` +try/except handshake) are deliberately *not* wrapped into functions here: +they are not training-specific and belong inline in whichever entry point +needs them, consistent with the upstream inference codegen scripts. +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path +from typing import List, Optional + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.Logging import DEFAULT_LOGGER as log + +# Graph input name marker identifying gradient accumulation buffers. +_GRAD_ACC = "_grad.accumulation.buffer" + + +def _load_reference_losses(train_dir: str) -> Optional[list]: + """Load reference loss values from outputs.npz. + + Returns the list of per-mini-batch loss values if any key in + outputs.npz contains 'loss', otherwise None (with a warning). 
+ """ + outputs_path = os.path.join(train_dir, "outputs.npz") + if not os.path.exists(outputs_path): + log.warning(f"outputs.npz not found at {outputs_path} — loss comparison skipped") + return None + + try: + outputs = np.load(outputs_path) + except Exception as e: + log.warning(f"Failed to load outputs.npz: {e} — loss comparison skipped") + return None + + for key in outputs.files: + if 'loss' in key.lower(): + vals = [float(v) for v in np.array(outputs[key]).flatten().tolist()] + log.info(f"Reference losses loaded from outputs.npz['{key}']: {vals}") + return vals + + log.warning("No 'loss' key found in outputs.npz — loss comparison skipped") + return None + + +def _infer_num_data_inputs(inputs_path: str) -> int: + """Auto-detect number of data inputs from inputs.npz. + + Data inputs are the base arr_* entries that have per-mini-batch + variants (mb1_arr_*) in the npz — i.e. entries that actually change + across mini-batches. + + Raises ValueError if no mb1 entries are found (single-mini-batch case) + where the data/weight boundary cannot be determined automatically. + """ + inputs = np.load(inputs_path) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + count = sum(1 for k in base_keys if f'mb1_{k}' in inputs.files) + if count == 0: + raise ValueError("Cannot auto-detect num_data_inputs: inputs.npz has only one mini-batch " + "(no mb1_arr_* entries found). Please pass --num-data-inputs explicitly.") + return count + + +def _infer_total_mb(inputs_path: str) -> int: + """Count total mini-batches from inputs.npz. + + New format: inputs.npz contains meta_n_batches (total training mini-batches) + and meta_data_size (number of unique samples stored; C harness cycles via modulo). + + Legacy format: count 1 + number of unique mb* indices. 
+ """ + inputs = np.load(inputs_path) + if "meta_n_batches" in inputs.files: + return int(inputs["meta_n_batches"].flat[0]) + mb_indices = set() + for key in inputs.files: + if key.startswith('mb'): + try: + idx = int(key.split('_')[0][2:]) + mb_indices.add(idx) + except ValueError: + pass + return 1 + len(mb_indices) + + +def _infer_data_size(inputs_path: str) -> int: + """Return the number of unique input samples stored in inputs.npz. + + New format: reads meta_data_size. + Legacy format: same as _infer_total_mb (all batches were unique). + """ + inputs = np.load(inputs_path) + if "meta_data_size" in inputs.files: + return int(inputs["meta_data_size"].flat[0]) + return _infer_total_mb(inputs_path) + + +def _infer_n_accum(inputs_path: str) -> int: + """Return the gradient accumulation step count stored in inputs.npz. + + New format: reads meta_n_accum written by the exporter. + Legacy format: defaults to 1 (no gradient accumulation). + """ + inputs = np.load(inputs_path) + if "meta_n_accum" in inputs.files: + return int(inputs["meta_n_accum"].flat[0]) + return 1 + + +def _mockScheduler(graph: gs.Graph) -> List[List[gs.Node]]: + """Wrap every node in a singleton list for the Tiler pattern interface.""" + return [[node] for node in graph.nodes] + + +# --------------------------------------------------------------------------- +# argparse builders +# +# The four training / optimizer codegen entry points all define the same +# arguments in their __main__ blocks. These helpers add the shared groups +# to an existing parser so each entry point only has to compose the groups +# it actually needs. 
+# --------------------------------------------------------------------------- + + +def add_training_inference_args(parser: argparse.ArgumentParser) -> None: + """Arguments consumed by both training codegen entry points.""" + parser.add_argument( + "--num-data-inputs", + type = int, + dest = "num_data_inputs", + default = None, + help = "Number of DATA inputs that change per mini-batch. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--n-steps", + type = int, + dest = "n_steps", + default = None, + help = "N_TRAIN_STEPS: number of gradient-accumulation update steps. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--n-accum", + type = int, + dest = "n_accum", + default = None, + help = "N_ACCUM_STEPS: number of mini-batches per update step. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--learning-rate", + type = float, + dest = "learning_rate", + default = 0.001, + help = "SGD learning rate emitted as TRAINING_LEARNING_RATE in testinputs.h. Default: 0.001.", + ) + parser.add_argument( + "--tolerance", + type = float, + dest = "tolerance_abs", + default = 1e-3, + help = "Absolute loss tolerance emitted as TRAINING_TOLERANCE_ABS in testoutputs.h. Default: 1e-3.", + ) + + +def add_optimizer_training_dir_arg(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--training-dir", + type = str, + default = None, + help = "Directory containing the training network.onnx. When provided, " + "weight and grad-acc buffers are shared with TrainingNetwork instead " + "of being allocated independently.", + ) + + +def resolve_optimizer_dir(test_dir: str, optimizer_dir: Optional[str]) -> str: + """Return the optimizer ONNX directory for a training test. + + If ``optimizer_dir`` is explicitly set, it is returned as-is. Otherwise + fall back to ``<test_dir>/../<optimizer_name>``, where ``<optimizer_name>`` is + derived by replacing the ``_train`` suffix of the test directory's base + name with ``_optimizer`` (e.g. 
``simplemlp_train`` → ``simplemlp_optimizer``, + ``sleepconvit_train`` → ``sleepconvit_optimizer``). + """ + if optimizer_dir: + return optimizer_dir + test_path = Path(test_dir) + optimizer_name = test_path.name.replace("_train", "_optimizer") + return str(test_path.parent / optimizer_name) + + +def add_training_cmake_flags(cmd: List[str], training: bool, n_train_steps: Optional[int], n_accum_steps: Optional[int], + training_num_data_inputs: Optional[int]) -> None: + """Append -DTRAINING=ON/OFF plus any known -DN_TRAIN_STEPS / -DN_ACCUM_STEPS / + -DTRAINING_NUM_DATA_INPUTS defines to ``cmd``. In-place.""" + cmd.append(f"-DTRAINING={'ON' if training else 'OFF'}") + if not training: + return + if n_train_steps is not None: + cmd.append(f"-DN_TRAIN_STEPS={n_train_steps}") + if n_accum_steps is not None: + cmd.append(f"-DN_ACCUM_STEPS={n_accum_steps}") + if training_num_data_inputs is not None: + cmd.append(f"-DTRAINING_NUM_DATA_INPUTS={training_num_data_inputs}") + + +def run_training_codegen(config, script_dir: Path) -> None: + """Drive the two-stage training codegen pipeline for one test. + + Runs the training network codegen script (generateTrainingNetwork.py or + testMVPTraining.py) followed by the matching optimizer codegen script + (generateOptimizerNetwork.py or testMVPOptimizer.py), and writes back + any auto-detected training parameters from ``training_meta.json`` into + ``config``. + + The single entry point keeps ``testUtils.core.execution.generate_network`` + oblivious to training internals — it only has to call this and return. + + Parameters + ---------- + config : DeeployTestConfig + The test configuration (must have ``training=True``). Training + fields (``n_train_steps``, ``n_accum_steps``, + ``training_num_data_inputs``) may be updated in-place from the + training_meta.json written by the codegen script. + script_dir : Path + ``DeeployTest/`` — the directory that hosts the four codegen scripts. 
+ """ + if config.tiling: + training_script = script_dir / "testMVPTraining.py" + optimizer_script = script_dir / "testMVPOptimizer.py" + opt_passthrough = ("--cores", "--l1", "--l2", "--defaultMemLevel", "--memAllocStrategy", "--searchStrategy", + "--plotMemAlloc", "--profileTiling") + stage = "Tiled training" + else: + training_script = script_dir / "generateTrainingNetwork.py" + optimizer_script = script_dir / "generateOptimizerNetwork.py" + opt_passthrough = ("--cores", "--l1", "--l2", "--defaultMemLevel") + stage = "Training" + + # --- Step 1: Training network (forward + backward + accumulation) --- + cmd = [ + sys.executable, + str(training_script), + "-d", + config.gen_dir, + "-t", + config.test_dir, + "-p", + config.platform, + ] + if config.n_train_steps is not None: + cmd.append(f"--n-steps={config.n_train_steps}") + if config.n_accum_steps is not None: + cmd.append(f"--n-accum={config.n_accum_steps}") + if config.training_num_data_inputs is not None: + cmd.append(f"--num-data-inputs={config.training_num_data_inputs}") + if config.verbose > 0: + cmd.append("-" + "v" * config.verbose) + if config.debug: + cmd.append("--debug") + cmd.extend(config.gen_args) + + log.debug(f"[Execution] {stage} network generation command: {' '.join(cmd)}") + if subprocess.run(cmd, check = False).returncode != 0: + raise RuntimeError(f"{stage} network generation failed for {config.test_name}") + + # Read back auto-detected values written by the training generation script. 
+ meta_path = Path(config.gen_dir) / "training_meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + config.n_train_steps = meta["n_train_steps"] + config.n_accum_steps = meta["n_accum_steps"] + config.training_num_data_inputs = meta["training_num_data_inputs"] + log.info(f"[Execution] Training meta: {meta}") + + # --- Step 2: Optimizer network (SGD) --- + opt_dir = resolve_optimizer_dir(config.test_dir, config.optimizer_dir) + if not Path(opt_dir).exists(): + log.warning(f"Optimizer directory not found: {opt_dir} — skipping optimizer codegen") + return + if not optimizer_script.exists(): + log.warning(f"{optimizer_script.name} not found — skipping optimizer codegen") + return + + opt_cmd = [ + sys.executable, + str(optimizer_script), + "-d", + config.gen_dir, + "-t", + opt_dir, + "-p", + config.platform, + f"--training-dir={config.test_dir}", + ] + opt_cmd.extend(arg for arg in config.gen_args if any(arg.startswith(p) for p in opt_passthrough)) + if not any(arg.startswith("--defaultMemLevel") for arg in opt_cmd): + opt_cmd.append("--defaultMemLevel=L2") + if config.verbose > 0: + opt_cmd.append("-" + "v" * config.verbose) + + log.debug(f"[Execution] {stage} optimizer network generation command: {' '.join(opt_cmd)}") + if subprocess.run(opt_cmd, check = False).returncode != 0: + raise RuntimeError(f"{stage} optimizer network generation failed for {config.test_name}") diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py index 8fa105d9f4..7e7893b5f5 100644 --- a/DeeployTest/test_siracusa_config.py +++ b/DeeployTest/test_siracusa_config.py @@ -8,7 +8,6 @@ KERNEL_TESTS = [ "Kernels/FP32/ReLU", - "Kernels/FP32/Softmax/CrossEntropy", "Kernels/FP32/Softmax/CrossEntropyGrad", "Kernels/FP32/Softmax/Grad", "Kernels/FP32/Softmax/Regular", diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..a9eefb6d3e 100644 --- 
a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -139,7 +139,6 @@ "Models/Transformer": [60000, 30000, 15000], "Models/microLlama/microLlama1": [60000, 10000, 5000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], - "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], } @@ -153,6 +152,5 @@ "Models/microLlama/microLlama8": [60000, 20000, 10000], "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], - "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], }