diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 797bd44c47..4c647a3ab4 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -336,14 +336,14 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list - live = self._live + live = self._live or self.is_input or self.is_output queue = set(self.aliases) visited = set(self.name) while len(queue) > 0: next = queue.pop() buffNext = ctxt.lookup(next) assert isinstance(buffNext, VariableBuffer) - live |= buffNext._live + live |= buffNext._live or buffNext.is_input or buffNext.is_output visited.add(next) queue |= buffNext.aliases - visited return live diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..7ead6556b7 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -492,6 +492,22 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) +class InPlaceAccumulatorV2Layer(ONNXLayer): + """Layer for ORT InPlaceAccumulatorV2 operator (com.microsoft). + + Gradient accumulation with optional reset: + if lazy_reset_grad: out = gradient + else: out = buffer + gradient + """ + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # One conditional check + one element-wise op (copy or add) per element + return self.mapper.parser.operatorRepresentation['size'] + + class LinearAttentionLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..385eb03dff 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -2611,15 +2611,18 @@ def parseNodeCtxt(self, class SoftmaxCrossEntropyLossParser(NodeParser): + """SoftmaxCrossEntropyLoss parser. 
+ + The canonical form has two outputs: a scalar mean cross-entropy loss and + a per-sample log_prob tensor, matching the signature emitted by ONNX + Runtime when exporting training graphs. + """ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - - return ret + return all([len(node.inputs) == 2, len(node.outputs) == 2]) def parseNodeCtxt(self, ctxt: NetworkContext, @@ -2628,9 +2631,13 @@ def parseNodeCtxt(self, logits = ctxt.lookup(node.inputs[0].name) labels = ctxt.lookup(node.inputs[1].name) - log_prob = ctxt.lookup(node.outputs[0].name) + # outputs[0] = loss (0-d scalar, shape [1] after Deeploy normalisation) + # outputs[1] = log_prob tensor + loss = ctxt.lookup(node.outputs[0].name) + log_prob = ctxt.lookup(node.outputs[1].name) self.operatorRepresentation['logits'] = logits.name self.operatorRepresentation['labels'] = labels.name + self.operatorRepresentation['loss'] = loss.name self.operatorRepresentation['log_prob'] = log_prob.name self.operatorRepresentation['batch'] = logits.shape[0] self.operatorRepresentation['num_classes'] = logits.shape[1] @@ -2697,6 +2704,48 @@ def parseNodeCtxt(self, return ctxt, True +class InPlaceAccumulatorV2Parser(NodeParser): + """Parser for ORT InPlaceAccumulatorV2 operator (com.microsoft). 
+ + Semantics: + if lazy_reset_grad: out = gradient (reset) + else: out = buffer + gradient (accumulate) + + Inputs: + 0: buffer - current accumulation buffer (float tensor) + 1: gradient - new gradient to accumulate (float tensor, same shape) + 2: lazy_reset_grad - reset flag; if true, overwrite; else add (bool[1]) + + Output: + 0: output_buffer - updated accumulation buffer (float tensor) + """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # Require exactly 3 inputs (buffer, gradient, lazy_reset_grad) and 1 output + return len(node.inputs) == 3 and len(node.outputs) == 1 + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + buffer = ctxt.lookup(node.inputs[0].name) + gradient = ctxt.lookup(node.inputs[1].name) + lazy_reset_grad = ctxt.lookup(node.inputs[2].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['accum_buffer'] = buffer.name + self.operatorRepresentation['gradient'] = gradient.name + self.operatorRepresentation['lazy_reset_grad'] = lazy_reset_grad.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['size'] = int(np.prod(buffer.shape)) + + return ctxt, True + + class BatchNormParser(NodeParser): def __init__(self): diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..85453563c3 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -577,11 +577,11 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[ def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return [2**(self.input_types[0].referencedType.typeWidth)] + return [2**(self.input_types[0].referencedType.typeWidth)] * len(self.output_types) def _inferSignedness(self, inputs: 
List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: - return [False] + return [False] * len(self.output_types) class SGDChecker(SignPropTypeChecker): @@ -598,6 +598,32 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [True] +class InPlaceAccumulatorV2Checker(SignPropTypeChecker): + """Type checker for ORT InPlaceAccumulatorV2 operator (com.microsoft). + + Inputs: + 0: buffer (float32*) + 1: gradient (float32*) + 2: lazy_reset_grad (uint8_t* or bool* - 1 element) + + Output: + 0: output_buffer (float32*) + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # Output has same precision as the buffer input (float32) + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # Float32 output is signed + return [True] + + class BatchNormChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 5d7b02ae62..06674a7498 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -18,9 +18,9 @@ from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ - GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, 
RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker + GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, InPlaceAccumulatorV2Checker, LayerNormChecker, \ + MatMulChecker, MulChecker, QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, \ + RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -29,11 +29,12 @@ from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ - MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ - RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate + FloatGELUTemplate, FloatGemmTemplate, FloatInPlaceAccumulatorV2Template, FloatLayernormTemplate, \ + FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ + FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ + SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, 
TransposeTemplate, UniformRequantShiftTemplate, \ + iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -353,7 +354,8 @@ PULPSoftmaxCrossEntropyLossBindings = [ NodeBinding( - SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), + SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t), PointerClass(float32_t)]), SoftmaxCrossEntropyLossTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes ] @@ -368,6 +370,14 @@ SGDTemplate.referenceTemplate, ForkTransformer) ] +PULPInPlaceAccumulatorV2Bindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(uint8_t)], [PointerClass(float32_t)]), FloatInPlaceAccumulatorV2Template.referenceTemplate, + ForkTransformer) +] + PULPTransposeBindings = [ NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 7456dd9e1b..2413942869 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -14,17 +14,17 @@ from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ - GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ - ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ - RQSiHardswishLayer, 
SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer + GEMMLayer, InPlaceAccumulatorV2Layer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, \ + PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, \ + RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, \ + SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \ - MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \ - SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \ - SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \ - iSoftmaxParser + GELUGradParser, GELUParser, GEMMParser, InPlaceAccumulatorV2Parser, LayerNormGradParser, LayerNormParser, \ + MatMulParser, MaxPool1DParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, \ + ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \ + RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \ + SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \ + iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from 
Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -39,17 +39,17 @@ from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ - PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ - PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ - PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ - PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \ - PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ - PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ - PULPUniformRQSTilingReadyBindings + PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPInPlaceAccumulatorV2TilingReadyBindings, \ + PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, \ + PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, \ + PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, 
PULPReduceMeanTilingReadyBindings, \ + PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ + PULPRQSConv1DTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, \ + PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \ + PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ + PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -108,6 +108,7 @@ SoftmaxCrossEntropyLossGradMapper = NodeMapper(SoftmaxCrossEntropyLossGradParser(), PULPSoftmaxCrossEntropyGradTilingReadyBindings) SGDMapper = NodeMapper(SGDParser(), PULPSGDTilingReadyBindings) +InPlaceAccumulatorV2Mapper = NodeMapper(InPlaceAccumulatorV2Parser(), PULPInPlaceAccumulatorV2TilingReadyBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) @@ -151,7 +152,8 @@ 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), - 'SGD': SGDLayer([SGDMapper]) + 'SGD': SGDLayer([SGDMapper]), + 'InPlaceAccumulatorV2': InPlaceAccumulatorV2Layer([InPlaceAccumulatorV2Mapper]), } diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..ef046f191d 100644 
--- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,7 +4,8 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation @@ -19,7 +20,7 @@ def alignToContext(self, ctxt: NetworkContext, if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None: # No bias case - set C to NULL and provide a default type operatorRepresentation['C'] = None - operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type + operatorRepresentation['C_type'] = PointerClass(float32_t) # Default to fp32 type operatorRepresentation['C_batched'] = False return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py new file mode 100644 index 0000000000..f7864c7261 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PULPInPlaceAccumulatorV2Template(NodeTemplate): + """True in-place InPlaceAccumulatorV2 template for PULP. + + Writes the accumulation result into ``accum_buffer`` (the graph input). + ``data_out`` is registered as an alias of ``accum_buffer`` so the memory + allocator knows they share memory and will not free ``accum_buffer`` + prematurely. 
+ + ``data_out`` is intentionally *not* written by the emitted C code: + + - InPlaceAccumulatorV2 is terminal in the training graph — no downstream + kernel consumes ``data_out``; it only exists as a symbolic output so + the graph stays well-formed. + - In the tiled path, emitting a write to ``data_out`` would also make + Deeploy generate an L2 egress DMA for it, and ``data_out``'s L2 slot + may overlap with other live buffers, corrupting L2. + + Semantics: + if lazy_reset_grad: accum_buffer = gradient (reset) + else: accum_buffer += gradient (accumulate) + """ + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + accum_buffer = ctxt.lookup(operatorRepresentation['accum_buffer']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + accum_buffer.aliases.add(data_out.name) + data_out.aliases.add(accum_buffer.name) + data_out._alias = accum_buffer.name + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPInPlaceAccumulatorV2Template(""" +// InPlaceAccumulatorV2 (Name: ${nodeName}, Op: ${nodeOp}) +// Writes result into accum_buffer (in-place). data_out is an alias of +// accum_buffer and is deliberately not written — it has no downstream +// consumer, and emitting a write would trigger an L2 egress DMA whose +// destination may overlap with live buffers in the tiled path. 
+// Reset (lazy_reset_grad=1): accum_buffer = gradient +// Accum (lazy_reset_grad=0): accum_buffer += gradient +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (int32_t)${size}); +int32_t ${nodeName}_stop = MIN(${nodeName}_start + ${nodeName}_chunk, (int32_t)${size}); + +if (${lazy_reset_grad}[0]) { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] = ${gradient}[i]; + } +} else { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] += ${gradient}[i]; + } +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py index 1592fe30c4..d31d2c2797 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py @@ -2,9 +2,43 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from typing import List, Tuple -referenceTemplate = NodeTemplate(""" +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PULPSGDTemplate(NodeTemplate): + """In-place SGD template for PULP. + + weight_updated is aliased to weight so the memory allocator places them + at the same address in whichever memory level weight lives in (L2 or L3). + This ensures the tiled egress DMA writes the updated weight back to + weight's buffer — the same buffer the training network reads from on the + next forward pass. 
+ """ + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + weight = ctxt.lookup(operatorRepresentation['weight']) + weight_updated = ctxt.lookup(operatorRepresentation['weight_updated']) + + weight.aliases.add(weight_updated.name) + weight_updated.aliases.add(weight.name) + weight_updated._alias = weight.name + + # Make weight_updated share weight's allocation (no separate malloc), + # regardless of which memory level (L2 or L3) weight is placed in. + # The egress DMA then writes updated weights back to weight's address. + weight_updated.allocTemplate = NodeTemplate(" ${name} = (${type.typeName}) " + str(weight._instance) + ";") + weight_updated.deallocTemplate = NodeTemplate("") + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPSGDTemplate(""" // SGD Weight Update with Separated Multiplication and Subtraction Unrolling // (Name: ${nodeName}, Op: ${nodeOp}) int8_t ${nodeName}_core_id = pi_core_id(); @@ -46,4 +80,4 @@ float32_t temp_grad = learning_rate * ref_${grad}[i]; ref_${weight_updated}[i] = ref_${weight}[i] - temp_grad; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py index c1aefe01a3..914a18c3ed 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py @@ -4,27 +4,29 @@ from Deeploy.DeeployTypes import NodeTemplate +# Canonical SoftmaxCrossEntropyLoss: emits both a scalar mean loss and the +# per-sample log_prob tensor. 
referenceTemplate = NodeTemplate(""" BEGIN_SINGLE_CORE // SoftmaxCrossEntropyLoss (Name: ${nodeName}, Op: ${nodeOp}) + float32_t sce_total_loss = 0.0f; for (uint32_t i = 0; i < ${batch}; i++) { - float max_logit = ${logits}[i * ${num_classes} + 0]; + float32_t sce_max_logit = ${logits}[i * ${num_classes}]; for (uint32_t j = 1; j < ${num_classes}; j++) { - if (${logits}[i * ${num_classes} + j] > max_logit) { - max_logit = ${logits}[i * ${num_classes} + j]; - } - } - - float32_t sum_exp = 0.0f; - for (uint32_t j = 0; j < ${num_classes}; j++) { - sum_exp += expf(${logits}[i * ${num_classes} + j] - max_logit); - } - - for (uint32_t j = 0; j < ${num_classes}; j++) { - // log_prob = logit - max_logit - log(sum_exp) - ${log_prob}[i * ${num_classes} + j] = ${logits}[i * ${num_classes} + j] - max_logit - logf(sum_exp); + if (${logits}[i * ${num_classes} + j] > sce_max_logit) + sce_max_logit = ${logits}[i * ${num_classes} + j]; } + float32_t sce_sum_exp = 0.0f; + for (uint32_t j = 0; j < ${num_classes}; j++) + sce_sum_exp += expf(${logits}[i * ${num_classes} + j] - sce_max_logit); + float32_t sce_log_sum_exp = logf(sce_sum_exp); + for (uint32_t j = 0; j < ${num_classes}; j++) + ${log_prob}[i * ${num_classes} + j] = + ${logits}[i * ${num_classes} + j] - sce_max_logit - sce_log_sum_exp; + sce_total_loss += -(${logits}[i * ${num_classes} + (uint32_t)(${labels}[i])] + - sce_max_logit - sce_log_sum_exp); } + ${loss}[0] = sce_total_loss / (float32_t)${batch}; END_SINGLE_CORE """) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py new file mode 100644 index 0000000000..fb2b4bde78 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as 
np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class InPlaceAccumulatorV2TileConstraint(BOPTileConstraint): + """Tile constraint for InPlaceAccumulatorV2. + + Tiles accum_buffer and gradient together (same shape); lazy_reset_grad + is a scalar (1 element) and is not tiled. + """ + + dataIn1Name = 'accum_buffer' + dataIn2Name = 'gradient' + dataOutName = 'data_out' + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + tilerModel = super().addGeometricalConstraint(tilerModel, parseDict, ctxt) + + # lazy_reset_grad is a scalar flag — pin full size so it is not tiled. 
+ lazyResetName = parseDict['lazy_reset_grad'] + tilerModel.addTensorDimToModel(ctxt, lazyResetName) + shape = ctxt.lookup(lazyResetName).shape + dims = [shape] if isinstance(shape, int) else shape + for idx, dim in enumerate(dims): + dimVar = tilerModel.getTensorDimVar(lazyResetName, idx) + tilerModel.addConstraint(dimVar == dim) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName, 'lazy_reset_grad'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + outputBaseOffsets[cls.dataOutName] = inputBaseOffsets[cls.dataIn1Name] + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + lazyResetShape = ctxt.lookup(operatorRepresentation['lazy_reset_grad']).shape + lazyResetDims = (lazyResetShape,) if isinstance(lazyResetShape, int) else tuple(lazyResetShape) + lazyResetCube = HyperRectangle((0,) * len(lazyResetDims), lazyResetDims) + + inputLoadSchedule = [{ + cls.dataIn1Name: cube, + cls.dataIn2Name: cube, + 'lazy_reset_grad': lazyResetCube, + } for cube in outputCubes] + outputLoadSchedule = [{cls.dataOutName: out} for out in outputCubes] + + for cube in outputCubes: + replacements["size"].append(int(np.prod(cube.dims))) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py 
b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py index 38c984de63..78957136e5 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy from typing import Dict, List, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar @@ -17,10 +18,18 @@ class SoftmaxCrossEntropyTileConstraint(TileConstraint): + """TileConstraint for SoftmaxCrossEntropyLoss (2 outputs: loss + log_prob). + + Both batch and num_classes are pinned to their full size by + addPolicyConstraint, so SCE itself is never tiled — the sole purpose of + the wrapTilingSolution override is to bypass the base-class single-output + assertion and carry the scalar loss buffer through the DMA schedule. + """ dataIn1Name = 'logits' dataIn2Name = 'labels' dataOutName = 'log_prob' + dataLossName = 'loss' @classmethod def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -108,8 +117,53 @@ def serializeTilingSolution( return variableReplacementSchedule, tilingSchedule + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + """Override the base-class single-output wrapper. + + SoftmaxCrossEntropyLoss emits two outputs (loss + log_prob) but the + base-class wrapTilingSolution asserts exactly one. We run the base + wrapper on a log_prob-only slice of the tiling solution and then patch + the scalar loss address / rectangle back into each resulting schedule. + + Grad subclasses that do not have a scalar loss output fall straight + through to the base-class behaviour. 
+ """ + lossVar = operatorRepresentation.get(cls.dataLossName, '') + + # No scalar loss output (e.g. Grad subclass) — plain base-class path. + if not lossVar or lossVar not in tilingSolution.outputTensorMemoryConstraints: + return super().wrapTilingSolution(tilingSolution, targetMemLevel, ctxt, operatorRepresentation) + + # Log_prob-only slice of the tiling solution so the single-output + # assertion in the base class passes. + logProbVar = operatorRepresentation[cls.dataOutName] + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + logProbVar: tilingSolution.outputTensorMemoryConstraints[logProbVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Patch the scalar loss into each schedule's output list. + lossAddr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, lossVar) + if lossAddr == [None]: + return varReplacement, tilingSchedules + + lossRect = HyperRectangle((0,), (1,)) + for schedule in tilingSchedules: + schedule.outputBaseOffsets[cls.dataLossName] = lossAddr + for step in schedule.outputLoadSchedule: + step[cls.dataLossName] = lossRect + + return varReplacement, tilingSchedules + class SoftmaxCrossEntropyGradTileConstraint(SoftmaxCrossEntropyTileConstraint): dataIn1Name = 'log_prob' dataIn2Name = 'labels' dataOutName = 'grad' + dataLossName = '' # no scalar loss output — fall through to base wrapper diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 901106459e..cc9b4e0ca4 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -16,13 +16,14 @@ from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, 
PULPFloatGEMMBindings, \ - PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ - PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \ - PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \ - PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \ - PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \ - PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, \ - PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings + PULPGatherBindings, PULPiHardswishBindings, PULPInPlaceAccumulatorV2Bindings, PULPiRMSNormBindings, \ + PULPiRQSGELUBindings, PULPLayernormBinding, PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, \ + PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, \ + PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, \ + PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \ + PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ + PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \ RQConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ @@ -30,6 +31,8 @@ from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GeluTileConstraint import GeluGradTileConstraint 
from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.InPlaceAccumulatorV2TileConstraint import \ + InPlaceAccumulatorV2TileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import SoftmaxGradTileConstraint, \ iSoftmaxTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormGradTileConstraint, \ @@ -155,6 +158,9 @@ PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, tileConstraint = SGDTileConstraint()) +PULPInPlaceAccumulatorV2TilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPInPlaceAccumulatorV2Bindings, tileConstraint = InPlaceAccumulatorV2TileConstraint()) + PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, tileConstraint = SliceTileConstraint()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 2186d4d4c4..3a583fd452 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -333,7 +333,8 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, if _buffer._memoryLevel != memoryLevel: continue - if hasattr(_buffer, "_alias") and ctxt.is_global(_buffer._alias): + if hasattr(_buffer, "_alias") and ctxt.is_global( + _buffer._alias) and _buffer._alias not in blockNames: continue if hasattr(_buffer, "_alias") and _buffer._alias in blockNames: @@ -398,11 +399,24 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor environment variable to be set to the installation directory. """ + blockNames = {block.name for block in memoryMap} + + # In-place alias outputs whose target is in the same memoryMap share + # storage with the target — skip them from the MiniMalloc CSV (it + # rejects size-0 entries) and copy their addrSpace from the target + # after the solver runs. 
+ aliasBlocks = { + block.name for block in memoryMap if getattr(ctxt.lookup(block.name), "_alias", None) in blockNames + } + with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") writer.writerow(["id", "lower", "upper", "size"]) for memoryBlock in memoryMap: + if memoryBlock.name in aliasBlocks: + continue + _buffer = ctxt.lookup(memoryBlock.name) if nodeMemoryConstraint is None: _bufferSize = _buffer.size if isinstance( @@ -452,6 +466,14 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor if memoryBlock.name == row[0]: memoryBlock._addrSpace = (int(row[-1]), int(row[-1]) + int(row[-2])) + # Resolve skipped alias blocks: copy addrSpace from the alias target. + targetBlocks = {block.name: block for block in memoryMap} + for memoryBlock in memoryMap: + if memoryBlock.name not in aliasBlocks: + continue + target = targetBlocks.get(ctxt.dealiasBuffer(memoryBlock.name)) + memoryBlock._addrSpace = target._addrSpace if target is not None else (0, 0) + return memoryMap def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..3d6480d5f9 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -6,8 +6,16 @@ include_directories(${GENERATED_SOURCE}) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) -target_link_libraries(network PUBLIC deeploylib) +if(TRAINING) + add_library(training_network OBJECT ${GENERATED_SOURCE}/TrainingNetwork.c) + target_link_libraries(training_network PUBLIC deeploylib) + # Optimizer network (SGD kernel, compiled separately to allow different prefix) + add_library(optimizer_network OBJECT ${GENERATED_SOURCE}/OptimizerNetwork.c) + target_link_libraries(optimizer_network PUBLIC deeploylib) +else() + add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) + 
target_link_libraries(network PUBLIC deeploylib) +endif() if(platform STREQUAL MemPool) add_subdirectory(Platforms/MemPool) @@ -29,7 +37,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) ) if (NOT HEXLIST) - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() else() gvsoc_flags_add_files_to_hyperflash(GVSOC_HEX_HYPERFLASH_FLAGS HEXLIST) list(APPEND GVSOC_EXTRA_FLAGS ${GVSOC_HEX_HYPERFLASH_FLAGS}) @@ -37,9 +50,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) # SCHEREMO: Waive warnings # Pointer sign warnings are caused by the data width abstraction used in Deeploy. Signedness is not explicitly modelled, as this is handled by kernels - target_compile_options(network PRIVATE - -Wno-pointer-sign - ) + if(TRAINING) + target_compile_options(training_network PRIVATE -Wno-pointer-sign) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign) + endif() if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) add_subdirectory(Platforms/Siracusa) @@ -61,7 +77,12 @@ elseif(platform STREQUAL GAP9) if (NOT HEXLIST) # L2 mode: No flash/readfs files # Data lives in L2 memory only - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() message(STATUS "[Deeploy GAP9] L2 mode: No hex files found, -DNOFLASH set") message(STATUS "[Deeploy GAP9] If you expect L3 mode, ensure Python codegen created hex files in ${GENERATED_SOURCE}/hex/") else() @@ -77,5 +98,13 @@ elseif(platform STREQUAL GAP9) message(STATUS "GAPY_RUNNER_ARGS: ${GAPY_RUNNER_ARGS}") endif() + # Waive warnings in generated code + if(TRAINING) + 
target_compile_options(training_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + endif() + add_subdirectory(Platforms/GAP9) endif() diff --git a/DeeployTest/Platforms/Siracusa/CMakeLists.txt b/DeeployTest/Platforms/Siracusa/CMakeLists.txt index 45e6191490..28ac5131f2 100644 --- a/DeeployTest/Platforms/Siracusa/CMakeLists.txt +++ b/DeeployTest/Platforms/Siracusa/CMakeLists.txt @@ -1,19 +1,46 @@ # SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna -# # SPDX-License-Identifier: Apache-2.0 set(ProjectId ${TESTNAME}) -file(GLOB_RECURSE SOURCES - src/CycleCounter.c - src/deeploytest.c -) +option(TRAINING "Use training harness instead of inference harness" OFF) + +# Compile-time training parameters (override via -D on cmake command line) +set(N_TRAIN_STEPS "1" CACHE STRING "Number of optimizer steps") +set(N_ACCUM_STEPS "1" CACHE STRING "Number of mini-batches per optimizer step") +set(TRAINING_NUM_DATA_INPUTS "2" CACHE STRING "Number of data inputs per mini-batch") + +if(TRAINING) + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytraintest.c + ) + set(NETWORK_LIB training_network) +else() + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c + ) + set(NETWORK_LIB network) +endif() add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) -target_link_libraries(${ProjectId} PRIVATE network deeploylib) -target_compile_options(${ProjectId} INTERFACE network) -add_gvsoc_emulation(${ProjectId} "siracusa") +if(TRAINING) + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} optimizer_network deeploylib) +else() + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} deeploylib) +endif() +target_compile_options(${ProjectId} INTERFACE ${NETWORK_LIB}) +if(TRAINING) + 
target_compile_definitions(${ProjectId} PRIVATE + N_TRAIN_STEPS=${N_TRAIN_STEPS} + N_ACCUM_STEPS=${N_ACCUM_STEPS} + TRAINING_NUM_DATA_INPUTS=${TRAINING_NUM_DATA_INPUTS} + ) +endif() + +add_gvsoc_emulation(${ProjectId} "siracusa") link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c new file mode 100644 index 0000000000..50eb34d748 --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c @@ -0,0 +1,385 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Training harness for Siracusa — Phase 2 (with Deeploy-compiled + * OptimizerNetwork) + * + * Loop structure: + * + * InitTrainingNetwork() + * InitOptimizerNetwork() + * Connect optimizer buffers → training network's weight/grad buffers + * + * for update_step in [0, N_TRAIN_STEPS): // optimizer steps + * for accum_step in [0, N_ACCUM_STEPS): // mini-batches per update + * lazy_reset_grad = (accum_step == 0) // reset on first, + * accumulate on rest load data for this mini-batch RunTrainingNetwork() // fwd + * + bwd + InPlaceAccumulatorV2 store loss value + * // SGD weight update via Deeploy-compiled optimizer kernel: + * copy weights + grad_acc → optimizer input buffers + * RunOptimizerNetwork() + * copy weight_updated ← optimizer output buffers → training weight + * buffers + * + * Numerical verification: + * - Compare stored loss values against testLossRef[] (from testoutputs.h) + * + * Buffer layout in DeeployNetwork_inputs[] (must match ONNX input order): + * [0 .. TRAINING_NUM_DATA_INPUTS-1] data + labels (per + * mini-batch) [TRAINING_NUM_DATA_INPUTS .. + * .. TRAINING_GRAD_BUF_START_IDX-1] weights (persistent) + * [TRAINING_GRAD_BUF_START_IDX .. + * .. 
+TRAINING_NUM_GRAD_INPUTS-1] grad accumulation bufs
+ * (persistent) [DeeployNetwork_num_inputs-1] lazy_reset_grad
+ * uint8
+ *
+ * Optimizer buffer layout in DeeployOptNetwork_inputs[] (interleaved pairs):
+ * [2*i] weight_i (copied from
+ * DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i]) [2*i+1] grad_acc_i (copied
+ * from DeeployNetwork_inputs[TRAINING_GRAD_BUF_START_IDX+i])
+ * DeeployOptNetwork_outputs[i] = weight_i_updated
+ * → copied back to DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i]
+ *
+ * Compile-time constants (emitted by code generator into testinputs.h):
+ * N_TRAIN_STEPS number of optimizer (weight-update) steps
+ * N_ACCUM_STEPS number of mini-batches accumulated per update
+ * TRAINING_NUM_DATA_INPUTS inputs that change each mini-batch (data +
+ * labels) TRAINING_GRAD_BUF_START_IDX first grad acc buffer index in
+ * DeeployNetwork_inputs[] TRAINING_NUM_GRAD_INPUTS number of grad
+ * accumulation buffers (== number of weights) TRAINING_NUM_WEIGHT_INPUTS number
+ * of trainable weight buffers TRAINING_LEARNING_RATE SGD learning rate (for
+ * reference — embedded in optimizer ONNX)
+ *
+ * Reference comparison constants (emitted into testoutputs.h):
+ * N_LOSS_REFS number of reference loss values
+ * NUM_WEIGHT_REFS number of reference weight tensors
+ * TRAINING_TOLERANCE_ABS absolute comparison tolerance
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "CycleCounter.h"
+#include "OptimizerNetwork.h"
+#include "TrainingNetwork.h"
+#include "dory_mem.h"
+#include "pmsis.h"
+#include "testinputs.h"
+#include "testoutputs.h"
+
+/* Helper: true when ptr is in L2 (CPU-accessible); false when in L3 (external
+ * RAM) */
+#define IS_L2(ptr) ((uint32_t)(ptr) >= 0x10000000u)
+
+/* -------------------------------------------------------------------------
+ * Compile-time defaults — override via CMake target_compile_definitions
+ * ---------------------------------------------------------------------- */
+
+#ifndef N_TRAIN_STEPS
+#define 
N_TRAIN_STEPS 1 +#endif + +#ifndef N_ACCUM_STEPS +#define N_ACCUM_STEPS 1 +#endif + +#ifndef TRAINING_NUM_DATA_INPUTS +#define TRAINING_NUM_DATA_INPUTS 2 +#endif + +#define MAINSTACKSIZE 12000 +#define SLAVESTACKSIZE 3800 + +/* ------------------------------------------------------------------------- + * Cluster device + * ---------------------------------------------------------------------- */ + +struct pi_device cluster_dev; + +/* ------------------------------------------------------------------------- + * Loss storage (one value per forward pass) + * ---------------------------------------------------------------------- */ + +#define TOTAL_FWD_PASSES (N_TRAIN_STEPS * N_ACCUM_STEPS) +static float stored_losses[TOTAL_FWD_PASSES]; + +/* ------------------------------------------------------------------------- + * L3-aware memory transfer: handles all combinations of L2/L3 src and dst + * ---------------------------------------------------------------------- */ + +static void l3_aware_copy(void *dst, const void *src, uint32_t bytes) { + if (IS_L2(dst) && IS_L2(src)) { + memcpy(dst, src, bytes); + } else if (IS_L2(dst)) { + /* L3 → L2 */ + ram_read(dst, (void *)src, bytes); + } else if (IS_L2(src)) { + /* L2 → L3 */ + ram_write(dst, (void *)src, bytes); + } else { + /* L3 → L3: stage through a temporary L2 buffer */ + void *tmp = pi_l2_malloc(bytes); + ram_read(tmp, (void *)src, bytes); + ram_write(dst, tmp, bytes); + pi_l2_free(tmp, bytes); + } +} + +static void run_optimizer_step(void) { +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + /* --- Step A: copy current weights + grad acc → optimizer input buffers --- + * Skipped when codegen has shared the buffers (pointer equality test). 
*/ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t train_g_idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + wi; + uint32_t opt_w_in = 2u * wi; + uint32_t opt_g_in = 2u * wi + 1u; + + if (DeeployOptNetwork_inputs[opt_w_in] != + DeeployNetwork_inputs[train_w_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_w_in], + DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_inputs_bytes[opt_w_in]); + } + if (DeeployOptNetwork_inputs[opt_g_in] != + DeeployNetwork_inputs[train_g_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_g_in], + DeeployNetwork_inputs[train_g_idx], + DeeployOptNetwork_inputs_bytes[opt_g_in]); + } + } + + /* --- Step B: run optimizer kernel on cluster --- */ + struct pi_cluster_task opt_task; + pi_cluster_task(&opt_task, RunOptimizerNetwork, NULL); + opt_task.stack_size = MAINSTACKSIZE; + opt_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &opt_task); + + /* --- Step C: copy weight_updated back to training network's weight buffers + * --- Skipped when codegen has shared the output buffer with the training + * input. */ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t opt_w_out = wi; + + if (DeeployOptNetwork_outputs[opt_w_out] == + DeeployNetwork_inputs[train_w_idx]) { + continue; /* in-place: training buffer already updated */ + } + + uint32_t opt_bytes = DeeployOptNetwork_outputs_bytes[opt_w_out]; + uint32_t train_bytes = DeeployNetwork_inputs_bytes[train_w_idx]; + if (opt_bytes == train_bytes) { + l3_aware_copy(DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_outputs[opt_w_out], opt_bytes); + } else { + /* Broadcasted bias: fill every tile with updated value. */ + for (uint32_t off = 0; off < train_bytes; off += opt_bytes) { + uint32_t chunk = + (off + opt_bytes <= train_bytes) ? 
opt_bytes : (train_bytes - off); + l3_aware_copy((char *)DeeployNetwork_inputs[train_w_idx] + off, + DeeployOptNetwork_outputs[opt_w_out], chunk); + } + } + } +#endif /* TRAINING_NUM_WEIGHT_INPUTS */ +} + +/* ------------------------------------------------------------------------- + * Numerical comparison helpers — run on cluster (FC has no FPU) + * ---------------------------------------------------------------------- */ + +typedef struct { + float *computed; + float *reference; + uint32_t n; + uint32_t *err_count; +} LossCompareArgs; + +static void CompareLossesOnCluster(void *args) { + if (pi_core_id() != 0) + return; + LossCompareArgs *a = (LossCompareArgs *)args; + float tol = TRAINING_TOLERANCE_ABS; /* read on cluster — has FPU */ + uint32_t errors = 0; + for (uint32_t i = 0; i < a->n; i++) { + float diff = a->computed[i] - a->reference[i]; + if (diff < 0.0f) + diff = -diff; + printf(" [loss %u] computed=%.6f ref=%.6f diff=%.6f TOL=%.6f\r\n", i, + (double)a->computed[i], (double)a->reference[i], (double)diff, + (double)tol); + if (diff > tol) { + errors++; + } + } + *a->err_count = errors; +} + +/* ------------------------------------------------------------------------- + * main + * ---------------------------------------------------------------------- */ + +int main(void) { + + printf("=== Siracusa Training Harness (Phase 2 — with OptimizerNetwork) " + "===\r\n"); + printf("N_TRAIN_STEPS=%u N_ACCUM_STEPS=%u DATA_INPUTS=%u\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS, + (unsigned)TRAINING_NUM_DATA_INPUTS); + + struct pi_cluster_conf conf; + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return -1; + +#ifndef NOFLASH + mem_init(); + open_fs(); +#endif + + struct pi_cluster_task cluster_task; + + /* ------------------------------------------------------------------ + * Init training network + * ------------------------------------------------------------------ */ + + 
printf("Initializing TrainingNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ------------------------------------------------------------------ + * Zero-initialise gradient accumulation buffers. + * ------------------------------------------------------------------ */ + + for (uint32_t _gi = 0; _gi < (uint32_t)TRAINING_NUM_GRAD_INPUTS; _gi++) { + uint32_t _idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + _gi; + uint32_t bytes = DeeployNetwork_inputs_bytes[_idx]; + void *buf = DeeployNetwork_inputs[_idx]; + if (IS_L2(buf)) { + memset(buf, 0, bytes); + } else { + /* Write zeros into L3 via DMA using a temporary L2 zero page */ + uint8_t *zero_page = pi_l2_malloc(512); + memset(zero_page, 0, 512); + for (uint32_t off = 0; off < bytes; off += 512) { + uint32_t chunk = (off + 512 <= bytes) ? 512 : (bytes - off); + ram_write((char *)buf + off, zero_page, chunk); + } + pi_l2_free(zero_page, 512); + } + } + + /* ------------------------------------------------------------------ + * Init optimizer network + * ------------------------------------------------------------------ */ + + printf("Initializing OptimizerNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitOptimizerNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + uint32_t reset_idx = DeeployNetwork_num_inputs - 1; + + /* ------------------------------------------------------------------ + * Copy initial weights into network input buffers. + * (InitTrainingNetwork only malloc's them; testInitWeights[] holds + * the actual starting values from inputs.npz.) 
+ * ------------------------------------------------------------------ */ + +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + l3_aware_copy(DeeployNetwork_inputs[idx], testInitWeights[wi], + DeeployNetwork_inputs_bytes[idx]); + } +#endif + + printf("Starting training (%u optimizer steps x %u accum steps)...\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS); + + for (uint32_t update_step = 0; update_step < N_TRAIN_STEPS; update_step++) { + + for (uint32_t accum_step = 0; accum_step < N_ACCUM_STEPS; accum_step++) { + + uint32_t mb = update_step * N_ACCUM_STEPS + accum_step; + + printf(" update %u/%u accum %u/%u (mini-batch %u)\r\n", + update_step + 1, (unsigned)N_TRAIN_STEPS, accum_step + 1, + (unsigned)N_ACCUM_STEPS, mb); + + /* ① Set lazy_reset_grad. */ + { + void *reset_ptr = DeeployNetwork_inputs[reset_idx]; + uint8_t reset_val = (accum_step == 0) ? 1u : 0u; + if (IS_L2(reset_ptr)) { + *((uint8_t *)reset_ptr) = reset_val; + } else { + ram_write(reset_ptr, &reset_val, sizeof(uint8_t)); + } + } + + /* ② Load this mini-batch's data + labels (cycle through unique samples). + */ + for (uint32_t buf = 0; buf < TRAINING_NUM_DATA_INPUTS; buf++) { + l3_aware_copy(DeeployNetwork_inputs[buf], + testDataVector[mb % TRAINING_DATA_SIZE][buf], + DeeployNetwork_inputs_bytes[buf]); + } + + /* ③ Forward + backward + InPlaceAccumulatorV2. */ + pi_cluster_task(&cluster_task, RunTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ④ Store loss — use memcpy to avoid float registers on FC (no FPU). 
*/ + { + void *loss_ptr = DeeployNetwork_outputs[0]; + if (IS_L2(loss_ptr)) { + memcpy(&stored_losses[mb], loss_ptr, sizeof(float)); + } else { + ram_read(&stored_losses[mb], loss_ptr, sizeof(float)); + } + } + + } /* end accum_step loop */ + + /* ⑤ SGD weight update via Deeploy-compiled OptimizerNetwork. */ + run_optimizer_step(); + + } /* end update_step loop */ + + /* ------------------------------------------------------------------ + * Numerical verification — run on cluster (FC has no FPU) + * ------------------------------------------------------------------ */ + + uint32_t loss_err_count = 0; + uint32_t total_loss_checks = + (TOTAL_FWD_PASSES < N_LOSS_REFS) ? TOTAL_FWD_PASSES : N_LOSS_REFS; + LossCompareArgs loss_cmp_args = { + .computed = stored_losses, + .reference = (float *)testLossRef, + .n = total_loss_checks, + .err_count = &loss_err_count, + }; + pi_cluster_task(&cluster_task, CompareLossesOnCluster, &loss_cmp_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + printf("Errors: %u out of %u\r\n", (unsigned)loss_err_count, + (unsigned)total_loss_checks); + + return loss_err_count == 0 ? 
0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz deleted file mode 100644 index b51a843019..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx deleted file mode 100644 index 4e132a326b..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz deleted file mode 100644 index fede142f83..0000000000 Binary files a/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz and /dev/null differ diff --git a/DeeployTest/deeployTrainingRunner_siracusa.py b/DeeployTest/deeployTrainingRunner_siracusa.py new file mode 100644 index 0000000000..c13cc31411 --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = False)) diff --git a/DeeployTest/deeployTrainingRunner_tiled_siracusa.py b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py new file mode 100644 index 0000000000..3509fc04fe --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = True)) diff --git a/DeeployTest/generateOptimizerNetwork.py 
b/DeeployTest/generateOptimizerNetwork.py
new file mode 100644
index 0000000000..d13b29505e
--- /dev/null
+++ b/DeeployTest/generateOptimizerNetwork.py
@@ -0,0 +1,148 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+"""
+Optimizer network code-generation entry point.
+
+Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits
+OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory.
+
+The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the
+default ``DeeployNetwork_``) so that it can be linked together with the
+training network without symbol conflicts.
+
+Usage
+-----
+    /usr/bin/python generateOptimizerNetwork.py \\
+        -t <test_dir> \\      # directory containing network.onnx
+        -d <dump_dir> \\      # where to write OptimizerNetwork.c/h
+        -p Siracusa \\
+        --cores 8 \\
+        --lr 0.001
+"""
+
+import os
+import sys
+from pathlib import Path
+
+import onnx
+import onnx_graphsurgeon as gs
+from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork
+from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
+from testUtils.testRunner import TestGeneratorArgumentParser
+from testUtils.trainingUtils import add_optimizer_training_dir_arg
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
+from Deeploy.DeeployTypes import _NoVerbosity
+from Deeploy.Logging import DEFAULT_LOGGER as log
+from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
+from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper
+from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel
+from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine
+
+
+def generateOptimizerNetwork(args):
+    log.debug("Arguments: %s", args)
+
+    # 1. 
Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). + graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create and prepare deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployOptimizerNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + # Set up memory hierarchy so AnnotateDefaultMemoryLevel assigns the correct + # memory level to ConstantBuffers (weights). The optimizer graph is NOT + # tiled, but it must share the same memory-level view as the training graph + # so that weights end up in the same physical location (L2 when L3 is the + # training default, see AnnotateDefaultMemoryLevel). 
+ L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + defaultTargetMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel) + deployer = MemoryDeployerWrapper(deployer, [AnnotateDefaultMemoryLevel(memoryHierarchy)]) + + verbosityCfg = _NoVerbosity + _ = deployer.prepare(verbosityCfg) + + # 5. Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[OptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[OptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 6. 
Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok = True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Optimizer network code generated in: {args.dumpdir}") + print(f"[OptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Optimizer Network Code Generation.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + parser.add_argument( + "--lr", + type = float, + default = 0.001, + help = "Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. 
Default: L2.") + add_optimizer_training_dir_arg(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateOptimizerNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mOptimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/generateTrainingNetwork.py b/DeeployTest/generateTrainingNetwork.py new file mode 100644 index 0000000000..febd95afdb --- /dev/null +++ b/DeeployTest/generateTrainingNetwork.py @@ -0,0 +1,238 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import sys + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ + _infer_total_mb, _load_reference_losses, add_training_inference_args +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine + + +def generateTrainingNetwork(args): + log.debug("Arguments: %s", args) + + # 1. Load network.onnx (training graph) + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # 1a. Handle UNDEFINED-typed outputs in training ONNX graphs. + # Backward pass ONNX often doesn't propagate types for gradient outputs. 
+ # (i) Strip UNDEFINED-typed outputs that have no consumers. + # (ii) Patch UNDEFINED-typed outputs WITH consumers to float32 (training default). + _stripped = False + _patched = False + for node in graph.nodes: + filtered = [out for out in node.outputs if not (out.dtype == 0 and len(out.outputs) == 0)] + if len(filtered) < len(node.outputs): + node.outputs = filtered + _stripped = True + for out in node.outputs: + if out.dtype == 0 and len(out.outputs) > 0: + out.dtype = np.dtype(np.float32) + _patched = True + if _stripped: + graph.cleanup() + log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes") + if _patched: + log.debug("Patched UNDEFINED-typed outputs with consumers to float32") + + # 2. Load inputs.npz (new format: no grad acc buf entries) + inputs_path = f'{args.dir}/inputs.npz' + inputs = np.load(inputs_path) + + # 3. Platform setup + platform, signProp = mapPlatform(args.platform) + + log.debug(f"Platform: {platform} (sign: {signProp})") + + # Set cores on cluster engines (same pattern as generateNetwork.py) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 4. Identify grad acc buf positions in the ONNX graph. + graph_input_names = [inp.name for inp in onnx_graph.graph.input] + grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n} + non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set] + + # Base npz arrays: keys that are neither per-mb entries (mb*) nor metadata (meta_*) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + npz_base = [inputs[k] for k in base_keys] + + if len(npz_base) != len(non_grad_indices): + raise ValueError(f"inputs.npz has {len(npz_base)} base entries but network.onnx has " + f"{len(non_grad_indices)} non-grad-buf inputs. 
" + f"Re-generate inputs.npz with the updated exporter.") + + # Build inputTypes / inputOffsets for ALL graph input positions. + inputTypes = {} + inputOffsets = {} + + npz_idx = 0 + for graph_idx in range(len(graph_input_names)): + if graph_idx in grad_acc_set: + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + arr = npz_base[npz_idx] + npz_idx += 1 + + if arr.dtype == bool or arr.dtype == np.bool_: + inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif arr.dtype in (np.float32, np.float64): + # Float32 training parameters always stay float32. + # inferTypeAndOffset would misclassify integer-valued floats + # (e.g. LayerNorm gamma=1.0 / beta=0.0) as int8_t. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif np.prod(arr.shape) == 0: + # Zero-sized input (ONNX allows shape (0, ...) for optional + # placeholders). No data to infer from, but downstream still + # looks up input_{idx} by key, so populate with a trivial default. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + values = arr.reshape(-1).astype(np.float32) + _type, offset = inferTypeAndOffset(values, signProp = False) + inputTypes[f"input_{graph_idx}"] = _type + inputOffsets[f"input_{graph_idx}"] = offset + + # 5. Create deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployTrainingNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + log.debug(f"Deployer: {deployer}") + + # 6. Prepare deployer + verbosityCfg = _NoVerbosity + + _ = deployer.prepare(verbosityCfg) + + # 7. Resolve num_data_inputs, n_steps, n_accum (auto-detect when not given). 
+ + # num_data_inputs: detect from npz mb1 variants if not specified + num_data = args.num_data_inputs + if num_data is None: + num_data = _infer_num_data_inputs(inputs_path) + log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz") + + # n_steps / n_accum: derive from inputs.npz mini-batch count if not specified + n_steps = args.n_steps + n_accum = args.n_accum + if n_steps is None or n_accum is None: + total_mb = _infer_total_mb(inputs_path) + log.info(f"Auto-detected total_mb={total_mb} from inputs.npz") + if n_steps is None and n_accum is None: + n_accum = _infer_n_accum(inputs_path) + n_steps = max(1, total_mb // n_accum) + elif n_steps is None: + n_steps = max(1, total_mb // n_accum) + else: + n_accum = max(1, total_mb // n_steps) + + log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}") + + # 8. Build unique_mb_data from npz (only data_size unique samples). + # The C harness cycles through them via mb % TRAINING_DATA_SIZE. + total_mb = n_steps * n_accum + data_size = _infer_data_size(inputs_path) + log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}") + mb0_data = list(npz_base[:num_data]) + + unique_mb_data = [] + for mb in range(data_size): + if mb == 0: + unique_mb_data.append(mb0_data) + else: + mb_row = [] + for buf_idx in range(num_data): + key = f"mb{mb}_arr_{buf_idx:04d}" + mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx]) + unique_mb_data.append(mb_row) + + # Grad acc buf info for testinputs.h. + if grad_acc_set: + sorted_grad = sorted(grad_acc_set) + grad_buf_start_idx = sorted_grad[0] + else: + grad_buf_start_idx = -1 + num_grad_inputs = len(grad_acc_set) + + # Initial weight arrays: npz_base[num_data .. grad_buf_start_idx-1] + if grad_buf_start_idx > num_data: + init_weights = list(npz_base[num_data:grad_buf_start_idx]) + else: + init_weights = [] + + # 9. Load reference loss from outputs.npz. + reference_losses = _load_reference_losses(args.dir) + + # 10. 
Generate all output files + os.makedirs(args.dumpdir, exist_ok = True) + + generateTrainingTestNetwork(deployer, + unique_mb_data, + args.dumpdir, + verbosityCfg, + n_steps = n_steps, + n_accum = n_accum, + num_data_inputs = num_data, + grad_buf_start_idx = grad_buf_start_idx, + num_grad_inputs = num_grad_inputs, + learning_rate = args.learning_rate, + reference_losses = reference_losses, + init_weights = init_weights, + data_size = data_size, + tolerance_abs = args.tolerance_abs) + + # 11. Write resolved config for execution.py to pick up after subprocess call. + meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent = 2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Training Code Generation Utility.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + add_training_inference_args(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTrainingNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTraining network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPOptimizer.py b/DeeployTest/testMVPOptimizer.py new file mode 100644 index 0000000000..3a94bf8e48 --- /dev/null +++ b/DeeployTest/testMVPOptimizer.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Tiled optimizer network code-generation entry point. 
+ +Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits +OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory, +using the SB-Tiler to tile SGD kernels through L1. + +The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the +default ``DeeployNetwork_``) so that it can be linked together with the +training network without symbol conflicts. + +Usage +----- + /usr/bin/python testMVPOptimizer.py \\ + -t \\ # directory containing network.onnx + -d \\ # where to write OptimizerNetwork.c/h + -p Siracusa \\ + --cores 8 \\ + --l1 64000 \\ + --l2 1024000 \\ + --defaultMemLevel L2 +""" + +import hashlib +import os +import sys +from pathlib import Path + +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler +from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import CodeGenVerbosity, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def generateTiledOptimizerNetwork(args) -> None: + log.debug("Arguments: %s", args) + + # 1. 
Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). + graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create deployer with _mockScheduler (required for TilerDeployerWrapper). + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployOptimizerNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + # 5. Set up memory hierarchy. + # Tiles execute in L1; optimizer I/O (weights, grads) live in L2 (or L3). + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64_000_000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + defaultTargetMemLevel = L1 + defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + # 6. Wrap with memory-level annotation. 
+ deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel) + deployer = MemoryDeployerWrapper(deployer, [ + AnnotateIOMemoryLevel(defaultIoMemLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy), + ]) + + # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed). + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + + # TrainingSBTiler extends all input buffer lifetimes to the end of the + # schedule (via TrainingMemoryScheduler). This prevents the allocator from + # reusing the space of a consumed input (e.g. fc1 weight) for a later + # output (e.g. fc2 updated weight), which would corrupt the weight buffer. + deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc + deployer.tiler.memoryAllocStrategy = args.memAllocStrategy + deployer.tiler.searchStrategy = args.searchStrategy + + # 8. Prepare deployer. + verbosityCfg = _NoVerbosity + if args.profileTiling: + verbosityCfg = CodeGenVerbosity(tilingProfiling = True) + _ = deployer.prepare(verbosityCfg) + + # 9. 
Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[TiledOptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[TiledOptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 10. Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok = True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Tiled optimizer network code generated in: {args.dumpdir}") + print(f"[TiledOptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Tiled Optimizer Network Code Generation.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + parser.add_argument( + "--lr", + type = float, + default = 0.001, + help = "Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. 
Default: L2.") + parser.add_argument("--memAllocStrategy", + type = str, + default = "MiniMalloc", + help = "Memory allocation strategy. Default: MiniMalloc.") + parser.add_argument("--searchStrategy", + type = str, + default = "random-max", + help = "CP solver search strategy. Default: random-max.") + parser.add_argument("--plotMemAlloc", + action = "store_true", + help = "Save memory allocation plots in the deeployStates folder.") + parser.add_argument("--profileTiling", + action = "store_true", + help = "Enable tiling profiling (inserts cycle counters around each tiled kernel).") + add_optimizer_training_dir_arg(parser) + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTiledOptimizerNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTiled optimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py new file mode 100644 index 0000000000..c0e4e7c2d8 --- /dev/null +++ b/DeeployTest/testMVPTraining.py @@ -0,0 +1,274 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import json +import os +import sys + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerateTraining import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler +from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ + _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args +from testUtils.typeMapping import inferTypeAndOffset + +from 
Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import CodeGenVerbosity, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def generateTiledTrainingNetwork(args) -> None: + log.debug("Arguments: %s", args) + + # 1. Load network.onnx (training graph with forward + backward ops). + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # 1a. Strip UNDEFINED-typed unused optional outputs (e.g. MaxPool mask indices). + _stripped = False + for node in graph.nodes: + filtered = [out for out in node.outputs if not (out.dtype == 0 and len(out.outputs) == 0)] + if len(filtered) < len(node.outputs): + node.outputs = filtered + _stripped = True + if _stripped: + graph.cleanup() + log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes") + + # 2. Load inputs.npz. + inputs_path = f'{args.dir}/inputs.npz' + inputs = np.load(inputs_path) + + # 3. Platform setup. + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 4. Identify grad acc buf positions in the ONNX graph. 
+ graph_input_names = [inp.name for inp in onnx_graph.graph.input] + grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n} + non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set] + + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + npz_base = [inputs[k] for k in base_keys] + + if len(npz_base) != len(non_grad_indices): + raise ValueError(f"inputs.npz has {len(npz_base)} base entries but network.onnx has " + f"{len(non_grad_indices)} non-grad-buf inputs. " + f"Re-generate inputs.npz with the updated exporter.") + + # 5. Build inputTypes / inputOffsets for ALL graph input positions. + inputTypes = {} + inputOffsets = {} + + npz_idx = 0 + for graph_idx in range(len(graph_input_names)): + if graph_idx in grad_acc_set: + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + arr = npz_base[npz_idx] + npz_idx += 1 + if arr.dtype == bool or arr.dtype == np.bool_: + inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif arr.dtype in (np.float32, np.float64): + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif np.prod(arr.shape) == 0: + # Zero-sized input (ONNX allows shape (0, ...) for optional + # placeholders). No data to infer from, but downstream still + # looks up input_{idx} by key, so populate with a trivial default. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + values = arr.reshape(-1).astype(np.float32) + _type, offset = inferTypeAndOffset(values, signProp = False) + inputTypes[f"input_{graph_idx}"] = _type + inputOffsets[f"input_{graph_idx}"] = offset + + # 6. Create deployer with _mockScheduler (required for TilerDeployerWrapper). 
+ _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name = "DeeployTrainingNetwork", + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets, + scheduler = _mockScheduler) + + # 7. Set up memory hierarchy. + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64_000_000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + defaultTargetMemLevel = L1 + defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + # 8. Wrap with memory-level annotation. + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel) + + deployer = MemoryDeployerWrapper(deployer, [ + AnnotateIOMemoryLevel(defaultIoMemLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy), + ]) + + # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass). + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + + deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc + deployer.tiler.memoryAllocStrategy = args.memAllocStrategy + deployer.tiler.searchStrategy = args.searchStrategy + + # 10. Prepare deployer. + verbosityCfg = _NoVerbosity + if args.profileTiling: + verbosityCfg = CodeGenVerbosity(tilingProfiling = True) + _ = deployer.prepare(verbosityCfg) + + # 11. Resolve num_data_inputs, n_steps, n_accum. 
+ num_data = args.num_data_inputs + if num_data is None: + num_data = _infer_num_data_inputs(inputs_path) + log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz") + + n_steps = args.n_steps + n_accum = args.n_accum + if n_steps is None or n_accum is None: + total_mb = _infer_total_mb(inputs_path) + log.info(f"Auto-detected total_mb={total_mb} from inputs.npz") + if n_steps is None and n_accum is None: + n_accum = _infer_n_accum(inputs_path) + n_steps = max(1, total_mb // n_accum) + elif n_steps is None: + n_steps = max(1, total_mb // n_accum) + else: + n_accum = max(1, total_mb // n_steps) + + log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}") + + # 12. Build unique_mb_data from npz. + total_mb = n_steps * n_accum + data_size = _infer_data_size(inputs_path) + log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}") + mb0_data = list(npz_base[:num_data]) + + unique_mb_data = [] + for mb in range(data_size): + if mb == 0: + unique_mb_data.append(mb0_data) + else: + mb_row = [] + for buf_idx in range(num_data): + key = f"mb{mb}_arr_{buf_idx:04d}" + mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx]) + unique_mb_data.append(mb_row) + + # Grad acc buf info for testinputs.h. + if grad_acc_set: + sorted_grad = sorted(grad_acc_set) + grad_buf_start_idx = sorted_grad[0] + else: + grad_buf_start_idx = -1 + num_grad_inputs = len(grad_acc_set) + + if grad_buf_start_idx > num_data: + init_weights = list(npz_base[num_data:grad_buf_start_idx]) + else: + init_weights = [] + + # 13. Load reference losses. + reference_losses = _load_reference_losses(args.dir) + + # 14. Generate output files. 
+ os.makedirs(args.dumpdir, exist_ok = True) + + generateTrainingTestNetwork(deployer, + unique_mb_data, + args.dumpdir, + verbosityCfg, + n_steps = n_steps, + n_accum = n_accum, + num_data_inputs = num_data, + grad_buf_start_idx = grad_buf_start_idx, + num_grad_inputs = num_grad_inputs, + learning_rate = args.learning_rate, + reference_losses = reference_losses, + init_weights = init_weights, + data_size = data_size, + tolerance_abs = args.tolerance_abs) + + # 15. Write resolved config for execution.py to pick up. + meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent = 2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description = "Deeploy Tiled Training Code Generation Utility.") + parser.add_argument("--cores", type = int, default = 1, help = "Number of cluster cores. Default: 1.") + add_training_inference_args(parser) + parser.add_argument("--l1", type = int, default = 64_000, help = "L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type = int, default = 1_024_000, help = "L2 size in bytes. Default: 1024000.") + parser.add_argument("--defaultMemLevel", + type = str, + default = "L2", + help = "Default memory level for IO buffers. Default: L2.") + parser.add_argument("--memAllocStrategy", + type = str, + default = "MiniMalloc", + help = "Memory allocation strategy. Default: MiniMalloc.") + parser.add_argument("--searchStrategy", + type = str, + default = "random-max", + help = "CP solver search strategy. 
Default: random-max.") + parser.add_argument("--plotMemAlloc", + action = "store_true", + help = "Save memory allocation plots in the deeployStates folder.") + parser.add_argument("--profileTiling", + action = "store_true", + help = "Enable tiling profiling (inserts cycle counters around each tiled kernel).") + parser.add_argument("--shouldFail", action = "store_true") + parser.set_defaults(shouldFail = False) + args = parser.parse_args() + + try: + generateTiledTrainingNetwork(args) + except Exception: + if args.shouldFail: + print("\033[92mTiled training network generation ended, failed as expected!\033[0m") + sys.exit(0) + raise + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testUtils/codeGenerateTraining.py b/DeeployTest/testUtils/codeGenerateTraining.py new file mode 100644 index 0000000000..4ef9a9fd8a --- /dev/null +++ b/DeeployTest/testUtils/codeGenerateTraining.py @@ -0,0 +1,892 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Code-generation helpers for the training / optimizer test harness. + +These functions emit the C source, header and data files for training tests +that drive both a TrainingNetwork (forward + backward + gradient accumulation) +and an OptimizerNetwork (SGD weight update) on the target platform. + +Kept as a separate module from testUtils.codeGenerate (which handles plain +inference codegen) so this PR's training-side additions touch the inference +helpers only through imports, not by interleaving with inference definitions. 
+""" + +import os +import re +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkDeployer +from Deeploy.Targets.MemPool.Platform import MemPoolPlatform +from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform + +from .codeGenerate import generateL3HexDump + + +def generateTrainingTestInputsHeader(deployer: NetworkDeployer, + all_mb_data: List[List[np.ndarray]], + n_steps: int, + n_accum: int, + grad_buf_start_idx: int = 0, + num_grad_inputs: int = 0, + learning_rate: float = 0.001, + init_weights: List[np.ndarray] = None, + data_size: int = None) -> str: + """Generate testinputs.h for training tests. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer (used to look up buffer types). + all_mb_data : list of list of np.ndarray + Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for + mini-batch *mb* and DATA buffer *buf*. All mini-batches must have the + same number of buffers. + n_steps : int + N_TRAIN_STEPS macro value. + n_accum : int + N_ACCUM_STEPS macro value. + grad_buf_start_idx : int + Index of the first grad accumulation buffer in DeeployNetwork_inputs[]. + Used to emit TRAINING_GRAD_BUF_START_IDX. Pass 0 (and num_grad_inputs=0) + to suppress the define (e.g. when no grad bufs exist). + num_grad_inputs : int + Number of grad accumulation buffers. Used to emit TRAINING_NUM_GRAD_INPUTS. + + Returns + ------- + str + C header string. + """ + total_mb = n_steps * n_accum + num_data = len(all_mb_data[0]) if all_mb_data else 0 + # data_size: number of unique samples stored in C arrays. + # C harness cycles: testDataVector[mb % TRAINING_DATA_SIZE]. + # Defaults to total_mb (no cycling) for backward compatibility. 
+ effective_data_size = data_size if (data_size is not None and data_size < total_mb) else total_mb + + retStr = "" + retStr += f"#define N_TRAIN_STEPS {n_steps}\n" + retStr += f"#define N_ACCUM_STEPS {n_accum}\n" + retStr += f"#define TRAINING_DATA_SIZE {effective_data_size}\n" + retStr += f"#define TRAINING_NUM_DATA_INPUTS {num_data}\n" + if num_grad_inputs > 0: + retStr += f"#define TRAINING_GRAD_BUF_START_IDX {grad_buf_start_idx}\n" + retStr += f"#define TRAINING_NUM_GRAD_INPUTS {num_grad_inputs}\n" + num_weight_inputs = grad_buf_start_idx - num_data + retStr += f"#define TRAINING_NUM_WEIGHT_INPUTS {num_weight_inputs}\n" + retStr += f"#define TRAINING_LEARNING_RATE {learning_rate:.10g}f\n" + retStr += "\n" + + # Emit per-mini-batch buffer arrays — only effective_data_size unique rows. + # all_mb_data must contain exactly effective_data_size rows. + for mb in range(effective_data_size): + mb_data = all_mb_data[mb] if mb < len(all_mb_data) else all_mb_data[-1] + row_entries = [] + for buf_idx, arr in enumerate(mb_data): + values = arr.reshape(-1) + + # Determine C type from deployer context (buffer "input_N"). 
+ input_key = f"input_{buf_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + # Fallback: infer from numpy dtype + if arr.dtype == np.float32 or arr.dtype == np.float64: + typeName = "float32_t" + typeWidth = 32 + elif arr.dtype == np.int64: + typeName = "int64_t" + typeWidth = 64 + elif arr.dtype == np.bool_ or arr.dtype == bool: + typeName = "uint8_t" + typeWidth = 8 + else: + typeName = "int32_t" + typeWidth = 32 + + buf_name = f"testData_mb{mb}_buf{buf_idx}" + row_entries.append(buf_name) + + # Format values + if typeName == 'float32_t': + list_str = ", ".join( + [f'{float(x)}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values.astype(np.float32)]) + else: + list_str = ", ".join([str(x) for x in values]) + + # 4-byte alignment padding + total_bytes = (values.size * typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth + list_str += ", " + ", ".join("0" for _ in range(paddingElements)) + + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + + # Emit the row pointer array for this mini-batch + row_name = f"testDataRow{mb}" + retStr += f"void* {row_name}[] = {{{', '.join(f'(void*){e}' for e in row_entries)}}};\n" + retStr += "\n" + + # Emit the top-level vector of row pointers (only unique samples; C harness cycles via modulo). + retStr += f"void** testDataVector[{effective_data_size}] = {{{', '.join(f'testDataRow{mb}' for mb in range(effective_data_size))}}};\n" + + # Emit initial weight arrays (one per weight input, indices num_data..grad_buf_start_idx-1). 
+ if init_weights: + retStr += "\n" + weight_entries = [] + num_data = len(all_mb_data[0]) if all_mb_data else 0 + for wi, arr in enumerate(init_weights): + buf_global_idx = num_data + wi + input_key = f"input_{buf_global_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + typeName = "float32_t" + typeWidth = 32 + values = arr.reshape(-1).astype(np.float32) + # Tile values to match Deeploy's internal (possibly sequence-length-tiled) shape. + if deployer.ctxt.is_buffer(input_key): + expected_nelems = int(np.prod(deployer.ctxt.lookup(input_key).shape)) + if expected_nelems > len(values) and expected_nelems % len(values) == 0: + values = np.tile(values, expected_nelems // len(values)) + list_str = ", ".join([f'{float(x)}f' for x in values]) + buf_name = f"testInitWeight_{wi}" + weight_entries.append(buf_name) + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + retStr += f"void* testInitWeights[{len(weight_entries)}] = {{{', '.join(f'(void*){e}' for e in weight_entries)}}};\n" + + return retStr + + +def generateTrainingTestOutputsHeader( + reference_losses: List = None, + tolerance_abs: float = 1e-3, +) -> str: + """Generate testoutputs.h for training tests — loss comparison only. + + Parameters + ---------- + reference_losses : list of float, optional + Reference loss value for each forward pass (one per mini-batch step). + If None, loss comparison is skipped. + tolerance_abs : float + Absolute comparison tolerance emitted as TRAINING_TOLERANCE_ABS. + + Returns + ------- + str + C header string. 
+ """ + has_loss = reference_losses is not None and len(reference_losses) > 0 + + retStr = "// testoutputs.h — Phase 2: loss verification\n" + retStr += f"#define TRAINING_TOLERANCE_ABS {tolerance_abs:.10g}f\n\n" + + if has_loss: + n = len(reference_losses) + retStr += "// Expected loss for each forward pass (one per mini-batch)\n" + retStr += f"#define N_LOSS_REFS {n}\n" + vals = ", ".join(f"{float(v):.10g}f" for v in reference_losses) + retStr += f"float32_t testLossRef[{n}] = {{{vals}}};\n\n" + else: + retStr += "// No loss reference available — loss comparison skipped.\n" + retStr += "#define N_LOSS_REFS 0\n\n" + + return retStr + + +def generateTrainingNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate TrainingNetwork.h — same as generateTestNetworkHeader but with + RunTrainingNetwork / InitTrainingNetwork function names and a distinct header guard. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer. + + Returns + ------- + str + C header string. + """ + retStr = "" + + retStr += """ +#ifndef __DEEPLOY_TRAINING_HEADER__ +#define __DEEPLOY_TRAINING_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunTrainingNetwork(); +void InitTrainingNetwork(); + +""" + else: + retStr += """ +void RunTrainingNetwork(uint32_t core_id, uint32_t numThreads); +void InitTrainingNetwork(uint32_t core_id, uint32_t numThread); + +""" + + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + + return retStr + + +def generateTrainingNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str: + """Generate TrainingNetwork.c — same as generateTestNetworkImplementation but with + RunTrainingNetwork / InitTrainingNetwork function names and including TrainingNetwork.h. 
+ + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + + Returns + ------- + str + C implementation string. + """ + retStr = "" + + retStr += """#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +""" + retStr += deployer.generateIncludeString() + retStr += """ + +#include "TrainingNetwork.h" + +""" + + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + if isinstance(deployer.Platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ +void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunTrainingNetwork(){ +""" + retStr += deployer.generateInferenceInitializationCode() + else: + retStr += """ +void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbosityCfg) + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +} + +void InitTrainingNetwork(){ +""" + else: + retStr += """ +} + +void InitTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ +} +""" + + return retStr + + +def generateTrainingTestNetwork(deployer: NetworkDeployer, + all_mb_data: List[List[np.ndarray]], + dumpdir: str, + verbosityCfg: CodeGenVerbosity, + n_steps: int = 1, + n_accum: int = 1, + num_data_inputs: int = 2, + grad_buf_start_idx: int = 0, + num_grad_inputs: int = 0, + learning_rate: float = 0.001, + reference_losses: List = None, + init_weights: 
List = None, + data_size: int = None, + tolerance_abs: float = 1e-3) -> None: + """Generate all training test files: testinputs.h, testoutputs.h, TrainingNetwork.h, TrainingNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer (ctxt.name must already be set to "DeeployTrainingNetwork"). + all_mb_data : list of list of np.ndarray + Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for + mini-batch *mb* and DATA buffer *buf*. + dumpdir : str + Output directory for generated files. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + n_steps : int + N_TRAIN_STEPS value. + n_accum : int + N_ACCUM_STEPS value. + num_data_inputs : int + Number of data inputs (TRAINING_NUM_DATA_INPUTS). + grad_buf_start_idx : int + Index of the first grad accumulation buffer in DeeployNetwork_inputs[]. + num_grad_inputs : int + Number of grad accumulation buffers (TRAINING_NUM_GRAD_INPUTS). + """ + assert deployer.prepared, "An unprepared deployer was given" + + os.makedirs(dumpdir, exist_ok = True) + + # testinputs.h + testInputStr = generateTrainingTestInputsHeader(deployer, + all_mb_data, + n_steps, + n_accum, + grad_buf_start_idx, + num_grad_inputs, + learning_rate, + init_weights = init_weights, + data_size = data_size) + with open(f'{dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # testoutputs.h + testOutputStr = generateTrainingTestOutputsHeader( + reference_losses = reference_losses, + tolerance_abs = tolerance_abs, + ) + with open(f'{dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # TrainingNetwork.h + headerStr = generateTrainingNetworkHeader(deployer) + with open(f'{dumpdir}/TrainingNetwork.h', 'w') as f: + f.write(headerStr) + + # TrainingNetwork.c + implStr = generateTrainingNetworkImplementation(deployer, verbosityCfg) + with open(f'{dumpdir}/TrainingNetwork.c', 'w') as f: + f.write(implStr) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + for fname in 
['TrainingNetwork.c', 'TrainingNetwork.h', 'testinputs.h', 'testoutputs.h']: + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}') + + # Build initial-value list for every input_N buffer so that L3 hex files + # can be written. The list must cover all N where "input_N" exists in the + # deployer context. Layout (must match DeeployNetwork_inputs[] order): + # [0 .. num_data_inputs-1] → first mini-batch data + # [num_data_inputs .. grad_start-1] → initial weights + # [grad_start .. grad_start+num_grad-1] → zeros (grad acc bufs) + # [last] → lazy_reset_grad = 1 (uint8) + l3_initial_inputs: List[np.ndarray] = [] + # Count how many input_N buffers exist in the deployer context + n_total_inputs = sum( + 1 for name in deployer.ctxt.globalObjects if name.startswith("input_") and name[len("input_"):].isdigit()) + for i in range(n_total_inputs): + if all_mb_data and i < len(all_mb_data[0]): + # Data / label input + l3_initial_inputs.append(all_mb_data[0][i]) + elif (init_weights is not None and grad_buf_start_idx > 0 and num_data_inputs <= i < grad_buf_start_idx): + # Weight input + wi = i - num_data_inputs + l3_initial_inputs.append(init_weights[wi] if wi < + len(init_weights) else np.array([0.0], dtype = np.float32)) + elif (grad_buf_start_idx > 0 and num_grad_inputs > 0 + and grad_buf_start_idx <= i < grad_buf_start_idx + num_grad_inputs): + # Gradient accumulation buffer — zero-initialised + buf = deployer.ctxt.globalObjects.get(f"input_{i}") + shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,) + l3_initial_inputs.append(np.zeros(shape, dtype = np.float32)) + else: + # lazy_reset_grad (last input) or any unknown slot — default 1 / uint8 + buf = deployer.ctxt.globalObjects.get(f"input_{i}") + shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,) + l3_initial_inputs.append(np.ones(shape, dtype = np.uint8)) + + generateL3HexDump(deployer, os.path.join(dumpdir, 'hex'), l3_initial_inputs, []) + + +# 
--------------------------------------------------------------------------- +# Optimizer network code-generation helpers +# --------------------------------------------------------------------------- + +_OPT_PREFIX = "DeeployOptNetwork_" +_TRAIN_PREFIX = "DeeployNetwork_" + + +def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict[int, int], Dict[int, int]]: + """Build optimizer→training index maps for tensors shared between the two graphs. + + The optimizer ONNX inputs are interleaved weight/grad pairs that have the + same tensor names as inputs in the training ONNX graph. We match by name + so that ``InitOptimizerNetwork`` can reference the already-allocated + ``DeeployNetwork_input_N`` pointers instead of allocating fresh buffers. + + Parameters + ---------- + train_onnx_path : str + Path to the training ``network.onnx``. + opt_onnx_model : + Already-loaded optimizer ONNX model (``onnx.ModelProto``). + + Returns + ------- + shared_input_map : Dict[int, int] + opt_input_idx → train_input_idx + shared_output_map : Dict[int, int] + opt_output_idx → train_input_idx (SGD outputs == updated weights, + same physical buffer as the weight input) + """ + import onnx as _onnx + train_model = _onnx.load_model(train_onnx_path) + train_names = [inp.name for inp in train_model.graph.input] + train_name_to_idx = {name: i for i, name in enumerate(train_names)} + + opt_input_names = [inp.name for inp in opt_onnx_model.graph.input] + opt_output_names = [out.name for out in opt_onnx_model.graph.output] + + shared_input_map: Dict[int, int] = {} + for opt_idx, name in enumerate(opt_input_names): + if name in train_name_to_idx: + shared_input_map[opt_idx] = train_name_to_idx[name] + + shared_output_map: Dict[int, int] = {} + for opt_idx, name in enumerate(opt_output_names): + # Try exact match first; then strip the '_updated' suffix that the SGD + # node appends to output tensor names (e.g. 'conv1_weight_updated' → 'conv1_weight'). 
+ lookup_name = name + if lookup_name not in train_name_to_idx and lookup_name.endswith('_updated'): + lookup_name = lookup_name[:-len('_updated')] + if lookup_name in train_name_to_idx: + shared_output_map[opt_idx] = train_name_to_idx[lookup_name] + + return shared_input_map, shared_output_map + + +def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str: + """Redirect optimizer I/O buffers to Training's already-allocated buffers. + + Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that + the generated symbols already carry the ``DeeployOptNetwork_`` prefix. + + Handles two allocation styles produced by Deeploy: + + *Non-tiled* (per-buffer malloc):: + + DeeployOptNetwork_input_N = (SomeType *)pi_l2_malloc(sizeof(...)); + + *Tiled* (single arena with offsets):: + + DeeployOptNetwork_input_N = (float32_t *)((char *)DeeployOptNetwork_MEMORYARENA_L2 + OFFSET); + + Both are replaced with direct pointers into the TrainingNetwork arenas:: + + DeeployOptNetwork_input_N = (float32_t *)DeeployNetwork_input_M; + + After all I/O pointers are redirected, if a ``MEMORYARENA_L2`` or + ``MEMORYARENA_L3`` allocation is no longer referenced anywhere in the Init + body (i.e., the shared buffers consumed the entire arena), the now-unused + malloc is also removed to reclaim the L2/L3 memory. + + Parameters + ---------- + retStr : str + The already-prefix-substituted C source string. + shared_input_map : Dict[int, int] + Optimizer input index → training input index. + shared_output_map : Dict[int, int] + Optimizer output index → training input index (in-place update). + + Returns + ------- + str + Patched C source string. 
+ """ + if not shared_input_map and not shared_output_map: + return retStr + + # ------------------------------------------------------------------ + # Pattern 1 (non-tiled): individual pi_*_malloc per buffer + # ------------------------------------------------------------------ + _malloc_pat = re.compile( + r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)\s*pi_\w+_malloc\([^;]+\);') + + # ------------------------------------------------------------------ + # Pattern 2 (tiled): arena-offset assignment + # DeeployOptNetwork_input_N = (Type *)((char *)DeeployOptNetwork_MEMORYARENA_Lx + OFFSET); + # ------------------------------------------------------------------ + _arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)' + r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;') + + def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]: + if kind == "input" and idx in shared_input_map: + train_idx = shared_input_map[idx] + return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */' + if kind == "output" and idx in shared_output_map: + train_idx = shared_output_map[idx] + return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */' + return None + + def _replace(m: re.Match) -> str: + replacement = _make_replacement(m.group(1), m.group(2), int(m.group(3))) + return replacement if replacement is not None else m.group(0) + + retStr = _malloc_pat.sub(_replace, retStr) + retStr = _arena_pat.sub(_replace, retStr) + + # ------------------------------------------------------------------ + # Arena elimination: if a MEMORYARENA_Lx is no longer used for any + # pointer arithmetic after the redirects, its malloc is dead and can + # be removed to reclaim L2/L3. The global declaration is left in + # place (harmless; the variable will be NULL at runtime). 
+ # ------------------------------------------------------------------ + for level in ('L2', 'L3'): + arena_sym = f'DeeployOptNetwork_MEMORYARENA_{level}' + # Pattern for the malloc assignment line itself + malloc_line_pat = re.compile(rf'[^\n]*{re.escape(arena_sym)}\s*=\s*\([^)]+\)\s*pi_\w+_malloc\([^;]+\);\s*\n') + # Pattern for any use of the arena in pointer arithmetic: + # (char *)ARENA + OFFSET or (void *)ARENA etc. + arena_use_pat = re.compile(rf'\(\s*(?:char|void|int8_t)\s*\*\s*\)\s*{re.escape(arena_sym)}') + if not arena_use_pat.search(retStr): + # No remaining pointer arithmetic — the malloc is dead + retStr = malloc_line_pat.sub('', retStr) + + # ------------------------------------------------------------------ + # Inject TrainingNetwork header so DeeployNetwork_input_N symbols resolve + # ------------------------------------------------------------------ + retStr = retStr.replace( + '#include "OptimizerNetwork.h"', + '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"', + ) + return retStr + + +def _patch_shared_arenas(retStr: str, train_c_source: str) -> str: + """Redirect optimizer L1/L2 arena allocations to reuse training network's arenas. + + TrainingNetwork and OptimizerNetwork run strictly sequentially: RunTrainingNetwork() + completes before RunOptimizerNetwork() starts. Their L1/L2 tile-working arenas + therefore never overlap in time and can share the same physical memory. + + Only the L1 arena is shared: it is pure tile-compute scratch whose content is + dead after each kernel returns. The L2 arena is NOT shared because it may hold + persistent tensor data (weights, activations) at fixed offsets in non-tiled mode; + sharing it would let the optimizer's L2 staging buffers overwrite that data. + + Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution. + + Parameters + ---------- + retStr : str + The already-prefix-substituted C source string for the optimizer. 
+ train_c_source : str + The full text of TrainingNetwork.c (used to confirm the arena symbols exist). + + Returns + ------- + str + Patched C source string. + """ + for level in ('L1',): + train_sym = f'DeeployNetwork_MEMORYARENA_{level}' + # Only alias if the training network actually has this arena + if train_sym not in train_c_source: + continue + + opt_sym = f'DeeployOptNetwork_MEMORYARENA_{level}' + opt_malloc_pat = re.compile(rf'({re.escape(opt_sym)})\s*=\s*\([^)]+\)\s*\w+\(sizeof\([^)]+\)\s*\*\s*\d+\)\s*;') + if not opt_malloc_pat.search(retStr): + continue + + replacement = f'{opt_sym} = (int8_t *){train_sym}; /* shared with TrainingNetwork */' + retStr = opt_malloc_pat.sub(replacement, retStr) + + # Inject TrainingNetwork header if not already present + # (_patch_shared_buffers may have already added it; guard against duplicates) + if '#include "TrainingNetwork.h"' not in retStr: + retStr = retStr.replace( + '#include "OptimizerNetwork.h"', + '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"', + ) + + return retStr + + +def _ensure_training_l1_capacity(dumpdir: str, train_c_source: str, opt_alloc_code: str) -> str: + """Enlarge TrainingNetwork's L1 arena to cover the optimizer's L1 needs. + + Since the two networks share the same L1 arena, TrainingNetwork must allocate + at least max(train_L1, opt_L1) bytes. When the optimizer needs more L1 than + training (rare but possible, e.g. autoencoder), this function patches + TrainingNetwork.c and TrainingNetwork.h in-place and returns the updated + TrainingNetwork.c source string. + + Parameters + ---------- + dumpdir : str + Directory containing TrainingNetwork.c and TrainingNetwork.h. + train_c_source : str + Current content of TrainingNetwork.c. + opt_alloc_code : str + Optimizer buffer-allocation code after _TRAIN_PREFIX → _OPT_PREFIX + substitution (used to extract the optimizer's L1 size). + + Returns + ------- + str + (Possibly updated) TrainingNetwork.c source string. 
+ """ + m_opt = re.search( + r'DeeployOptNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*(\d+)\)', + opt_alloc_code, + ) + if not m_opt: + return train_c_source + + opt_l1 = int(m_opt.group(1)) + + m_train = re.search( + r'(DeeployNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*)(\d+)(\))', + train_c_source, + ) + if not m_train: + return train_c_source + + train_l1 = int(m_train.group(2)) + if opt_l1 <= train_l1: + return train_c_source # Already large enough + + new_l1 = opt_l1 + + # Patch TrainingNetwork.c malloc size + train_c_new = train_c_source.replace( + m_train.group(0), + f'{m_train.group(1)}{new_l1}{m_train.group(3)}', + 1, + ) + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + with open(train_c_path, 'w') as f: + f.write(train_c_new) + + # Patch TrainingNetwork.h _len constant + train_h_path = os.path.join(dumpdir, 'TrainingNetwork.h') + if os.path.exists(train_h_path): + train_h = open(train_h_path).read() + train_h_new = re.sub( + r'(DeeployNetwork_MEMORYARENA_L1_len\s*=\s*)\d+', + rf'\g<1>{new_l1}', + train_h, + ) + with open(train_h_path, 'w') as f: + f.write(train_h_new) + + return train_c_new + + +def generateOptimizerNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate OptimizerNetwork.h. + + Reuses the Deeploy deployer's output and applies two transformations: + 1. Replace the buffer prefix ``DeeployNetwork_`` → ``DeeployOptNetwork_`` + 2. Inject ``RunOptimizerNetwork`` / ``InitOptimizerNetwork`` function declarations. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + + Returns + ------- + str + C header string. 
+ """ + retStr = "" + retStr += """ +#ifndef __DEEPLOY_OPTIMIZER_HEADER__ +#define __DEEPLOY_OPTIMIZER_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(); +void InitOptimizerNetwork(); + +""" + else: + retStr += """ +void RunOptimizerNetwork(uint32_t core_id, uint32_t numThreads); +void InitOptimizerNetwork(uint32_t core_id, uint32_t numThreads); + +""" + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + # Prefix substitution: all Deeploy-generated DeeployNetwork_ → DeeployOptNetwork_ + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + return retStr + + +def generateOptimizerNetworkImplementation(deployer: NetworkDeployer, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = None, + shared_output_map: Optional[Dict[int, int]] = None, + train_c_source: Optional[str] = None) -> str: + """Generate OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + When provided, those malloc calls are replaced with references to the + already-allocated TrainingNetwork buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + train_c_source : str, optional + Full text of TrainingNetwork.c. When provided, the optimizer's L1/L2 arena + malloc calls are replaced with direct pointers to the training arenas, + saving one L1 and one L2 allocation (safe because the two networks run + strictly sequentially). + + Returns + ------- + str + C implementation string. 
+ """ + retStr = "" + retStr += """#include +#include +#include +""" + retStr += deployer.generateIncludeString() + retStr += """ +#include "OptimizerNetwork.h" + +""" + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + if isinstance(deployer.Platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(){ +""" + retStr += deployer.generateInferenceInitializationCode() + else: + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbosityCfg) + + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +} + +void InitOptimizerNetwork(){ +""" + else: + retStr += """ +} + +void InitOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ +} +""" + # Prefix substitution + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + # Replace malloc calls for shared weight/grad buffers with Training pointers + retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {}) + # Redirect optimizer L1/L2 arena mallocs to reuse training arenas + if train_c_source: + retStr = _patch_shared_arenas(retStr, train_c_source) + return retStr + + +def generateOptimizerTestNetwork(deployer: NetworkDeployer, + dumpdir: str, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = 
None, + shared_output_map: Optional[Dict[int, int]] = None) -> None: + """Generate OptimizerNetwork.h and OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + dumpdir : str + Output directory for generated files. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + """ + assert deployer.prepared, "An unprepared deployer was given" + + os.makedirs(dumpdir, exist_ok = True) + + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + train_c_source: Optional[str] = None + if os.path.exists(train_c_path): + with open(train_c_path, 'r') as f: + train_c_source = f.read() + + # Enlarge training L1 arena if optimizer needs more (so unconditional L1 sharing is safe) + if train_c_source: + opt_alloc_preview = deployer.generateBufferAllocationCode().replace(_TRAIN_PREFIX, _OPT_PREFIX) + train_c_source = _ensure_training_l1_capacity(dumpdir, train_c_source, opt_alloc_preview) + + headerStr = generateOptimizerNetworkHeader(deployer) + with open(f'{dumpdir}/OptimizerNetwork.h', 'w') as f: + f.write(headerStr) + + implStr = generateOptimizerNetworkImplementation(deployer, verbosityCfg, shared_input_map, shared_output_map, + train_c_source) + with open(f'{dumpdir}/OptimizerNetwork.c', 'w') as f: + f.write(implStr) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + for fname in ['OptimizerNetwork.c', 'OptimizerNetwork.h']: + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}') diff --git a/DeeployTest/testUtils/core/config.py b/DeeployTest/testUtils/core/config.py index e932c23962..0ecf45d467 100644 --- a/DeeployTest/testUtils/core/config.py +++ b/DeeployTest/testUtils/core/config.py @@ -24,6 +24,14 @@ 
class DeeployTestConfig: gen_args: List[str] = None verbose: int = 0 debug: bool = False + training: bool = False + # None means "auto-detect from ONNX graph / inputs.npz during codegen" + n_train_steps: Optional[int] = None + n_accum_steps: Optional[int] = None + training_num_data_inputs: Optional[int] = None + # Directory containing the optimizer ONNX (network.onnx with SGD nodes). + # If None, auto-derived as <test_dir>/../<name>_optimizer (test dir base name with '_train' replaced by '_optimizer') when training=True. + optimizer_dir: Optional[str] = None def __post_init__(self): if self.cmake_args is None: diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..2fb1224c92 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -10,6 +10,7 @@ from Deeploy.Logging import DEFAULT_LOGGER as log +from ..trainingUtils import add_training_cmake_flags, run_training_codegen from .config import DeeployTestConfig from .output_parser import TestResult, parse_test_output @@ -27,6 +28,10 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent + if config.training: + run_training_codegen(config, script_dir) + return + if config.tiling: generation_script = script_dir / "testMVP.py" else: @@ -102,6 +107,9 @@ def configure_cmake(config: DeeployTestConfig) -> None: else: cmd.append("-Dgvsoc_simulation=OFF") + add_training_cmake_flags(cmd, config.training, config.n_train_steps, config.n_accum_steps, + config.training_num_data_inputs) + # Last argument is the source directory script_dir = Path(__file__).parent.parent.parent cmd.append(str(script_dir.parent)) diff --git a/DeeployTest/testUtils/deeployTrainingRunner.py b/DeeployTest/testUtils/deeployTrainingRunner.py new file mode 100644 index 0000000000..8f523bf264 --- /dev/null +++ b/DeeployTest/testUtils/deeployTrainingRunner.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# 
SPDX-License-Identifier: Apache-2.0 +""" +Common entry point for Siracusa training test runners (non-tiled and tiled). + +Usage: + from testUtils.deeployTrainingRunner import main + sys.exit(main(tiling_enabled=False)) # non-tiled + sys.exit(main(tiling_enabled=True)) # tiled (SBTiler) +""" + +import os +from pathlib import Path + +# gapy (gvsoc launcher) uses `#!/usr/bin/env python3`. Put /usr/bin first so +# it resolves to /usr/bin/python3 which has all required packages (gapylib, +# prettytable, …) rather than the minimal venv python. +os.environ['PATH'] = '/usr/bin:' + os.environ.get('PATH', '') + +from .core import DeeployTestConfig, run_complete_test +from .core.paths import get_test_paths +from .deeployRunner import DeeployRunnerArgumentParser, print_colored_result, print_configuration + + +def main(tiling_enabled: bool = False, default_platform: str = 'Siracusa', default_simulator: str = 'gvsoc'): + """ + Build parser, parse args, create DeeployTestConfig, and run the training test. + + Parameters + ---------- + tiling_enabled: + True → passes tiling args (--l1, --l2, …) and sets tiling=True in config. + default_platform: + Platform used when -p is not given on the command line. + default_simulator: + Simulator used when -s is not given on the command line. 
+ """ + + parser = DeeployRunnerArgumentParser(tiling_arguments = tiling_enabled, platform_required = False) + + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cluster cores (default: 8)\n') + parser.add_argument('--n-steps', + metavar = '', + dest = 'n_steps', + type = int, + default = None, + help = 'N_TRAIN_STEPS: optimizer steps (auto-detected if not given)\n') + parser.add_argument('--n-accum', + metavar = '', + dest = 'n_accum', + type = int, + default = None, + help = 'N_ACCUM_STEPS: mini-batches per update step (auto-detected if not given)\n') + parser.add_argument('--num-data-inputs', + metavar = '', + dest = 'num_data_inputs', + type = int, + default = None, + help = 'Inputs that change each mini-batch (auto-detected if not given)\n') + parser.add_argument('--optimizer-dir', + metavar = '', + dest = 'optimizer_dir', + type = str, + default = None, + help = 'Directory containing the optimizer network.onnx ' + "(default: auto-derived by replacing '_train' with '_optimizer')\n") + parser.add_argument( + '--tolerance', + metavar = '', + dest = 'tolerance', + type = float, + default = None, + help = 'Absolute loss tolerance for pass/fail comparison (default: auto from generateTrainingNetwork.py)\n') + + args = parser.parse_args() + + platform = default_platform + simulator = args.simulator if args.simulator else default_simulator + + script_path = Path(__file__).resolve() + base_dir = script_path.parent.parent + + gen_dir, test_dir_abs, test_name = get_test_paths(args.dir, platform, base_dir = str(base_dir)) + + worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'master') + build_dir = str(base_dir / f'TEST_{platform.upper()}' / f'build_{worker_id}') + + cmake_args = [f'-DNUM_CORES={args.cores}'] + if args.cmake: + cmake_args.extend(args.cmake) + + gen_args = [f'--cores={args.cores}'] + if args.tolerance is not None: + gen_args.append(f'--tolerance={args.tolerance}') + if args.input_type_map: + gen_args.extend(['--input-type-map'] + 
list(args.input_type_map)) + if args.input_offset_map: + gen_args.extend(['--input-offset-map'] + list(args.input_offset_map)) + + if tiling_enabled: + if getattr(args, 'defaultMemLevel', None): + gen_args.append(f'--defaultMemLevel={args.defaultMemLevel}') + if getattr(args, 'l1', None): + gen_args.append(f'--l1={args.l1}') + if getattr(args, 'l2', None) and args.l2 != 1024000: + gen_args.append(f'--l2={args.l2}') + if getattr(args, 'memAllocStrategy', None): + gen_args.append(f'--memAllocStrategy={args.memAllocStrategy}') + if getattr(args, 'searchStrategy', None): + gen_args.append(f'--searchStrategy={args.searchStrategy}') + if getattr(args, 'profileTiling', False): + gen_args.append('--profileTiling') + if getattr(args, 'plotMemAlloc', False): + gen_args.append('--plotMemAlloc') + + config = DeeployTestConfig( + test_name = test_name, + test_dir = test_dir_abs, + platform = platform, + simulator = simulator, + tiling = tiling_enabled, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = args.toolchain, + toolchain_install_dir = args.toolchain_install_dir, + cmake_args = cmake_args, + gen_args = gen_args, + verbose = args.verbose, + debug = args.debug, + training = True, + n_train_steps = args.n_steps, + n_accum_steps = args.n_accum, + training_num_data_inputs = args.num_data_inputs, + optimizer_dir = args.optimizer_dir, + ) + + print_configuration(config) + + try: + result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + print_colored_result(result, config.test_name) + return 0 if result.success else 1 + except Exception as e: + RED = '\033[91m' + RESET = '\033[0m' + print(f'\n{RED}✗ Test {config.test_name} FAILED with exception: {e}{RESET}') + return 1 diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 0c3986cd6e..1dfb43bea4 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -2,11 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from 
typing import List, Union +from typing import Dict, List, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import PatternMemoryConstraints +from Deeploy.TilingExtension.MemoryScheduler import MemoryScheduler from Deeploy.TilingExtension.TilerExtension import Tiler from Deeploy.TilingExtension.TilerModel import TilerModel @@ -43,3 +45,27 @@ class SBTiler(Tiler): def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], hop: str, tensorName: str) -> Union[int, IntVar]: return 1 + + +class TrainingMemoryScheduler(MemoryScheduler): + """MemoryScheduler variant for training networks. + + Extends input tensor lifetimes to the end of the full tiling schedule so + that forward-pass inputs remain live during the backward pass. + """ + + def _calculateLifetimes(self, ctxt: NetworkContext, patternMemoryConstraint: PatternMemoryConstraints, + memoryLevel: str) -> Tuple[Dict[str, Tuple[int, int]], Dict]: + tensorLifetimeMap, tensorMap = super()._calculateLifetimes(ctxt, patternMemoryConstraint, memoryLevel) + + maxStepIdx = len(patternMemoryConstraint.nodeConstraints) + for tensorName, lifetime in tensorLifetimeMap.items(): + buffer = ctxt.lookup(tensorName) + if buffer.is_input: + tensorLifetimeMap[tensorName] = (0, maxStepIdx) + + return tensorLifetimeMap, tensorMap + + +class TrainingSBTiler(SBTiler): + memorySchedulerClass = TrainingMemoryScheduler diff --git a/DeeployTest/testUtils/trainingUtils.py b/DeeployTest/testUtils/trainingUtils.py new file mode 100644 index 0000000000..a3386cd7ca --- /dev/null +++ b/DeeployTest/testUtils/trainingUtils.py @@ -0,0 +1,334 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Shared helpers used by the training / optimizer code-generation entry points 
+(generateTrainingNetwork.py, testMVPTraining.py, generateOptimizerNetwork.py, +testMVPOptimizer.py). + +Four kinds of helpers live here, all strictly training-specific: + +1. inputs.npz / outputs.npz readers (``_load_reference_losses``, ``_infer_*``). +2. The singleton ``_mockScheduler`` the Tiler expects for per-node tiling. +3. Training-only argparse builders (``add_training_inference_args``, + ``add_optimizer_training_dir_arg``). +4. The core hooks invoked by ``testUtils.core.execution`` + (``resolve_optimizer_dir``, ``run_training_codegen``, + ``add_training_cmake_flags``). + +Generic helpers (``--cores`` / ``--l1`` / ``--l2`` / ``--defaultMemLevel`` / +``--memAllocStrategy`` / ``--searchStrategy`` / ``--plotMemAlloc`` / +``--profileTiling`` / ``--shouldFail`` arg definitions and the ``shouldFail`` +try/except handshake) are deliberately *not* wrapped into functions here: +they are not training-specific and belong inline in whichever entry point +needs them, consistent with the upstream inference codegen scripts. +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path +from typing import List, Optional + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.Logging import DEFAULT_LOGGER as log + +# Graph input name marker identifying gradient accumulation buffers. +_GRAD_ACC = "_grad.accumulation.buffer" + + +def _load_reference_losses(train_dir: str) -> Optional[list]: + """Load reference loss values from outputs.npz. + + Returns the list of per-mini-batch loss values if any key in + outputs.npz contains 'loss', otherwise None (with a warning). 
+ """ + outputs_path = os.path.join(train_dir, "outputs.npz") + if not os.path.exists(outputs_path): + log.warning(f"outputs.npz not found at {outputs_path} — loss comparison skipped") + return None + + try: + outputs = np.load(outputs_path) + except Exception as e: + log.warning(f"Failed to load outputs.npz: {e} — loss comparison skipped") + return None + + for key in outputs.files: + if 'loss' in key.lower(): + vals = [float(v) for v in np.array(outputs[key]).flatten().tolist()] + log.info(f"Reference losses loaded from outputs.npz['{key}']: {vals}") + return vals + + log.warning("No 'loss' key found in outputs.npz — loss comparison skipped") + return None + + +def _infer_num_data_inputs(inputs_path: str) -> int: + """Auto-detect number of data inputs from inputs.npz. + + Data inputs are the base arr_* entries that have per-mini-batch + variants (mb1_arr_*) in the npz — i.e. entries that actually change + across mini-batches. + + Raises ValueError if no mb1 entries are found (single-mini-batch case) + where the data/weight boundary cannot be determined automatically. + """ + inputs = np.load(inputs_path) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + count = sum(1 for k in base_keys if f'mb1_{k}' in inputs.files) + if count == 0: + raise ValueError("Cannot auto-detect num_data_inputs: inputs.npz has only one mini-batch " + "(no mb1_arr_* entries found). Please pass --num-data-inputs explicitly.") + return count + + +def _infer_total_mb(inputs_path: str) -> int: + """Count total mini-batches from inputs.npz. + + New format: inputs.npz contains meta_n_batches (total training mini-batches) + and meta_data_size (number of unique samples stored; C harness cycles via modulo). + + Legacy format: count 1 + number of unique mb* indices. 
+ """ + inputs = np.load(inputs_path) + if "meta_n_batches" in inputs.files: + return int(inputs["meta_n_batches"].flat[0]) + mb_indices = set() + for key in inputs.files: + if key.startswith('mb'): + try: + idx = int(key.split('_')[0][2:]) + mb_indices.add(idx) + except ValueError: + pass + return 1 + len(mb_indices) + + +def _infer_data_size(inputs_path: str) -> int: + """Return the number of unique input samples stored in inputs.npz. + + New format: reads meta_data_size. + Legacy format: same as _infer_total_mb (all batches were unique). + """ + inputs = np.load(inputs_path) + if "meta_data_size" in inputs.files: + return int(inputs["meta_data_size"].flat[0]) + return _infer_total_mb(inputs_path) + + +def _infer_n_accum(inputs_path: str) -> int: + """Return the gradient accumulation step count stored in inputs.npz. + + New format: reads meta_n_accum written by the exporter. + Legacy format: defaults to 1 (no gradient accumulation). + """ + inputs = np.load(inputs_path) + if "meta_n_accum" in inputs.files: + return int(inputs["meta_n_accum"].flat[0]) + return 1 + + +def _mockScheduler(graph: gs.Graph) -> List[List[gs.Node]]: + """Wrap every node in a singleton list for the Tiler pattern interface.""" + return [[node] for node in graph.nodes] + + +# --------------------------------------------------------------------------- +# argparse builders +# +# The four training / optimizer codegen entry points all define the same +# arguments in their __main__ blocks. These helpers add the shared groups +# to an existing parser so each entry point only has to compose the groups +# it actually needs. 
+# --------------------------------------------------------------------------- + + +def add_training_inference_args(parser: argparse.ArgumentParser) -> None: + """Arguments consumed by both training codegen entry points.""" + parser.add_argument( + "--num-data-inputs", + type = int, + dest = "num_data_inputs", + default = None, + help = "Number of DATA inputs that change per mini-batch. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--n-steps", + type = int, + dest = "n_steps", + default = None, + help = "N_TRAIN_STEPS: number of gradient-accumulation update steps. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--n-accum", + type = int, + dest = "n_accum", + default = None, + help = "N_ACCUM_STEPS: number of mini-batches per update step. " + "Auto-detected if not specified.", + ) + parser.add_argument( + "--learning-rate", + type = float, + dest = "learning_rate", + default = 0.001, + help = "SGD learning rate emitted as TRAINING_LEARNING_RATE in testinputs.h. Default: 0.001.", + ) + parser.add_argument( + "--tolerance", + type = float, + dest = "tolerance_abs", + default = 1e-3, + help = "Absolute loss tolerance emitted as TRAINING_TOLERANCE_ABS in testoutputs.h. Default: 1e-3.", + ) + + +def add_optimizer_training_dir_arg(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--training-dir", + type = str, + default = None, + help = "Directory containing the training network.onnx. When provided, " + "weight and grad-acc buffers are shared with TrainingNetwork instead " + "of being allocated independently.", + ) + + +def resolve_optimizer_dir(test_dir: str, optimizer_dir: Optional[str]) -> str: + """Return the optimizer ONNX directory for a training test. + + If ``optimizer_dir`` is explicitly set, it is returned as-is. Otherwise + fall back to ``<test_dir>/../<optimizer_name>``, where ``<optimizer_name>`` is + derived by replacing the ``_train`` suffix of the test directory's base + name with ``_optimizer`` (e.g. 
``simplemlp_train`` → ``simplemlp_optimizer``, + ``sleepconvit_train`` → ``sleepconvit_optimizer``). + """ + if optimizer_dir: + return optimizer_dir + test_path = Path(test_dir) + optimizer_name = test_path.name.replace("_train", "_optimizer") + return str(test_path.parent / optimizer_name) + + +def add_training_cmake_flags(cmd: List[str], training: bool, n_train_steps: Optional[int], n_accum_steps: Optional[int], + training_num_data_inputs: Optional[int]) -> None: + """Append -DTRAINING=ON/OFF plus any known -DN_TRAIN_STEPS / -DN_ACCUM_STEPS / + -DTRAINING_NUM_DATA_INPUTS defines to ``cmd``. In-place.""" + cmd.append(f"-DTRAINING={'ON' if training else 'OFF'}") + if not training: + return + if n_train_steps is not None: + cmd.append(f"-DN_TRAIN_STEPS={n_train_steps}") + if n_accum_steps is not None: + cmd.append(f"-DN_ACCUM_STEPS={n_accum_steps}") + if training_num_data_inputs is not None: + cmd.append(f"-DTRAINING_NUM_DATA_INPUTS={training_num_data_inputs}") + + +def run_training_codegen(config, script_dir: Path) -> None: + """Drive the two-stage training codegen pipeline for one test. + + Runs the training network codegen script (generateTrainingNetwork.py or + testMVPTraining.py) followed by the matching optimizer codegen script + (generateOptimizerNetwork.py or testMVPOptimizer.py), and writes back + any auto-detected training parameters from ``training_meta.json`` into + ``config``. + + The single entry point keeps ``testUtils.core.execution.generate_network`` + oblivious to training internals — it only has to call this and return. + + Parameters + ---------- + config : DeeployTestConfig + The test configuration (must have ``training=True``). Training + fields (``n_train_steps``, ``n_accum_steps``, + ``training_num_data_inputs``) may be updated in-place from the + training_meta.json written by the codegen script. + script_dir : Path + ``DeeployTest/`` — the directory that hosts the four codegen scripts. 
+ """ + if config.tiling: + training_script = script_dir / "testMVPTraining.py" + optimizer_script = script_dir / "testMVPOptimizer.py" + opt_passthrough = ("--cores", "--l1", "--l2", "--defaultMemLevel", "--memAllocStrategy", "--searchStrategy", + "--plotMemAlloc", "--profileTiling") + stage = "Tiled training" + else: + training_script = script_dir / "generateTrainingNetwork.py" + optimizer_script = script_dir / "generateOptimizerNetwork.py" + opt_passthrough = ("--cores", "--l1", "--l2", "--defaultMemLevel") + stage = "Training" + + # --- Step 1: Training network (forward + backward + accumulation) --- + cmd = [ + sys.executable, + str(training_script), + "-d", + config.gen_dir, + "-t", + config.test_dir, + "-p", + config.platform, + ] + if config.n_train_steps is not None: + cmd.append(f"--n-steps={config.n_train_steps}") + if config.n_accum_steps is not None: + cmd.append(f"--n-accum={config.n_accum_steps}") + if config.training_num_data_inputs is not None: + cmd.append(f"--num-data-inputs={config.training_num_data_inputs}") + if config.verbose > 0: + cmd.append("-" + "v" * config.verbose) + if config.debug: + cmd.append("--debug") + cmd.extend(config.gen_args) + + log.debug(f"[Execution] {stage} network generation command: {' '.join(cmd)}") + if subprocess.run(cmd, check = False).returncode != 0: + raise RuntimeError(f"{stage} network generation failed for {config.test_name}") + + # Read back auto-detected values written by the training generation script. 
+ meta_path = Path(config.gen_dir) / "training_meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + config.n_train_steps = meta["n_train_steps"] + config.n_accum_steps = meta["n_accum_steps"] + config.training_num_data_inputs = meta["training_num_data_inputs"] + log.info(f"[Execution] Training meta: {meta}") + + # --- Step 2: Optimizer network (SGD) --- + opt_dir = resolve_optimizer_dir(config.test_dir, config.optimizer_dir) + if not Path(opt_dir).exists(): + log.warning(f"Optimizer directory not found: {opt_dir} — skipping optimizer codegen") + return + if not optimizer_script.exists(): + log.warning(f"{optimizer_script.name} not found — skipping optimizer codegen") + return + + opt_cmd = [ + sys.executable, + str(optimizer_script), + "-d", + config.gen_dir, + "-t", + opt_dir, + "-p", + config.platform, + f"--training-dir={config.test_dir}", + ] + opt_cmd.extend(arg for arg in config.gen_args if any(arg.startswith(p) for p in opt_passthrough)) + if not any(arg.startswith("--defaultMemLevel") for arg in opt_cmd): + opt_cmd.append("--defaultMemLevel=L2") + if config.verbose > 0: + opt_cmd.append("-" + "v" * config.verbose) + + log.debug(f"[Execution] {stage} optimizer network generation command: {' '.join(opt_cmd)}") + if subprocess.run(opt_cmd, check = False).returncode != 0: + raise RuntimeError(f"{stage} optimizer network generation failed for {config.test_name}") diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py index 8fa105d9f4..7e7893b5f5 100644 --- a/DeeployTest/test_siracusa_config.py +++ b/DeeployTest/test_siracusa_config.py @@ -8,7 +8,6 @@ KERNEL_TESTS = [ "Kernels/FP32/ReLU", - "Kernels/FP32/Softmax/CrossEntropy", "Kernels/FP32/Softmax/CrossEntropyGrad", "Kernels/FP32/Softmax/Grad", "Kernels/FP32/Softmax/Regular", diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..a9eefb6d3e 100644 --- 
a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -139,7 +139,6 @@ "Models/Transformer": [60000, 30000, 15000], "Models/microLlama/microLlama1": [60000, 10000, 5000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], - "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], } @@ -153,6 +152,5 @@ "Models/microLlama/microLlama8": [60000, 20000, 10000], "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], - "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], }