From 05b9404760fe8e6584fcd395d10ad635b9a82efd Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 12 Mar 2026 15:51:43 +0100 Subject: [PATCH 01/16] Update gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d9e4faace3..7ffc9ca243 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,6 @@ CHANGELOG_GEN.md # Container Artifacts .pyusbip/ .cache/ + +# Claude context file +CLAUDE.md \ No newline at end of file From 5615ed49f3c17ddcd3c7b568d561c7f880179290 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 12 Mar 2026 18:02:54 +0100 Subject: [PATCH 02/16] XDNA2 Platform Beta Support --- CMakeLists.txt | 20 ++ Deeploy/Targets/XDNA2/Bindings.py | 22 ++ Deeploy/Targets/XDNA2/Deployer.py | 204 ++++++++++++++++++ Deeploy/Targets/XDNA2/Parsers.py | 6 + Deeploy/Targets/XDNA2/Platform.py | 69 ++++++ .../Targets/XDNA2/Templates/AddTemplate.py | 81 +++++++ Deeploy/Targets/XDNA2/Templates/__init__.py | 3 + Deeploy/Targets/XDNA2/TypeCheckers.py | 29 +++ DeeployTest/Platforms/XDNA2/CMakeLists.txt | 149 +++++++++++++ DeeployTest/Platforms/XDNA2/main.cpp | 194 +++++++++++++++++ .../Tests/Kernels/BF16/Add/Regular/inputs.npz | Bin 0 -> 8706 bytes .../Kernels/BF16/Add/Regular/network.onnx | Bin 0 -> 128 bytes .../Kernels/BF16/Add/Regular/outputs.npz | Bin 0 -> 4366 bytes DeeployTest/conftest.py | 1 + DeeployTest/deeployRunner_xdna2.py | 17 ++ DeeployTest/generateNetwork_xdna2.py | 189 ++++++++++++++++ DeeployTest/testUtils/core/execution.py | 5 + DeeployTest/testUtils/deeployRunner.py | 1 + DeeployTest/testUtils/platformMapping.py | 23 +- DeeployTest/test_platforms.py | 24 +++ DeeployTest/test_xdna2_config.py | 10 + TargetLibraries/XDNA2/CMakeLists.txt | 90 ++++++++ TargetLibraries/XDNA2/kernels/add.cc | 54 +++++ requirements-dev.txt | 7 + 24 files changed, 1197 insertions(+), 1 deletion(-) create mode 100644 Deeploy/Targets/XDNA2/Bindings.py create mode 100644 Deeploy/Targets/XDNA2/Deployer.py create mode 100644 
Deeploy/Targets/XDNA2/Parsers.py create mode 100644 Deeploy/Targets/XDNA2/Platform.py create mode 100644 Deeploy/Targets/XDNA2/Templates/AddTemplate.py create mode 100644 Deeploy/Targets/XDNA2/Templates/__init__.py create mode 100644 Deeploy/Targets/XDNA2/TypeCheckers.py create mode 100644 DeeployTest/Platforms/XDNA2/CMakeLists.txt create mode 100644 DeeployTest/Platforms/XDNA2/main.cpp create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz create mode 100644 DeeployTest/deeployRunner_xdna2.py create mode 100644 DeeployTest/generateNetwork_xdna2.py create mode 100644 DeeployTest/test_xdna2_config.py create mode 100644 TargetLibraries/XDNA2/CMakeLists.txt create mode 100644 TargetLibraries/XDNA2/kernels/add.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..8c23ccca7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL XDNA2) + message(STATUS "Building for platform 'XDNA2'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,23 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL XDNA2) + + project(${TESTNAME} LANGUAGES CXX) + + message(STATUS "============================= XDNA2 Configuration ============================") + message(STATUS "[cMake ] GENERATED_SOURCE = " ${GENERATED_SOURCE}) + message(STATUS "[cMake ] TESTNAME = " ${TESTNAME}) + message(STATUS "==============================================================================") + message(STATUS "") + + # XDNA2 uses its own CMakeLists.txt in DeeployTest/Platforms/XDNA2/ + # which handles the two-step build: xclbin -> host binary. 
+ # AIE kernel compilation is in TargetLibraries/XDNA2/. + add_subdirectory(TargetLibraries/XDNA2) + add_subdirectory(DeeployTest/Platforms/XDNA2) + +endif() + print_simulation_config() diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py new file mode 100644 index 0000000000..68d7672787 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.Targets.XDNA2.Templates import AddTemplate +from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker + +# XDNA2 does not use the standard C code transformation pipeline. +# The deployer generates a holistic MLIR module, not per-node C snippets. +# An empty CodeTransformation is used as a placeholder. 
+XDNA2Transformer = CodeTransformation([]) + +XDNA2AddBindings = [ + NodeBinding( + XDNA2AddChecker([PointerClass(bfloat16_t), PointerClass(bfloat16_t)], [PointerClass(bfloat16_t)]), + AddTemplate.referenceTemplate, + XDNA2Transformer, + ) +] diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py new file mode 100644 index 0000000000..7aa77668eb --- /dev/null +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess +import tempfile +from typing import Callable, Dict, Optional, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Targets.XDNA2.Templates.AddTemplate import XDNA2NodeTemplate + +# JUNGVI: Will be removed once Deeploy generates it's own MLIR + +# Default path to the mlir-aie Python environment. +# Can be overridden via the MLIR_AIE_PYTHON env variable. +_DEFAULT_IRON_PYTHON = os.environ.get( + "MLIR_AIE_PYTHON", + "/scratch/jungvi/micromamba/envs/iron/bin/python", +) + +# Path to the IRON design scripts shipped with mlir-aie examples. +# Can be overridden via the IRON_OPERATORS_DIR env variable. +_DEFAULT_IRON_OPERATORS_DIR = os.environ.get( + "IRON_OPERATORS_DIR", + "/scratch/jungvi/IRON/iron/operators", +) + + +class XDNA2Deployer(SignPropDeployer): + """Deployer for the XDNA2 (AIE2p) platform. + + Unlike other Deeploy deployers that generate C code, this deployer + generates an mlir-aie MLIR module. The MLIR is produced by invoking the + IRON operator ``design.py`` scripts as subprocesses (using the mlir-aie + Python environment) so that the main Deeploy environment does not need to + have ``aie.iron`` installed. 
+ + It also writes ``testinputs.h`` and ``testoutputs.h`` via the XDNA2 + generation script so the XRT C++ testbench can be compiled against + known-good golden values. + """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first: bool = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Optional[Dict[str, int]] = None, + iron_python: Optional[str] = None, + iron_operators_dir: Optional[str] = None): + """ + Parameters + ---------- + iron_python : str, optional + Path to the Python interpreter in the mlir-aie (IRON) environment. + Defaults to ``MLIR_AIE_PYTHON`` env variable or + ``/scratch/jungvi/micromamba/envs/iron/bin/python``. + iron_operators_dir : str, optional + Path to the IRON operators directory containing per-operator + ``design.py`` scripts. + Defaults to ``IRON_OPERATORS_DIR`` env variable or + ``/scratch/jungvi/IRON/iron/operators``. + """ + super().__init__( + graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets if inputOffsets is not None else {}, + ) + self._iron_python = iron_python or _DEFAULT_IRON_PYTHON + self._iron_operators_dir = iron_operators_dir or _DEFAULT_IRON_OPERATORS_DIR + + # ------------------------------------------------------------------ + # MLIR generation + # ------------------------------------------------------------------ + + def generateMLIR(self) -> str: + """Generate an mlir-aie MLIR module for the prepared graph. + + Iterates over ``self.layerBinding``, extracts AIE parameters from each + bound template, and calls the corresponding IRON ``design.py`` script + as a subprocess. Currently only a single BF16 Add node is supported. 
+ + Returns + ------- + str + MLIR module string (ready to be written to ``network.mlir``). + + Raises + ------ + RuntimeError + If the graph contains unsupported operators or if the IRON + subprocess fails. + """ + assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" + + mlir_parts = [] + + for node_name, layer in self.layerBinding.items(): + mapper = layer.mapper + template = mapper.binder.template + op_repr = mapper.parser.operatorRepresentation + + if not isinstance(template, XDNA2NodeTemplate): + raise RuntimeError( + f"Node '{node_name}' has no XDNA2NodeTemplate — " + f"only BF16 Add is supported in this release.") + + aie_params = template.getAIEParams(op_repr) + log.info(f"[XDNA2] Generating MLIR for node '{node_name}' " + f"with params: {aie_params}") + + mlir_str = self._generate_add_mlir(aie_params) + mlir_parts.append(mlir_str) + + if not mlir_parts: + raise RuntimeError("No bound layers found in graph — cannot generate MLIR.") + + # For a single-node graph the MLIR is just the one module. + # Multi-node support would require merging modules. + return mlir_parts[0] + + def _generate_add_mlir(self, aie_params: dict) -> str: + """Call the IRON elementwise_add design.py to produce MLIR. + + Parameters + ---------- + aie_params : dict + Dict with keys: num_elements, n_cols, n_channels, tile_size, trace_size. + + Returns + ------- + str + MLIR module string. 
+ """ + design_script = os.path.join( + self._iron_operators_dir, "elementwise_add", "design.py" + ) + + if not os.path.isfile(design_script): + raise RuntimeError( + f"IRON design script not found: {design_script}\n" + f"Set IRON_OPERATORS_DIR to point to the IRON operators directory.") + + if not os.path.isfile(self._iron_python): + raise RuntimeError( + f"IRON Python interpreter not found: {self._iron_python}\n" + f"Set MLIR_AIE_PYTHON to the mlir-aie Python interpreter.") + + with tempfile.NamedTemporaryFile(suffix=".mlir", delete=False) as tmp: + output_path = tmp.name + + try: + cmd = [ + self._iron_python, + design_script, + "--dev", "npu2", + "--length", str(aie_params['num_elements']), + "--columns", str(aie_params['n_cols']), + "--channels", str(aie_params['n_channels']), + "--tile-size", str(aie_params['tile_size']), + "--trace-size", str(aie_params['trace_size']), + "--output-file-path", output_path, + ] + + log.debug(f"[XDNA2] Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + check=False, + capture_output=True, + text=True, + ) + + if result.returncode != 0: + raise RuntimeError( + f"IRON design.py failed (exit {result.returncode}):\n" + f" cmd: {' '.join(cmd)}\n" + f" stdout: {result.stdout}\n" + f" stderr: {result.stderr}") + + with open(output_path, 'r') as f: + mlir_str = f.read() + + finally: + if os.path.exists(output_path): + os.unlink(output_path) + + return mlir_str diff --git a/Deeploy/Targets/XDNA2/Parsers.py b/Deeploy/Targets/XDNA2/Parsers.py new file mode 100644 index 0000000000..c665312dbd --- /dev/null +++ b/Deeploy/Targets/XDNA2/Parsers.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# XDNA2 reuses the Generic AddParser (see Platform.py). +# Add any XDNA2-specific parsers here as the platform grows. 
diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py new file mode 100644 index 0000000000..82ef1ec3d2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ + StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.Targets.Generic.Layers import AddLayer +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate +from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings + +XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) + +XDNA2Mapping = { + 'Add': AddLayer([XDNA2AddMapper]), +} + +# Buffer classes reuse Generic templates since XDNA2Deployer manages its own +# output format (MLIR + test headers) and these templates are never rendered. + + +class XDNA2VariableBuffer(VariableBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2TransientBuffer(TransientBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2ConstantBuffer(ConstantBuffer): + initTemplate = AllocateTemplate.referenceGlobalInitTemplate + allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate + deallocTemplate = FreeTemplate.referenceGlobalTemplate + + +class XDNA2StructBuffer(StructBuffer): + initTemplate = AllocateTemplate.referenceStructInitTemplate + allocTemplate = AllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +# No topology optimization passes needed for the initial Add-only platform. 
+XDNA2Optimizer = TopologyOptimizer([], name = "XDNA2Optimizer") + + +class XDNA2Engine(DeploymentEngine): + + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", + includeList = None) -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + + +class XDNA2Platform(DeploymentPlatform): + + def __init__(self, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer): + if engines is None: + engines = [XDNA2Engine()] + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py new file mode 100644 index 0000000000..050413eedc --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + + +class XDNA2NodeTemplate(NodeTemplate): + """Base class for XDNA2 templates. + + Temporary Feature: + Unlike Mako-based templates for C code, XDNA2 templates do not produce + code snippets. Instead they store AIE kernel metadata that the + XDNA2Deployer reads when generating the holistic MLIR module. + """ + + def __init__(self, kernel_fn_name: str, kernel_obj: str, kernel_src: str, tile_size: int = 1024): + """Initialize an XDNA2NodeTemplate. + + Parameters + ---------- + kernel_fn_name : str + Name of the AIE C++ kernel function (e.g. "eltwise_add_bf16_vector"). + kernel_obj : str + Compiled kernel object file name (e.g. "add.o"). + kernel_src : str + Kernel source file name relative to TargetLibraries/XDNA2/kernels/ + (e.g. "add.cc"). + tile_size : int + Number of elements per tile (default 1024, max 4096). 
+ """ + # Empty Mako template — no C code is generated per node. + super().__init__("") + self.kernel_fn_name = kernel_fn_name + self.kernel_obj = kernel_obj + self.kernel_src = kernel_src + self.tile_size = tile_size + + def getAIEParams(self, operatorRepresentation: dict) -> dict: + """Return the aie.iron parameters for this node. + + Parameters + ---------- + operatorRepresentation : dict + The operator representation dict produced by the parser. + + Returns + ------- + dict + Parameters to pass to the corresponding aie.iron design function. + """ + raise NotImplementedError + + +class XDNA2AddTemplate(XDNA2NodeTemplate): + """XDNA2 template for BF16 elementwise Add.""" + + def __init__(self): + super().__init__( + kernel_fn_name = "eltwise_add_bf16_vector", + kernel_obj = "add.o", + kernel_src = "add.cc", + tile_size = 1024, + ) + + def getAIEParams(self, operatorRepresentation: dict) -> dict: + num_elements = int(operatorRepresentation['size']) + tile_size = min(num_elements, self.tile_size) + # Ensure num_elements is divisible by tile_size + if num_elements % tile_size != 0: + tile_size = 1 + return { + 'num_elements': num_elements, + 'n_cols': 1, + 'n_channels': 1, + 'tile_size': tile_size, + 'trace_size': 0, + } + + +referenceTemplate = XDNA2AddTemplate() diff --git a/Deeploy/Targets/XDNA2/Templates/__init__.py b/Deeploy/Targets/XDNA2/Templates/__init__.py new file mode 100644 index 0000000000..4694b67df5 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/Deeploy/Targets/XDNA2/TypeCheckers.py b/Deeploy/Targets/XDNA2/TypeCheckers.py new file mode 100644 index 0000000000..cb9c98fd39 --- /dev/null +++ b/Deeploy/Targets/XDNA2/TypeCheckers.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, 
Sequence, Type + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer + + +class XDNA2AddChecker(SignPropTypeChecker): + """Type checker for BF16 elementwise Add on XDNA2. + + Both inputs and the output are bfloat16_t pointers. + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: + # Float types do not have a meaningful nLevels — return 1 as a neutral value. + return [1] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: + # BF16 is a signed floating-point type. + return [True] diff --git a/DeeployTest/Platforms/XDNA2/CMakeLists.txt b/DeeployTest/Platforms/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..d017d7f22f --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/CMakeLists.txt @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) testbench CMake configuration +# +# Included via add_subdirectory() by the top-level CMakeLists.txt when +# -Dplatform=XDNA2 +# is passed. It orchestrates two build steps: +# +# 1. Compile network.mlir to network.xclbin + npu_insts.bin with aiecc.py. +# 2. Compile the XRT host binary (main.cpp) with the system compiler. +# +# AIE kernel compilation is handled by TargetLibraries/XDNA2/CMakeLists.txt. 
+# +# Required variables (set via environment or CMake cache): +# MLIR_AIE_INSTALL_DIR – path to the mlir-aie installation +# (auto-resolved from aie.utils.config or env) +# LLVM_AIE_INSTALL_DIR – path to the llvm-aie installation +# (auto-resolved from aie.utils.config or env) +# XRT_INSTALL_DIR – path to the XRT installation +# (default: $ENV{XILINX_XRT} or /opt/xilinx/xrt) +# GENERATED_SOURCE – directory containing network.mlir, testinputs.h, testoutputs.h +# (set by the Deeploy test runner) +# TESTNAME – name of the test target (set by the Deeploy test runner) +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Resolve toolchain and runtime paths +# --------------------------------------------------------------------------- +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- llvm-aie (Peano) install dir (needed for --peano flag) --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Set LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") + endif() +endif() + +# --- mlir-aie install dir (needed for aiecc.py) --- +set(MLIR_AIE_INSTALL_DIR "$ENV{MLIR_AIE_INSTALL_DIR}" CACHE PATH "mlir-aie install dir") +if(NOT MLIR_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.root_path());" + OUTPUT_VARIABLE MLIR_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT MLIR_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find mlir-aie install dir. 
" + "Set MLIR_AIE_INSTALL_DIR or install the mlir-aie wheel.") + endif() +endif() + +# --- XRT install dir --- +if(NOT XRT_INSTALL_DIR) + if(DEFINED ENV{XILINX_XRT}) + set(XRT_INSTALL_DIR $ENV{XILINX_XRT}) + else() + set(XRT_INSTALL_DIR "/opt/xilinx/xrt") + endif() +endif() + +set(AIECC_PY "${MLIR_AIE_INSTALL_DIR}/bin/aiecc.py") + +# Deeploy-generated sources +set(NETWORK_MLIR "${GENERATED_SOURCE}/network.mlir") + +message(STATUS "[XDNA2] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] MLIR_AIE_INSTALL_DIR = ${MLIR_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] XRT_INSTALL_DIR = ${XRT_INSTALL_DIR}") +message(STATUS "[XDNA2] GENERATED_SOURCE = ${GENERATED_SOURCE}") +message(STATUS "[XDNA2] TESTNAME = ${TESTNAME}") + +# --------------------------------------------------------------------------- +# Step 1: Compile MLIR -> xclbin + npu_insts.bin +# --------------------------------------------------------------------------- +set(XCLBIN "${CMAKE_CURRENT_BINARY_DIR}/network.xclbin") +set(NPU_INSTS "${CMAKE_CURRENT_BINARY_DIR}/npu_insts.bin") + +add_custom_command( + OUTPUT "${XCLBIN}" "${NPU_INSTS}" + # Copy kernel objects into aiecc.py working dir so the linker scripts + # generated by aiecc.py can find them via INPUT(kernel.o). 
+ COMMAND ${CMAKE_COMMAND} -E copy ${XDNA2_KERNEL_OBJECTS} "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${CMAKE_COMMAND} -E env + "PATH=${MLIR_AIE_INSTALL_DIR}/bin:$ENV{PATH}" + "python" "${AIECC_PY}" + --no-aiesim + --no-xchesscc + --no-xbridge + --peano "${LLVM_AIE_INSTALL_DIR}" + --aie-generate-cdo + --aie-generate-npu-insts + --npu-insts-name npu_insts.bin + --aie-generate-xclbin + --xclbin-kernel-name=MLIR_AIE + --xclbin-name network.xclbin + "${NETWORK_MLIR}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS "${NETWORK_MLIR}" ${XDNA2_KERNEL_OBJECTS} xdna2_kernels + COMMENT "[XDNA2] Compiling MLIR -> network.xclbin + npu_insts.bin" + VERBATIM +) +add_custom_target(xdna2_xclbin DEPENDS "${XCLBIN}" "${NPU_INSTS}") + +# --------------------------------------------------------------------------- +# Step 2: Compile XRT host binary +# --------------------------------------------------------------------------- +add_executable("${TESTNAME}" + "${CMAKE_CURRENT_LIST_DIR}/main.cpp" +) + +target_include_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/include" + "${GENERATED_SOURCE}" +) + +target_link_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/lib" +) + +target_link_libraries("${TESTNAME}" PRIVATE + xrt_coreutil + uuid + dl + pthread +) + +target_compile_features("${TESTNAME}" PRIVATE cxx_std_17) + +# The xclbin and npu_insts must be available at runtime in the same directory +# as the binary. Add a post-build step to copy them. 
+add_custom_command(TARGET "${TESTNAME}" POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${XCLBIN}" "$/network.xclbin" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${NPU_INSTS}" "$/npu_insts.bin" + COMMENT "[XDNA2] Copying xclbin and npu_insts to binary directory" +) + +add_dependencies("${TESTNAME}" xdna2_xclbin) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp new file mode 100644 index 0000000000..07ffb7a0ca --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +// XRT C++ testbench for the XDNA2 (AIE2p) platform. +// Loads network.xclbin produced by aiecc.py, runs the MLIR_AIE kernel, +// reads back outputs and compares against golden reference values. +// Output format: "Errors: X out of Y" (required by output_parser.py). + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_hw_context.h" +#include "xrt/xrt_kernel.h" + +// Generated by Deeploy's generateNetwork_xdna2.py: +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} defines +// testoutputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_OUTPUT{i} defines +#include "testinputs.h" +#include "testoutputs.h" + +// --------------------------------------------------------------------------- +// BF16 helpers +// --------------------------------------------------------------------------- +static float bf16_to_float(uint16_t bf16) +{ + uint32_t f32_bits = static_cast(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; +} + +static bool bf16_nearly_equal(uint16_t a, uint16_t b, + float rtol = 0.0f, float atol = 0.0f) +{ + // Default: allow 1 BF16 ULP difference to account for hardware rounding. 
+ // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, static_cast(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; +} + +// --------------------------------------------------------------------------- +// Read the NPU instruction binary produced by aiecc.py +// --------------------------------------------------------------------------- +static std::vector read_instr_binary(const std::string &path) +{ + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast(instr.data()), byte_size); + return instr; +} + +int main(int argc, char **argv) +{ + // Paths to the compiled artefacts (relative to the binary's working dir) + std::string xclbin_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/network.xclbin"; + std::string instr_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; + } + } + if (argc >= 2 && argv[1][0] != '-') xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. 
Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. + // ----------------------------------------------------------------------- + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. 
Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast(n_instr), + bo_in0, bo_in1, bo_out); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) + << " (0x" << std::hex << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) + << " (0x" << std::hex << golden_out[i] << std::dec << ")" + << " diff=" << std::fabs(bf16_to_float(hw_out[i]) - bf16_to_float(golden_out[i])) + << "\n"; + } + } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f + << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") + << "\n"; + } + } + + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + + return (errors == 0) ? 
0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..3cfdd76a1f7a397616b6f353c278b0bb20ce5efb GIT binary patch literal 8706 zcmd6NX;{wR*KdO+LxV=Cq-Y{Ti0c0Cb&DcN$xj(eD2dV_M5!d2=ZT_HiDpF_?(beV zqDUGH4MG$lrGzp+e$W5SdCs}coAc_deeJ!kwbov*_O;iw_UE%~EQKe?3H(Q93WPqt zF&rcI>nvPguz0TST=f9; z|5us#QaFPwQVYcdRW5WkctU9HH*77wh-q(yNc8(`ntUOgr{J^&jn$pNQFRq4o1A96 zEaG72_Kzg>`&R6~&m$ZCd{OPy4p0qT%f_x{sLT3!$nnpB?;JIH*Xk+lTyvGWnTdn{ z^jcE3{vk<7>Lxd&CAiu0E!18^824_l!AE7Ppm!n;EVjR61Lf>-k<13dMVT!KD*4+{@#SGy<=T!@4i!@Nk`^hPssNv+8o~o5OHhg`r#q@YQ0v^Su=@21 zT4s@hF;ekp?6eQNB7c)99WvzNtp(7{@1iD~G{_Y#btv3$qQR?bs^xQw zn#-I)-+#W5;Elsnd~om)S>3?DXJf`CT@qrh{6o$YB$|x8o1vYBXAR8RIPj zaY^ZYaOsteD>#Z`O^`dymyQXBpm@?!(qisJy1x}e*WHcq;CdxF zVO0yebkp$W9Zi&9?2g*W^_07`440qLfXy+xsM87_2L2Al-jiA6P^=xiO!@=W{>EHC z{YS*nHUPbS#Q8^M!pU0|8@zV;5#3R)g8p-)=|t}^uy0i-bGNNeU7CR?P8xwc+k+jjb^kPp2`X=UJZ_NTyFykJyKhh-mj_%kG9(50}!J3+rgQbS)HBMB}5$&M0$b z7L-cdqZ=RR*CuY)rU_4qh=a~mNIF@~%M7`1pjrs7B%>F z8aIWD;i%sZteJd+CBJ%5v!NDK%*44X?&acQuI20zP|iB&Db@JSbl0jXz?d==rA@45>m(YHyJ{R#})6@!k4(?I6*DYEtP z7m~K_1!nAbhla~9*apz*NTqWv!Wf4L19aRN0QqexsJePJh#RkWOWGdxmL zy$QDKPQ_MR8z{aN28o9~$URX{;^7uS3?^*Hn=*>@`8*Z6Xl)%i_Bjw#j-N&cApvTa z#lroq1iv4*#jU#5BwKeAY*}o=ahhHLU3Oa`G}s=Jx~72HxE%Mbye+LX65ub{)C5WP z@9F;GZf0v{KC~+|;rAt3Sp4-qo?L5*Q=R8w9$!UXhb;eDwgZzLHA2rV z9w3~GMW7oyg~+Fw!mdg;Vp}c-N1PX<0TF}0W*2I6xP@ZI%h z6AP{ucfx_)B+=(O40+Tt&BbyMnC!}K2%JZCqgLS|n{HzNdowof(u30X5g=GlOAKNY zsJ_Qr;ym#@oe*>YPVRe;wT}wvVB2rhc0in%SOOh-&hT{pGL*38V|xw5Kcz7V<=!fD zwJ$#>Gr3auTWyH=H`U{9<22e))5t_@EMv{Q*MPlm8F5pb1A0aa(4^fNgo0b3YU>dc zAc3a_5+KfG8h^(&3taHL9q)u}z?m5VWd7-;^lD8o-6p6>h0JV7rD8Bn`@9&A z=IX)~&D-Sr5lK+4OC`BPi0k1{&I%i!qx1W@IDh6z>{^qFMq#_Khabn*>J$>YqwAR& z-&bIuw+m>kVW9JEVIl1Vt0y)p#1{6Aomo`M3q?a*nO7m+l20Jjch;0ED9 zVypfRI`%(A+3z_x_Ns%fHWR@dQ5(qfJxsnV?8V^`VMy(iMTMC;kaSrcmwnT~v!^G5 zmh}o)?f#nN-)P4yo&+q}eG-j6c#ep`(5-4)6@&ve#jftgnhD{Aw^b 
z_mbL$2s5Ik;l%xd8rClxAsgew(e!2;mEBhY9~OqiKj?3JNMf>q(-sb?r* zr$hSQ*VC}j4`8>&fZ7KM_Yh_czNaI{I%(~L zEYg4J7`<2R18S<${E-VA&~xo-ydEUS7rJr?i&nXz(~uP{^o@aeBS-1gkr33g*vr~l zo&wJfKlGfw7iE4)!uGD6H1@`9XiRbBnPe>juYi+S@!O22j`q~%LOM=XZ}?l6&Ij?b z3M$`q0uOzwV?XSSgN!j5{?m9w3)B7hPOKJ4X&mYISP6~8@%VXdEVi$DO=PZkFj_|^ z!^62;`qVXsjK7;nZcSeWN@?q0%)S}np%j0iQ4s79OM^Q(Cs09Z2yeIDW85q~AW88J z$jzMq`RgfNlDiuhoj2ha8J(dx7>q4y!d#c2KA7Yd4_&-{81-rk#86GJPku$jzhpvb zz6KSY{@H|il@TcESw?;@ zxJg#l=Yq!lC>V7Sqnf6fIO3E;pL#ivcSGyw^I=(3I~q)O2n*tHgF51&O!BR@i3WZ3 zBQ7r^NpnX$GR|MAtV9)vU=pr8EXE(P8N#23hd3tYzc^o?4YP@E=};v#hu?Ai9$E6N z9%gKfLxDp#=_!_r&qjA+%cOkzq5AX*CV!f2Y&Vy29f8-<;s=7-FvB%*pzvh|EmxBw<5icp_I1 zvYd~h>A5P*<{n^8eit&C;R`XC{sYeU3vp;m3$>|{N6R+CTGhvp9HC%Lbu)rpV!@2% zoKL7;@g2LLSny2EcGAJ7hwNdK*F4>CE=2E)HZ=ax)3)zMA6 zrmOQkRl4x?@{MR)FNrovt&|Z{W0s!Fq_=EN!4WBae8Jy_`t!e#f!I1M3aux5#M&6W z9}4us+D7=bdIk3&FrJKSl!R63KW zecPlq;x)OilMD}@wZW-V=Sk!T1vqV;h}Gj2Y;&b2%qZIqMrE7G_4hmBj@b{AlXC`M zX(bUwwMx2cw2*YinDEa?)R8lRd-*sG?-x#P^_ei7pT(iXP0 zd9zXF#@Mpt5glEn0ZMaLng}i^AvO_nVA%lDV`rAbrLh}itK@m~K34_J3j|QF>mmNW z6^a>GPT-Y_jf{rj-#@T#5xtkI2D2xW5V6*5BJsGD2-K>A&Hn%|YNE}Q|4;B@;s3AT z#kIGh{QT4!8W%Q+pI9NwENDrs6?ha1dnTm8id<__peTS7_l%Q_*Ic-GjJxIexGD)$y~z4R3D{q^g> zJ?lpbdX%Y1z%w!{BY@FXe8o(XX{DxL$}uGQHcFXj!-c|Vy84U~&fhwTQxaqe8?G!P zOGmYF&R7QUHSR%};V&X=>IYHd=Rqky4$_yXg1P=QI4UjzalTQQwsR5KL>A)0&RKGZ-u=?TJgLiOyj0@BbG(W^W3G}yg#i2+`-ZeuS(?<1 z0xl2mqVZ`QdS`tEwXS^yQs?io{pd$?IHLSJ?{{PDl~Z(L(tK=uS_VF^btoOk1p(^M zEcI7n-gRAL!|xq2IV#uR{H$&^XBMbv{+TCf28mwh5HYwM_}=Lzt776O<2 zIHbn&5a^~wk}a$hs^=PjM))h{{<%_P8qNd7rz-s7L)Y-bNGe+Ws-}Y5PSDB2d|bcC zA7|{?hMwlC++&$v!tDk5IotLrlG<%g@+ZhFfm-qwh~C{wcvTsFijZDufJK zgN1ouy}ymz*|-|!XS?9VnW8Wu+8A^)WpN^rf|3zwI956tdn#k`WPB>N>Rw@MR$*F%YQw@Z2?|{m|99XXxgf{})L2gJ4chgWv;6yS+Fg_$|5?NVj$7Ci_3{;%}Jqd+JPs%KQHETMCx zk1qMWhAocxz_yFoq5g4Qs5+zti$)b;O>G^_JXsC=airG;kXYu6D@GcSA^9(HJk zJi{6yw!)DfZ%Bjf0~7gD{Q`V>o+aEWKFkCf2a(Nxftaj+C@;uJfQ!FvA-Z0l6gT$J zgCh56{Yr7Z_q!01?6nDA7vIJ6wz}A)orI<{4`Z~{R;I&6g#U=E3VA=d5E>Umhi<9y 
zE#(8Tw)q@fzPFH{I&_7uoS2HX<5pnyXf8@Wd_X4Llm;J(0T6DnroCZ&vQ~kjpJN(a zD>KEEXekt0Udc%0R-Eh5P9 zU>g|}9s;d5AIau@I`GtApPT$~3!P-T7H5h-B?tZj8f{NRF=h5iJpM?QE4C@0NFFx9 zy+_^P_~{pHv0pG%9gzZ;ZFv|Id6is!`Wf0whRBQae45`Xi1JEp)LHBV?LKgW?n-on z`<;Ia{i1DP?sAh_{o#{Y%i}Ovx&X#JGodEm7yNr~;Hu^Sz#NNaa@wy6ttZvItx=@Peir+4@CClN8G<-2s~CP1M||DPOz7Sv4dA}&h}K;_%e^4`I~z!YJSj! zJURUMcp(^xhLEuiEjX(v&69Q#hD|&Lwi}_PFKzPeOs7uv`37rI#u18Q#n>!9J zcffADR=jkZP=gm9z$Xr=2(_YQkg!Vzgr%IA? z@(`#58R1y$7EE#&AP;2%K%j3H6MiWdKdF?UYf&Y7Eq{gGMm>zDU>x){ZRO<@Nb@W0 zR?vp4TC`)9C^s;q249OdP^ZmZ5Vz|Mu~;GvQ23tNHgt*XlUW6U#@6WFumj>~DR$1d zL|%w3B|C4c;*lrSyheu^ctSrMe^0(fgRG;m(kzb%7$&0nN<+A2oQhY+rBUe340Jtz z8S18ACfKrx>TLQ$m~K&iXVC&Svg8kRgo+U-JD#<(Ct5;F z(R(-_A{?^m)$?Pxdff%Q@_jx(ogSdK`?*wqLnpna6amxro`H2o=h3$F;(R?(HBg)@ z2??&}iK*!^P?(mCKCuE^sdK(4+XNu=qa7d3_CN=vn{YWPl+F>chXc9Z#Om`C_F_T_ zT(R?}#vglecD5DvD^GzrLHEe^r3oYS(12`?&~L9{S8G9c{*H2?hSN z&0dhLOrRZa} zkvRN3Va#27uouVs@K1P-Ubq={xku|E#Ji!1O)5*uN*IG^19 z8Vt2xzP9ug?reYypFB8OvgBRfD8VRuUx=n>=cvI6e4XF6$4#{2q z5_!H{aJnW;T5l9r;( zHcQ~x>j3iet~AXFas=mh;+VerE?(;`Lv>woGa{Q7p(wYkmb?t2j8@SI?y0@xDe!3pZC-pHy-|x_W^X`FC zL74L{AJ06V11Z+#C{Zbky+v7M{hc3lQNnJpG&jf9K5FcOVtg=BkpJ>rJnn-_7PP4A~a z0}>gwDG!%Ab-{hhSlB9(M#Od(p;Wa!nGzX`zVrSPQrgxK>3fSw8l2A8x^WCuWfs64 zNQYJ5qCvjC5v85al083z!G7f<90?ZWO;n9_+i}0jKYe5$niMy0cG|U+`fiq&sCmgWzm{1x|P`U@jC}Zi6+a z>w#Up11o#?P$MgVhg$oX)CsYevicF3qbUb{v#&BQx^@s{-;)%Uv2ZKV1@pHkVDisJ z91US77`%-5;CepJb`#=z42Up!vtsc^ZylAl(S_NZS!nQhKWQ~RLrbOR!p{|>Bwk}7 zPOg|scb*A_E3S7*esejdJkf^GM-gb;bQOx^L*V(oByv%3CW)SGN3lX5a-Oi%*vN}0 zE_H;MHYseTsyN=0hI!mD>=u8IQcsK_B%&YYc~wHVW(&&389}mGCH#5yjuzc&W^986 z_<8YRq(&nQXSlB=fsgC)=J-S~(F}p3$5&&?sd!xdQ-Z&BPZ+iDmBR<8@1X3beiFL# z4ZVIP9DT>9;0g6`klmsT(X&NpVSpOkN=w3!lnG41Qw6d&W|%sEeog}Jt-_2DAvkY7 z1+d(hKfE;!_D){G-+fIL(@6-p`dpwt3jY#heXL-AGe z=(_j>e&lb5-9aYcH>Df1avo6Lw{6Ia&x6S=Md)PT1Ts6s*_Aa(*zTl{eTG(8u9br+ zi*GV^bI;+7$VvRqyKmCpx8C5nXfBS|xZ(tHM{JN1;I>NbVIoYA5fP_eoav^GmPfXc zTbd#G;E*eP9XW%^=M(5LF$FAMp+j`WG%=w(f-2niqE7@kf63f&BCm3g7$3XFN;r4_ 
z1!pZG?6?&3=ypP5Tp`-ao56t%pYW-_1F^}t3%3lu6Z@)p~;?CxP=acj6|07)D1cm`m3hNK#z- zU$VZI9qtvw;;U1*YA*fs>mfZ1H+)W{M}vrAO9g(Yhy{greD;??49e-Hf%#l@27ou~Ows18iE4x|YMSJzUuc>Oa%0gZc0*|0n(OPzTm5Yov{z-!it@98iArff}g=!q1(@ zvFA|=Q8U;N%f@v;X5AOK>A3>Go-RX-I*xTyb>YUx-^9jJP)JVr|JHl)p9YJ6*C4@v v*Z-vx<9`qQpSZ`rhyNo%g6!WujsFDxHwt27DI)r>OX%tE|T`A)u*@1ARBuDNH<=9)RU?QF)0De(RW>bwU`p)-Qd z@IrWdG~GP{w*_b}Rns){ zT%@V1rs?Sy5ES6*>*5#S?(vJacHJK6@skhqcJ=r8S!*rSH&~>rt*f?A?f(nufQBxd zdAkIE6TOa;#lpa0{U};`9i(Bd`B3EAEkt)Q)Nrb#W6Gi}b*b<7bXIEyl7{Ri-b~@uj0cyRS&VuY+@cI65T)(7*dMGBM z*N_|*mLCL8{TlGoXThEF6pVJ0XJvA|m^W)0CTXh>-Ot5Tb~Ki>Iab34TPZwZUy4&> zV=5;kNnk_WR|u1pXJ$q9^!)l2*fS>*uf%VGPdgSuZE`3K+V_F`(JOGh^$F<5cu}XJ zo3wm4Mdgw#QY3p8`5!f4P$>s$$HG`j#dPL*#s#z02TA+)SKv9m6q064MtISS1F3nW zZ>JcSUzLe@-pkQ8pdF*t$FU_N0q~CehNkwRwD*b@Gx#)}MZ7Dd{$H|5PTK_T)5bJp z`X9i2LmoyrOvLH;L$N7Bg2jyd1Rr zd>Y5w-i*1G<5~Bo^`x%S4dXT)Bw}Lc>HXi+Nq?g$slTTQp|88C+TD1(y~LY{@0!o; z&+mb_p%@5vXhEyK8Waxu!J?!cICb+~T{rYh(ggPSb#mQJ(}& zdUDY1+W@vdG+5TI@k~jefLlk5s9NfQ$_%+UOq`*B`d%uWjM@h<-knEe>aGf{9~M!M zWl}6Ns*hf}nFO`!O?YP6bx8WVk^Heq0Wr@A)6!P4I;ForN$mg>#`=&fqiD9<>X>l; zXfq78>}GQ=t;K{BK3M1J1iYMGnAdcQyo|^prxSj_3zxGXVJO4Ay;{+CxCq`g_EEQL zU$nmV7Mu8|ajx4aSpCDt8h#SY@EyX!lLBh1lTAy~CNl5Ko<#C)1^8?7xDWpK;mdS+ z&i&LxF5NQ@vclZBk@KS1wZs%NWRB3IEoHzbFHuDzg1*X;kAr zq*CAGk@W9mOYuzhJV%>8nd8L$VSNm&PRHW9f-1CJbBii1t;BcBqA=Se3-}-9vB$g; ztUv0oGd_BNQqedd{Tf2F7IO}k$Iz`tf(2J!1NZC{l-eOiW3-A%zjy{jqz6_$F4+et zT?EAUB}D&S0a5yMS?{b=6l=<+Irj$f%c0NYcDEeowsJgE*|m{aKH$TQ1=leAx+a7e z711gW8ijvAp{)T#=)YbX^w}8yI`#zY#3Ug+G6h6ahDdT}4dq?GFRWEr%%0_%vJcy` zap9F|?5##TY8XspdaAeaU2Q(_-CvM>%FiLZr2@N34Y;`}_QG6M8EndG112Xg==qR_ z7KZ{c|JG+>SuM-0*+!A)s&aFY|4VZBL^;7)Y3`!%6~4JRpVQ6RN9!-OkkU(?ctr9R z9x~Ym2eMbe+m1@2U9cV1^fiP}<2o?@&oylG=@i&K*9M#h#;bIonr{JAZ8PQU-kF$X ztsI6Lo(K~;Hy&R9J_5a+(}iBz#^@3j1}tKcVCtR%OlqEr=L_WF&Fl#LKA{74m#U(> za6fiATeB6xR&1+>EK}RljNyCzLA_`UiY$l7`k%MisC$m7%}f}WmYl%~c^0ukMNqT+HHp2Rif3>B{Bc$HF)_n}zQ3G{4b_ub_Z}m{zj8^q$?*hP 
z)pZBgNrt$eh?=vux6!~`e>0*503mu!6hEi;QZDT9TO(P#t6+8KVd~YR2%EEKWAU2}L|TQ<8}cp2d%x)AzfdNSy>*yxYV`S=;p^|*%d-#h9YRXJwp=} zJCLeQ0N&rqH{WV0G1qiHceO_wxK(qx@{8|;g0+Fr{;dm7;0sWKdD!T5o8~wi$5H7XnE70e`)61W602H~_xdq?X`T*8BZ&pU$qb4AQuwh!lI93VZRCt=GiEjDv) zC;Fx+V?lfZri@?1KHipN&#LudaP18At?w34Lm#N0z+*k%x4?_kEfDeXPrN;>4NjBG z>HXZv%=t_{u{?H}IR99~naBR?zRdi@Tpe>?@a&|oV4N*QExBDplgZPC!Sy%%x~mDxz9B~)wz_F{JmVrD$5 z7@r`TtNH9~%Vo&VJxfO%(y(b;E1npZVcZWJGuxp1DE{Yq{Gu!dU)raz?FJUCL3I+A zUUr0w>>n~#uTFy+KL>3N77f^`{pG3@}Z0n z>d9)<4&pPt7_gI4$2Z%b(KXQ>SQ@KECgqo7+@c0*zvdzt{Q2jeh?`=N{$!|q9ZDoB z4Y=RCM^I0t8TZQDV1C#Za_^7?yR5+Da!SP^c=J2lTX&i$EG{E<33G_?Lsb~5mL~k6 zNcJGf07YNxFi9y5tUMsZi?xSv!{OcBT-k16Kh=V9$9Zgllmf<_aU)s>dSS zw}ii+n(_D^rn8RkMF?4nQJd$%G=VzQHi~0U+-2y#uoV8bIS#ga`82w$o$PI>7u;L0 zhQ*xwN_IZ`irG^vnaP$+V20Q5`p{0a?RAEv(LTrt$)QG0%@CH^Zx(i4mE5>y4+}16 z&;t#t(Qp4&w2Jx-wLJCMR&{AImb#L9kDLVI$NgZIv7G8{Oe76m60l{r1RUwE#!ST& zFjUrr0nsDamA(gwk^zW`?8GHK4YVvqnq^t8mLJpRm$# None: "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") + config.addinivalue_line("markers", "xdna2: mark test as an XDNA2 (AIE2p) platform test") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") config.addinivalue_line("markers", "models: mark test as a model test (full networks)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py new file mode 100644 index 0000000000..9d4f27a477 --- /dev/null +++ b/DeeployTest/deeployRunner_xdna2.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. 
+ +Usage (from DeeployTest/): + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular [--skipsim] [-v] +""" + +import sys + +from testUtils.deeployRunner import main + +if __name__ == '__main__': + sys.exit(main(default_platform="XDNA2", default_simulator="host")) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py new file mode 100644 index 0000000000..69af71f429 --- /dev/null +++ b/DeeployTest/generateNetwork_xdna2.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""XDNA2 network generation script. + +JUNGVI: TODO: Move this script to ONNX4Deeploy + +Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. +Instead of emitting C code it: + +1. Loads the ONNX model and npz test-data. +2. Prepares the XDNA2Deployer (type checking + graph binding). +3. Emits ``testinputs.h`` and ``testoutputs.h`` with raw BF16 uint16_t arrays. +4. Calls ``deployer.generateMLIR()`` and writes ``network.mlir``. +""" + +import os +import struct + +import numpy as np +import onnx +import onnx_graphsurgeon as gs + +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log + + +def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: + """Convert a float32 numpy array to an array of BF16 bit patterns (uint16_t). + + Uses round-to-nearest-even (the standard IEEE 754 rounding mode). + """ + f32 = arr.astype(np.float32) + raw = f32.view(np.uint32) + # Standard round-to-nearest-even: add 0x7FFF + BF16_LSB to the full word, + # then truncate. 
The 0x7FFF biases values just below the midpoint to + # round down, while adding the BF16 LSB provides tie-breaking to even. + bf16_lsb = (raw >> 16) & 1 + raw = raw + np.uint32(0x7FFF) + bf16_lsb + bf16 = (raw >> 16).astype(np.uint16) + return bf16 + + +def _bf16_to_float32(bf16: np.ndarray) -> np.ndarray: + """Convert an array of BF16 uint16 bit patterns back to float32.""" + f32_bits = bf16.astype(np.uint32) << 16 + return f32_bits.view(np.float32) + + +def _generate_xdna2_inputs_header(input_arrays: list) -> str: + """Generate testinputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include ") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(input_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testInputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_INPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testInputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def _generate_xdna2_outputs_header(output_arrays: list) -> str: + """Generate testoutputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include ") + lines.append("") + + vec_names = [] + for idx, arr 
in enumerate(output_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testOutputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_OUTPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testOutputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def generateNetworkXDNA2(args): + log.debug("Arguments: %s", args) + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs_npz = np.load(f'{args.dir}/inputs.npz') + outputs_npz = np.load(f'{args.dir}/outputs.npz') + + test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] + test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] + + # XDNA2 is a non-signprop platform: signProp = False + platform, signProp = mapPlatform(args.platform) + + inputTypes = {} + inputOffsets = {} + + for index, (name, values) in enumerate(zip(inputs_npz.files, test_inputs_f32)): + if np.prod(values.shape) == 0: + continue + # Force bfloat16_t — BF16 test data stored as float32 in npz would be + # inferred as float32_t by minimalFloatType, but the XDNA2 kernel + # requires bfloat16_t inputs. 
+ inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) + inputOffsets[f"input_{index}"] = 0 + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + deeployStateDir=_DEEPLOYSTATEDIR, + inputOffsets=inputOffsets) + + # Prepare the deployer (type checking + binding) + deployer.prepare(_NoVerbosity) + + # Create output directory + os.makedirs(args.dumpdir, exist_ok=True) + + # Write testinputs.h (raw BF16 bit patterns as uint16_t) + testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) + with open(f'{args.dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # Recompute golden outputs from the actual BF16 inputs the hardware will + # see. The original outputs.npz may have been computed in float32 + # precision, which can differ by several BF16 ULPs. + bf16_inputs = [_float32_to_bf16_uint16(a.flatten()) for a in test_inputs_f32] + bf16_input_f32 = [_bf16_to_float32(b) for b in bf16_inputs] + golden_f32 = bf16_input_f32[0] + for inp in bf16_input_f32[1:]: + golden_f32 = golden_f32 + inp + test_outputs_bf16 = [golden_f32.reshape(arr.shape) for arr in test_outputs_f32] + + # Write testoutputs.h (raw BF16 bit patterns as uint16_t) + testOutputStr = _generate_xdna2_outputs_header(test_outputs_bf16) + with open(f'{args.dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # Write network.mlir + mlir_str = deployer.generateMLIR() + with open(f'{args.dumpdir}/network.mlir', 'w') as f: + f.write(mlir_str) + + log.info(f"[XDNA2] Generated: testinputs.h, testoutputs.h, network.mlir -> {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description="Deeploy XDNA2 Code Generation Utility.") + args = parser.parse_args() + + if args.platform != 'XDNA2': + parser.error(f"This script is for the XDNA2 platform. 
Got: {args.platform}") + + generateNetworkXDNA2(args) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..572df44be1 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -29,6 +29,8 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: if config.tiling: generation_script = script_dir / "testMVP.py" + elif config.platform == "XDNA2": + generation_script = script_dir / "generateNetwork_xdna2.py" else: generation_script = script_dir / "generateNetwork.py" @@ -166,6 +168,9 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: # Run binary directly binary_path = Path(config.build_dir) / "bin" / config.test_name cmd = [str(binary_path)] + # Propagate verbosity to the host binary (e.g. XDNA2 main.cpp uses -v) + if config.verbose >= 1: + cmd.append("-v") else: # Run via CMake target cmake_cmd = os.environ.get("CMAKE", "cmake") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..78d5ff9cd6 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "xdna2": "XDNA2", } if args.platform: diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906f9..28425393cb 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -29,9 +29,11 @@ from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform +from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer +from Deeploy.Targets.XDNA2.Platform import XDNA2Optimizer, XDNA2Platform 
_SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "XDNA2": + Platform = XDNA2Platform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -273,6 +278,22 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, XDNA2Platform): + if loweringOptimizer is None: + loweringOptimizer = XDNA2Optimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = XDNA2Deployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..dca5c7b7cc 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -42,6 +42,7 @@ from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS from test_softhier_config import KERNEL_TESTS as SOFTHIER_KERNEL_TESTS from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS +from test_xdna2_config import KERNEL_TESTS as XDNA2_KERNEL_TESTS from testUtils.pytestRunner import create_test_config, run_and_assert_test @@ -117,6 +118,11 @@ def param_id(param): "model_tests": GAP9_MODEL_TESTS, "default_num_cores": GAP9_DEFAULT_NUM_CORES, }, + "xdna2": { + "platform": 
"XDNA2", + "simulator": "host", + "kernel_tests": XDNA2_KERNEL_TESTS, + }, } ### Markers summary ### @@ -987,3 +993,21 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.xdna2 +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", XDNA2_KERNEL_TESTS, ids = XDNA2_KERNEL_TESTS) +def test_xdna2_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["xdna2"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_xdna2_config.py b/DeeployTest/test_xdna2_config.py new file mode 100644 index 0000000000..7988aa09b1 --- /dev/null +++ b/DeeployTest/test_xdna2_config.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# Test list for the XDNA2 platform. +# Each entry is a relative path under DeeployTest/Tests/. + +KERNEL_TESTS = [ + "Kernels/BF16/Add/Regular", +] diff --git a/TargetLibraries/XDNA2/CMakeLists.txt b/TargetLibraries/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..c2e1ffdecd --- /dev/null +++ b/TargetLibraries/XDNA2/CMakeLists.txt @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) kernel library +# +# Compiles AIE C++ kernels using the llvm-aie (Peano) cross-compiler. 
+# Exports a CMake target `xdna2_kernels` that other targets can depend on, +# and sets XDNA2_KERNEL_OBJECTS in the parent scope. +# --------------------------------------------------------------------------- + +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- Resolve llvm-aie (Peano) install dir --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) +endif() +if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Please set the environment variable LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") +endif() + +# --- Resolve mlir-aie include dir (aie_api headers) --- +if(NOT MLIR_AIE_INCLUDE_DIR) + if(DEFINED ENV{MLIR_AIE_INCLUDE_DIR}) + set(MLIR_AIE_INCLUDE_DIR $ENV{MLIR_AIE_INCLUDE_DIR}) + else() + execute_process( + COMMAND ${Python3_EXECUTABLE} + -c "import aie.utils.config; print(aie.utils.config.cxx_header_path());" + OUTPUT_VARIABLE MLIR_AIE_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _aie_cfg_result + ) + if(NOT _aie_cfg_result EQUAL 0 OR NOT MLIR_AIE_INCLUDE_DIR) + message(FATAL_ERROR "[XDNA2] Could not query aie.utils.config.cxx_header_path(). 
" + "Please set the environment variable MLIR_AIE_INCLUDE_DIR or install the mlir-aie wheel.") + endif() + endif() +endif() + +set(LLVM_AIE_CLANG "${LLVM_AIE_INSTALL_DIR}/bin/clang++") + +message(STATUS "[XDNA2 Kernels] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2 Kernels] MLIR_AIE_INCLUDE_DIR = ${MLIR_AIE_INCLUDE_DIR}") + +# --------------------------------------------------------------------------- +# Compile AIE kernels +# --------------------------------------------------------------------------- +file(GLOB XDNA2_KERNEL_SOURCES "${CMAKE_CURRENT_LIST_DIR}/kernels/*.cc") + +set(XDNA2_KERNEL_OBJECTS "") + +foreach(KERNEL_SRC ${XDNA2_KERNEL_SOURCES}) + get_filename_component(KERNEL_NAME ${KERNEL_SRC} NAME_WE) + set(KERNEL_OBJ "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.o") + + add_custom_command( + OUTPUT "${KERNEL_OBJ}" + COMMAND "${LLVM_AIE_CLANG}" + --target=aie2p-none-unknown-elf + "-I${MLIR_AIE_INCLUDE_DIR}" + -std=c++20 + -Wno-parentheses + -Wno-attributes + -Wno-macro-redefined + -Wno-empty-body + -O2 + -DNDEBUG + -c "${KERNEL_SRC}" + -o "${KERNEL_OBJ}" + DEPENDS "${KERNEL_SRC}" + COMMENT "[XDNA2] Compiling AIE kernel: ${KERNEL_NAME}.cc -> ${KERNEL_NAME}.o" + VERBATIM + ) + + list(APPEND XDNA2_KERNEL_OBJECTS "${KERNEL_OBJ}") +endforeach() + +add_custom_target(xdna2_kernels DEPENDS ${XDNA2_KERNEL_OBJECTS}) + +# Export kernel objects to parent scope so the testbench CMake can use them +set(XDNA2_KERNEL_OBJECTS "${XDNA2_KERNEL_OBJECTS}" PARENT_SCOPE) diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc new file mode 100644 index 0000000000..1a53e47398 --- /dev/null +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#define NOCPP + +#include +#include +#include +#include +#include +#include + +template void eltwise_add(T_in *a, T_in *b, T_out *c, int size) +{ + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } +} + +template void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) +{ + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); +} + +extern "C" { + +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) +{ + eltwise_add(a_in, b_in, c_out, size); +} + +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) +{ + eltwise_vadd(a_in, b_in, c_out, size); +} + +} // extern "C" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6d047b4957..5cbdc0ef64 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie==v1.2.1 +llvm-aie + # Quality of life netron debugpy From d039415104bea6c78bb23c7517efd59fe414b5f7 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 17 Mar 2026 14:14:38 +0100 Subject: [PATCH 03/16] Add XDNA container --- .gitignore | 3 +- Container/Dockerfile.deeploy-xdna | 58 ++++++++++++++++++++++++++++ DeeployTest/Platforms/XDNA2/main.cpp | 14 +++++-- README_XDNA.md | 30 ++++++++++++++ 4 files changed, 101 insertions(+), 4 deletions(-) 
create mode 100644 Container/Dockerfile.deeploy-xdna create mode 100644 README_XDNA.md diff --git a/.gitignore b/.gitignore index 7ffc9ca243..a9993aac54 100644 --- a/.gitignore +++ b/.gitignore @@ -59,4 +59,5 @@ CHANGELOG_GEN.md .cache/ # Claude context file -CLAUDE.md \ No newline at end of file +CLAUDE.md +Container/xrt-debs/ diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna new file mode 100644 index 0000000000..f39d1df3ed --- /dev/null +++ b/Container/Dockerfile.deeploy-xdna @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +ENV LLVM_INSTALL_DIR="nope" + +RUN apt-get update && apt-get install -y \ + software-properties-common \ + && add-apt-repository -y ppa:amd-team/xrt \ + && apt-get update && apt-get install -y \ + cmake \ + ninja-build \ + g++ \ + git \ + git-lfs \ + python3 \ + python3-pip \ + python-is-python3 \ + uuid-dev \ + wget \ + curl \ + ccache \ + libxrt2 \ + libxrt-npu2 \ + libxrt-dev \ + libxrt-utils \ + libxrt-utils-npu \ + && rm -rf /var/lib/apt/lists/* + +ENV XILINX_XRT=/opt/xilinx/xrt +ENV PATH=${XILINX_XRT}/bin:${PATH} +ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib + + +WORKDIR /app +COPY pyproject.toml ./ +RUN pip install toml-to-requirements && \ + toml-to-req --toml-file pyproject.toml && \ + pip install -r requirements.txt && \ + rm -f requirements.txt pyproject.toml + +RUN pip install \ + --extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 \ + --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ + "mlir_aie==v1.2.1" \ + llvm-aie + +ENV MLIR_AIE_PYTHON=/usr/bin/python3 +ENV IRON_OPERATORS_DIR=/usr/lib/python3/dist-packages/aie/iron/operators + +WORKDIR /app/Deeploy diff --git a/DeeployTest/Platforms/XDNA2/main.cpp 
b/DeeployTest/Platforms/XDNA2/main.cpp index 07ffb7a0ca..046384e4db 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -79,9 +79,17 @@ static std::vector read_instr_binary(const std::string &path) int main(int argc, char **argv) { - // Paths to the compiled artefacts (relative to the binary's working dir) - std::string xclbin_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/network.xclbin"; - std::string instr_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/npu_insts.bin"; + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; bool verbose = false; for (int i = 1; i < argc; ++i) { diff --git a/README_XDNA.md b/README_XDNA.md new file mode 100644 index 0000000000..a96a3550c8 --- /dev/null +++ b/README_XDNA.md @@ -0,0 +1,30 @@ +# How to use Deeploy on the XDNA2 NPU + +A dockerfile containing everything required to run on XDNA2 is available to build with the dockerfile at `Container/Dockerfile.deeploy-xdna`. + +You can build it locally on Ubuntu 24.04 with: +``` +docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . +``` + +You need to have XRT installed on your host, once installed it is present in `/opt/xilinx/xrt`. 
You can run the docker container previously built with: +``` +docker run -it \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /scratch/jungvi/IRON:/opt/IRON \ + -e IRON_OPERATORS_DIR=/opt/IRON/iron/operators \ + -v "$(pwd)":/app/Deeploy \ + -v /opt/xilinx:/opt/xilinx \ + --name deeploy_dev \ + deeploy-xdna:local +``` + +Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. + +Once the container is started you can a simple Add node, from ONNX to execution with: +``` +pip install -e ./ && \ +cd DeeployTest && \ +python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ +``` \ No newline at end of file From e66864a75a9b792fbd5198bb96c1054010267c6b Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 11:25:30 +0100 Subject: [PATCH 04/16] First attempt at generating MLIR code with Deeploy --- Container/Dockerfile.deeploy-xdna | 1 - Deeploy/MLIRDataTypes.py | 83 ++++++++ Deeploy/Targets/XDNA2/Deployer.py | 190 +++++------------ .../Targets/XDNA2/Templates/AddTemplate.py | 201 +++++++++++++----- 4 files changed, 287 insertions(+), 188 deletions(-) create mode 100644 Deeploy/MLIRDataTypes.py diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna index f39d1df3ed..fd62657740 100644 --- a/Container/Dockerfile.deeploy-xdna +++ b/Container/Dockerfile.deeploy-xdna @@ -53,6 +53,5 @@ RUN pip install \ llvm-aie ENV MLIR_AIE_PYTHON=/usr/bin/python3 -ENV IRON_OPERATORS_DIR=/usr/lib/python3/dist-packages/aie/iron/operators WORKDIR /app/Deeploy diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py new file mode 100644 index 0000000000..2091307858 --- /dev/null +++ b/Deeploy/MLIRDataTypes.py @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: 
Apache-2.0 +"""Base class for MLIR-emitting node templates. + +This module provides :class:`MLIRNodeTemplate`, a :class:`NodeTemplate` +subclass whose ``generate()`` method produces an MLIR string instead of C +code. Concrete subclasses override :meth:`emit` to populate an +``mlir.ir.Module`` using dialect-specific Python bindings (e.g. +``aie.dialects`` for the XDNA2 backend). + +The class is intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse the same base. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING + +from Deeploy.DeeployTypes import NodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation + + +class MLIRNodeTemplate(NodeTemplate): + """NodeTemplate subclass that emits MLIR instead of C code. + + Subclasses must override :meth:`emit` to add dialect operations to an + ``mlir.ir.Module`` (or region / insertion point provided via *kwargs*). + + ``generate()`` is overridden as a convenience that constructs a + standalone module, calls :meth:`emit`, and returns the MLIR text. + The base-class ``alignToContext`` / ``hoistTransientBuffers`` hooks are + retained and work unchanged. + """ + + def __init__(self): + # Empty Mako template — no C code is generated. + super().__init__("") + + # ------------------------------------------------------------------ + # Subclass API + # ------------------------------------------------------------------ + + @abstractmethod + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Populate an MLIR module with the operations for this node. + + The caller (typically the deployer) sets up an ``mlir.ir.Module`` + with the appropriate device wrapper and passes dialect-specific + context through *kwargs* (e.g. insertion point, tile references, + ObjectFifo handles). 
+ + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + The parser's node representation (buffer names, sizes, types …). + **kwargs + Dialect-specific context provided by the deployer. + """ + ... + + # ------------------------------------------------------------------ + # NodeTemplate overrides + # ------------------------------------------------------------------ + + def generate(self, operatorRepresentation={}, **kwargs) -> str: + """Generate an MLIR string for this node. + + This default implementation is a thin wrapper: it delegates to + :meth:`emit`. Deployers that need to build a single module from + multiple nodes should call :meth:`emit` directly with the shared + module context and then stringify the complete module themselves. + + Returns + ------- + str + MLIR text (printable module or fragment). + """ + self.emit(operatorRepresentation, **kwargs) + return "" diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 7aa77668eb..7df9a1976d 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -1,49 +1,38 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 +"""XDNA2 deployer — generates mlir-aie MLIR using ``aie.dialects``. + +Unlike other Deeploy deployers that generate C code via Mako templates, +this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations +and returns the verified MLIR text. 
+""" + +from __future__ import annotations -import os -import subprocess -import tempfile from typing import Callable, Dict, Optional, Type import onnx_graphsurgeon as gs +from aie.extras.context import mlir_mod_ctx +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +import aie.ir as ir + from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log -from Deeploy.Targets.XDNA2.Templates.AddTemplate import XDNA2NodeTemplate - -# JUNGVI: Will be removed once Deeploy generates it's own MLIR - -# Default path to the mlir-aie Python environment. -# Can be overridden via the MLIR_AIE_PYTHON env variable. -_DEFAULT_IRON_PYTHON = os.environ.get( - "MLIR_AIE_PYTHON", - "/scratch/jungvi/micromamba/envs/iron/bin/python", -) - -# Path to the IRON design scripts shipped with mlir-aie examples. -# Can be overridden via the IRON_OPERATORS_DIR env variable. -_DEFAULT_IRON_OPERATORS_DIR = os.environ.get( - "IRON_OPERATORS_DIR", - "/scratch/jungvi/IRON/iron/operators", -) +from Deeploy.MLIRDataTypes import MLIRNodeTemplate class XDNA2Deployer(SignPropDeployer): """Deployer for the XDNA2 (AIE2p) platform. - Unlike other Deeploy deployers that generate C code, this deployer - generates an mlir-aie MLIR module. The MLIR is produced by invoking the - IRON operator ``design.py`` scripts as subprocesses (using the mlir-aie - Python environment) so that the main Deeploy environment does not need to - have ``aie.iron`` installed. - - It also writes ``testinputs.h`` and ``testoutputs.h`` via the XDNA2 - generation script so the XRT C++ testbench can be compiled against - known-good golden values. + Generates an mlir-aie MLIR module by calling :meth:`emit` / + :meth:`emitRuntimeSequence` on each bound :class:`MLIRNodeTemplate`. 
+ The module is verified via MLIR's built-in verifier before being + returned as a string. """ def __init__(self, @@ -55,22 +44,7 @@ def __init__(self, name: str = 'DeeployNetwork', default_channels_first: bool = False, deeployStateDir: str = "DeeployStateDir", - inputOffsets: Optional[Dict[str, int]] = None, - iron_python: Optional[str] = None, - iron_operators_dir: Optional[str] = None): - """ - Parameters - ---------- - iron_python : str, optional - Path to the Python interpreter in the mlir-aie (IRON) environment. - Defaults to ``MLIR_AIE_PYTHON`` env variable or - ``/scratch/jungvi/micromamba/envs/iron/bin/python``. - iron_operators_dir : str, optional - Path to the IRON operators directory containing per-operator - ``design.py`` scripts. - Defaults to ``IRON_OPERATORS_DIR`` env variable or - ``/scratch/jungvi/IRON/iron/operators``. - """ + inputOffsets: Optional[Dict[str, int]] = None): super().__init__( graph, deploymentPlatform, @@ -82,8 +56,6 @@ def __init__(self, deeployStateDir = deeployStateDir, inputOffsets = inputOffsets if inputOffsets is not None else {}, ) - self._iron_python = iron_python or _DEFAULT_IRON_PYTHON - self._iron_operators_dir = iron_operators_dir or _DEFAULT_IRON_OPERATORS_DIR # ------------------------------------------------------------------ # MLIR generation @@ -92,113 +64,65 @@ def __init__(self, def generateMLIR(self) -> str: """Generate an mlir-aie MLIR module for the prepared graph. - Iterates over ``self.layerBinding``, extracts AIE parameters from each - bound template, and calls the corresponding IRON ``design.py`` script - as a subprocess. Currently only a single BF16 Add node is supported. + Iterates over bound layers, calls each template's ``emit()`` + to construct AIE operations, adds a ``runtime_sequence`` for + host-side DMA, verifies the module, and returns the MLIR text. Returns ------- str - MLIR module string (ready to be written to ``network.mlir``). 
- - Raises - ------ - RuntimeError - If the graph contains unsupported operators or if the IRON - subprocess fails. + Verified MLIR module string. """ assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" - mlir_parts = [] - + # Collect templates and their operator representations + nodes = [] for node_name, layer in self.layerBinding.items(): mapper = layer.mapper template = mapper.binder.template op_repr = mapper.parser.operatorRepresentation - if not isinstance(template, XDNA2NodeTemplate): + if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( - f"Node '{node_name}' has no XDNA2NodeTemplate — " + f"Node '{node_name}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") - aie_params = template.getAIEParams(op_repr) - log.info(f"[XDNA2] Generating MLIR for node '{node_name}' " - f"with params: {aie_params}") - - mlir_str = self._generate_add_mlir(aie_params) - mlir_parts.append(mlir_str) + nodes.append((node_name, template, op_repr)) - if not mlir_parts: - raise RuntimeError("No bound layers found in graph — cannot generate MLIR.") + if not nodes: + raise RuntimeError("No bound layers found — cannot generate MLIR.") - # For a single-node graph the MLIR is just the one module. - # Multi-node support would require merging modules. - return mlir_parts[0] + # Build the MLIR module + with mlir_mod_ctx() as ctx: - def _generate_add_mlir(self, aie_params: dict) -> str: - """Call the IRON elementwise_add design.py to produce MLIR. + @aie_d.device(aie_d.AIEDevice.npu2) + def _device(): + compute_tile = aie_d.tile(0, 2) # JUNGVI: This will have to change when we deploy on the whole array + shim_tile = aie_d.tile(0, 0) - Parameters - ---------- - aie_params : dict - Dict with keys: num_elements, n_cols, n_channels, tile_size, trace_size. 
+ # Emit each node's operations (ObjectFifos, core, kernel decls) + for node_name, template, op_repr in nodes: + log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'") + template.emit(op_repr, + compute_tile=compute_tile, + shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly? - Returns - ------- - str - MLIR module string. - """ - design_script = os.path.join( - self._iron_operators_dir, "elementwise_add", "design.py" - ) - - if not os.path.isfile(design_script): - raise RuntimeError( - f"IRON design script not found: {design_script}\n" - f"Set IRON_OPERATORS_DIR to point to the IRON operators directory.") - - if not os.path.isfile(self._iron_python): - raise RuntimeError( - f"IRON Python interpreter not found: {self._iron_python}\n" - f"Set MLIR_AIE_PYTHON to the mlir-aie Python interpreter.") - - with tempfile.NamedTemporaryFile(suffix=".mlir", delete=False) as tmp: - output_path = tmp.name - - try: - cmd = [ - self._iron_python, - design_script, - "--dev", "npu2", - "--length", str(aie_params['num_elements']), - "--columns", str(aie_params['n_cols']), - "--channels", str(aie_params['n_channels']), - "--tile-size", str(aie_params['tile_size']), - "--trace-size", str(aie_params['trace_size']), - "--output-file-path", output_path, - ] - - log.debug(f"[XDNA2] Running: {' '.join(cmd)}") - - result = subprocess.run( - cmd, - check=False, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - raise RuntimeError( - f"IRON design.py failed (exit {result.returncode}):\n" - f" cmd: {' '.join(cmd)}\n" - f" stdout: {result.stdout}\n" - f" stderr: {result.stderr}") + # Runtime sequence: collect tensor types from all nodes' I/O + # For now (single-node), derive from the first node. 
+ _, first_template, first_op_repr = nodes[0] + params = first_template.getAIEParams(first_op_repr) + num_elements = params['num_elements'] + tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) - with open(output_path, 'r') as f: - mlir_str = f.read() + @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def _seq(*args): + for _, template, op_repr in nodes: + template.emitRuntimeSequence(op_repr, list(args)) - finally: - if os.path.exists(output_path): - os.unlink(output_path) + module = ctx.module + assert module.operation.verify(), \ + "[XDNA2] Generated MLIR module failed verification" + mlir_str = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlir_str)} bytes)") return mlir_str diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 050413eedc..47dcb41d10 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,81 +1,174 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 +"""XDNA2 MLIR template for BF16 elementwise Add. -from Deeploy.DeeployTypes import NodeTemplate +Uses ``aie.dialects`` (from the pip-installed ``mlir-aie`` package) to emit +verified MLIR operations into an existing module context provided by the +:class:`XDNA2Deployer`. +""" +from __future__ import annotations -class XDNA2NodeTemplate(NodeTemplate): - """Base class for XDNA2 templates. +from typing import TYPE_CHECKING - Temporary Feature: - Unlike Mako-based templates for C code, XDNA2 templates do not produce - code snippets. Instead they store AIE kernel metadata that the - XDNA2Deployer reads when generating the holistic MLIR module. 
+from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +from aie.dialects import arith as arith_d +from aie.dialects import func as func_d +from aie.dialects import scf as scf_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRNodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import OperatorRepresentation + + +class XDNA2AddTemplate(MLIRNodeTemplate): + """MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + + The :meth:`emit` method constructs a single-core AIE program with: + + * Two input ObjectFifos and one output ObjectFifo (depth 2 for + double-buffering). + * A compute core that loops, acquiring / releasing FIFO elements and + calling the vectorised ``eltwise_add_bf16_vector`` kernel. + * A runtime sequence that configures shim DMA for L3 ↔ L1 transfers. + + Parameters are extracted from the *operatorRepresentation* populated + by the parser (``size`` = total number of BF16 elements). """ - def __init__(self, kernel_fn_name: str, kernel_obj: str, kernel_src: str, tile_size: int = 1024): - """Initialize an XDNA2NodeTemplate. + KERNEL_FN = "eltwise_add_bf16_vector" + KERNEL_OBJ = "add.o" + MAX_TILE_SIZE = 1024 - Parameters - ---------- - kernel_fn_name : str - Name of the AIE C++ kernel function (e.g. "eltwise_add_bf16_vector"). - kernel_obj : str - Compiled kernel object file name (e.g. "add.o"). - kernel_src : str - Kernel source file name relative to TargetLibraries/XDNA2/kernels/ - (e.g. "add.cc"). - tile_size : int - Number of elements per tile (default 1024, max 4096). - """ - # Empty Mako template — no C code is generated per node. - super().__init__("") - self.kernel_fn_name = kernel_fn_name - self.kernel_obj = kernel_obj - self.kernel_src = kernel_src - self.tile_size = tile_size + def __init__(self): + super().__init__() - def getAIEParams(self, operatorRepresentation: dict) -> dict: - """Return the aie.iron parameters for this node. 
+ # ------------------------------------------------------------------ + # Parameter helpers + # ------------------------------------------------------------------ - Parameters - ---------- - operatorRepresentation : dict - The operator representation dict produced by the parser. + def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict: + """Extract AIE parameters from the operator representation. Returns ------- dict - Parameters to pass to the corresponding aie.iron design function. + ``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and + ensuring divisibility). """ - raise NotImplementedError - - -class XDNA2AddTemplate(XDNA2NodeTemplate): - """XDNA2 template for BF16 elementwise Add.""" - - def __init__(self): - super().__init__( - kernel_fn_name = "eltwise_add_bf16_vector", - kernel_obj = "add.o", - kernel_src = "add.cc", - tile_size = 1024, - ) - - def getAIEParams(self, operatorRepresentation: dict) -> dict: num_elements = int(operatorRepresentation['size']) - tile_size = min(num_elements, self.tile_size) - # Ensure num_elements is divisible by tile_size + tile_size = min(num_elements, self.MAX_TILE_SIZE) if num_elements % tile_size != 0: tile_size = 1 return { 'num_elements': num_elements, - 'n_cols': 1, - 'n_channels': 1, 'tile_size': tile_size, - 'trace_size': 0, } + # ------------------------------------------------------------------ + # MLIR emission + # ------------------------------------------------------------------ + + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Add AIE operations for a BF16 Add node into the current device context. + + Must be called inside an ``@aie_d.device(...)`` region (the deployer + sets this up). 
The following keyword arguments are expected: + + * ``compute_tile`` — result of ``aie_d.tile(col, row)`` + * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` + """ + params = self.getAIEParams(operatorRepresentation) + num_elements = params['num_elements'] + tile_size = params['tile_size'] + num_tiles = num_elements // tile_size + + compute_tile = kwargs['compute_tile'] + shim_tile = kwargs['shim_tile'] + + # MemRef types + tile_ty = ir.MemRefType.get((tile_size,), ir.BF16Type.get()) + i32 = ir.IntegerType.get_signless(32) + + # ObjectFifos (depth 2 for double-buffering) + aie_d.object_fifo("in1_0", shim_tile, [compute_tile], 2, tile_ty) + aie_d.object_fifo("in2_0", shim_tile, [compute_tile], 2, tile_ty) + aie_d.object_fifo("out_0", compute_tile, [shim_tile], 2, tile_ty) + + # External kernel declaration + aie_d.external_func(self.KERNEL_FN, [tile_ty, tile_ty, tile_ty, i32]) + + # Compute core + @aie_d.core(compute_tile, link_with=self.KERNEL_OBJ) + def _core(): + subview_ty = aie_d.ObjectFifoSubviewType.get(tile_ty) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, num_tiles, 1): + acq_in1 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in1_0", 1) + elem_in1 = aie_d.objectfifo_subview_access(tile_ty, acq_in1, 0) + acq_in2 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in2_0", 1) + elem_in2 = aie_d.objectfifo_subview_access(tile_ty, acq_in2, 0) + acq_out = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Produce, "out_0", 1) + elem_out = aie_d.objectfifo_subview_access(tile_ty, acq_out, 0) + size_val = arith_d.constant(i32, tile_size) + func_d.call([], self.KERNEL_FN, [elem_in1, elem_in2, elem_out, size_val]) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in1_0", 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in2_0", 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, "out_0", 1) + scf_d.yield_([]) + scf_d.yield_([]) + + def emitRuntimeSequence(self, 
operatorRepresentation: OperatorRepresentation, + seq_args: list) -> None: + """Emit DMA configuration inside a runtime_sequence block. + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Node representation (used to extract ``num_elements``). + seq_args : list + Block arguments of the runtime_sequence (memref values for + in1, in2, out — in the order matching the ONNX graph I/O). + """ + params = self.getAIEParams(operatorRepresentation) + num_elements = params['num_elements'] + + dims = [ + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=num_elements, stride=1), + ] + + in1, in2, out = seq_args[0], seq_args[1], seq_args[2] + + task_in1 = aiex_d.dma_configure_task_for("in1_0") + block_in1 = task_in1.body.blocks.append() + with ir.InsertionPoint(block_in1): + aie_d.dma_bd(in1, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_in1) + + task_in2 = aiex_d.dma_configure_task_for("in2_0") + block_in2 = task_in2.body.blocks.append() + with ir.InsertionPoint(block_in2): + aie_d.dma_bd(in2, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_in2) + + task_out = aiex_d.dma_configure_task_for("out_0", issue_token=True) + block_out = task_out.body.blocks.append() + with ir.InsertionPoint(block_out): + aie_d.dma_bd(out, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_out) + aiex_d.dma_await_task(task_out) + aiex_d.dma_free_task(task_in1) + aiex_d.dma_free_task(task_in2) + referenceTemplate = XDNA2AddTemplate() From d8548468a1d4cfd7f1398d1d5a975d5be5e01da5 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 13:26:15 +0100 Subject: [PATCH 05/16] Generate tiled code but too much logic is in the Template --- Deeploy/Targets/XDNA2/Deployer.py | 27 ++++-- 
Deeploy/Targets/XDNA2/Platform.py | 89 ++++++++++++++++++- .../Targets/XDNA2/Templates/AddTemplate.py | 66 ++++++++++++-- Deeploy/Targets/XDNA2/Tiler.py | 16 ++++ DeeployTest/deeployRunner_xdna2.py | 2 +- DeeployTest/generateNetwork_xdna2.py | 58 +++++++++--- DeeployTest/testUtils/core/execution.py | 6 +- DeeployTest/testUtils/deeployRunner.py | 8 ++ DeeployTest/testUtils/platformMapping.py | 5 +- DeeployTest/testUtils/testRunner.py | 25 +++++- 10 files changed, 266 insertions(+), 36 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/Tiler.py diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 7df9a1976d..34fd1b52b5 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -24,6 +24,7 @@ from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MLIRDataTypes import MLIRNodeTemplate +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint class XDNA2Deployer(SignPropDeployer): @@ -67,6 +68,10 @@ def generateMLIR(self) -> str: Iterates over bound layers, calls each template's ``emit()`` to construct AIE operations, adds a ``runtime_sequence`` for host-side DMA, verifies the module, and returns the MLIR text. + + If tiling is enabled (patternMemoryConstraint available), passes + tiling information to templates to generate tiled transfers and + compute kernels. 
Returns ------- @@ -81,13 +86,17 @@ def generateMLIR(self) -> str: mapper = layer.mapper template = mapper.binder.template op_repr = mapper.parser.operatorRepresentation + + # Check if tiling is enabled by looking for patternMemoryConstraint + executionBlock = mapper.binder.executionBlock + tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( f"Node '{node_name}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") - nodes.append((node_name, template, op_repr)) + nodes.append((node_name, template, op_repr, tilingConstraint)) if not nodes: raise RuntimeError("No bound layers found — cannot generate MLIR.") @@ -101,23 +110,25 @@ def _device(): shim_tile = aie_d.tile(0, 0) # Emit each node's operations (ObjectFifos, core, kernel decls) - for node_name, template, op_repr in nodes: - log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'") + for node_name, template, op_repr, tilingConstraint in nodes: + log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" + + (" with tiling" if tilingConstraint else "")) template.emit(op_repr, compute_tile=compute_tile, - shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly? + shim_tile=shim_tile, + tilingConstraint=tilingConstraint) # Pass tiling info # Runtime sequence: collect tensor types from all nodes' I/O # For now (single-node), derive from the first node. 
- _, first_template, first_op_repr = nodes[0] - params = first_template.getAIEParams(first_op_repr) + _, first_template, first_op_repr, first_tilingConstraint = nodes[0] + params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint) num_elements = params['num_elements'] tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def _seq(*args): - for _, template, op_repr in nodes: - template.emitRuntimeSequence(op_repr, list(args)) + for _, template, op_repr, tilingConstraint in nodes: + template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint) module = ctx.module assert module.operation.verify(), \ diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py index 82ef1ec3d2..4a186aca7c 100644 --- a/Deeploy/Targets/XDNA2/Platform.py +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -2,19 +2,34 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ - StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ + NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Layers import AddLayer from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.Parsers import AddParser from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings +# Standard mapper for non-tiled deployment XDNA2AddMapper = 
NodeMapper(AddParser(), XDNA2AddBindings) +# Tiling-ready mapper for tiled deployment +XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) + +# Standard mapping (used when tiling is disabled) XDNA2Mapping = { 'Add': AddLayer([XDNA2AddMapper]), } +# Tiling-ready mapping (used when tiling is enabled) +XDNA2TilingMapping = { + 'Add': AddLayer([XDNA2AddTilableMapper]), +} + # Buffer classes reuse Generic templates since XDNA2Deployer manages its own # output format (MLIR + test headers) and these templates are never rendered. @@ -56,6 +71,21 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = super().__init__(name, Mapping, initCode, includeList) +class XDNA2AIECoreEngine(DeploymentEngine): + """AIE core execution engine with L1 local memory as preferred memory level. + + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. + Data is transferred from L3 (shared memory) to L1 as needed. + """ + + def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "", + includeList = None, preferredMemoryLevel: str = "L1") -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + self.preferredMemoryLevel = preferredMemoryLevel + + class XDNA2Platform(DeploymentPlatform): def __init__(self, @@ -67,3 +97,58 @@ def __init__(self, if engines is None: engines = [XDNA2Engine()] super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryXDNA2Platform(MemoryPlatform): + """XDNA2 platform with memory hierarchy support for tiling. 
+ + Defines the memory hierarchy: + - L1: 8KB per AIE core (local memory) + - L3: Shared memory for entire AIE array + """ + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer) -> None: + if engines is None: + engines = [XDNA2AIECoreEngine()] + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node. + + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). + Otherwise use the default target memory level (typically L3). + """ + # Check if node has an engine assignment + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name + + +class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): + """Wrapper for XDNA2Platform with memory-level support.""" + + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel): + assert isinstance(platform, XDNA2Platform), \ + f"Given platform is not an instance of XDNA2Platform. 
Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node.""" + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 47dcb41d10..3a62d6f757 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -12,6 +12,8 @@ from typing import TYPE_CHECKING +import numpy as np + from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d from aie.dialects import arith as arith_d @@ -51,19 +53,56 @@ def __init__(self): # Parameter helpers # ------------------------------------------------------------------ - def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict: + def getAIEParams(self, operatorRepresentation: OperatorRepresentation, + tilingConstraint=None) -> dict: """Extract AIE parameters from the operator representation. + + If tilingConstraint is available (tiling enabled), use information + from it. Otherwise fall back to fixed tile sizes. + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Parsed operator representation containing 'size' (total elements). + tilingConstraint : PatternMemoryConstraints, optional + Tiling solution from the solver. If provided, tile size is derived + from the tiling solution. Returns ------- dict - ``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and - ensuring divisibility). + ``num_elements``, ``tile_size`` (from tiling solution if available, + otherwise clamped to MAX_TILE_SIZE). 
""" num_elements = int(operatorRepresentation['size']) - tile_size = min(num_elements, self.MAX_TILE_SIZE) + + # If tiling is enabled, extract tile size from the tiling solution + if tilingConstraint is not None: + # tilingConstraint is a PatternMemoryConstraints with nodeConstraints + nodeConstraint = tilingConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + # Get the first output tensor's L1 memory constraint (tile shape) + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + # Use L1 constraint which holds the tile shape for the AIE core + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tile_size = int(np.prod(l1Constraint.shape)) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + if num_elements % tile_size != 0: - tile_size = 1 + # Round down to the largest divisor of num_elements that fits + tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0) + return { 'num_elements': num_elements, 'tile_size': tile_size, @@ -81,8 +120,17 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None * ``compute_tile`` — result of ``aie_d.tile(col, row)`` * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` + * ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Parsed operator representation with 'size' and other attributes + **kwargs + compute_tile, shim_tile, tilingConstraint (optional) """ - params = self.getAIEParams(operatorRepresentation) + tilingConstraint = kwargs.get('tilingConstraint', None) + params = 
self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) num_elements = params['num_elements'] tile_size = params['tile_size'] num_tiles = num_elements // tile_size @@ -123,7 +171,7 @@ def _core(): scf_d.yield_([]) def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, - seq_args: list) -> None: + seq_args: list, tilingConstraint=None) -> None: """Emit DMA configuration inside a runtime_sequence block. Parameters @@ -133,8 +181,10 @@ def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, seq_args : list Block arguments of the runtime_sequence (memref values for in1, in2, out — in the order matching the ONNX graph I/O). + tilingConstraint : NodeMemoryConstraint, optional + Tiling solution from the solver (currently ignored, for future use). """ - params = self.getAIEParams(operatorRepresentation) + params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) num_elements = params['num_elements'] dims = [ diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py new file mode 100644 index 0000000000..9754aa0688 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" + +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +# For Add operator, reuse the generic BOP (Binary Operator) tile constraint +# which handles equal-dimension binary operations +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings=XDNA2AddBindings, + tileConstraint=AddTileConstraint() +) diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py index 
9d4f27a477..82be139d46 100644 --- a/DeeployTest/deeployRunner_xdna2.py +++ b/DeeployTest/deeployRunner_xdna2.py @@ -14,4 +14,4 @@ from testUtils.deeployRunner import main if __name__ == '__main__': - sys.exit(main(default_platform="XDNA2", default_simulator="host")) + sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 69af71f429..995eaabbb7 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -22,14 +22,21 @@ import onnx import onnx_graphsurgeon as gs -from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.platformMapping import mapDeployer from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferTypeAndOffset from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import bfloat16_t -from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, XDNA2AIECoreEngine, XDNA2TilingMapping +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def _tilingScheduler(graph: gs.Graph): + """Scheduler that returns List[List[gs.Node]] as required by the tiling framework.""" + return [[node] for node in graph.nodes] def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: @@ -122,9 +129,6 @@ def generateNetworkXDNA2(args): test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] - # XDNA2 is a non-signprop platform: signProp = False - platform, signProp = mapPlatform(args.platform) - inputTypes = {} inputOffsets = {} @@ -139,14 +143,45 
@@ def generateNetworkXDNA2(args): _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") - deployer = mapDeployer(platform, + # Define memory hierarchy: L1 (AIE core local) and L3 (shared) + l1_size = int(getattr(args, 'l1', None) or 8192) # 8KB default + l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default + + log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") + + l1_level = MemoryLevel("L1", neighbourNames=["L3"], size=l1_size) + l3_level = MemoryLevel("L3", neighbourNames=["L1"], size=l3_size) + memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) + memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 + + # Create memory-aware platform with AIE core engine + # defaultTargetMemoryLevel=L1 tells the tiling framework that computation + # targets L1, so it must tile data from L3 into L1-sized chunks. + mem_platform = MemoryXDNA2Platform( + memoryHierarchy=memory_hierarchy, + defaultTargetMemoryLevel=l1_level, + engines=[XDNA2AIECoreEngine(Mapping=XDNA2TilingMapping, preferredMemoryLevel="L1")] + ) + + # Create base deployer with memory platform + deployer = mapDeployer(mem_platform, graph, inputTypes, + scheduler=_tilingScheduler, deeployStateDir=_DEEPLOYSTATEDIR, inputOffsets=inputOffsets) - # Prepare the deployer (type checking + binding) - deployer.prepare(_NoVerbosity) + # Wrap with MemoryDeployerWrapper (adds memory level annotation) + deployer = MemoryDeployerWrapper(deployer) + + # Wrap with TilerDeployerWrapper (adds tiling) + deployer = TilerDeployerWrapper(deployer, workDir=_DEEPLOYSTATEDIR) + + # frontEnd() parses the graph; bind() triggers tiling via wrappers + deployer.frontEnd() + deployer.bind() + deployer.prepared = True + log.info("[XDNA2] Tiling completed, proceeding with MLIR generation") # Create output directory os.makedirs(args.dumpdir, exist_ok=True) @@ -180,8 +215,9 @@ def generateNetworkXDNA2(args): if __name__ == '__main__': - parser = 
TestGeneratorArgumentParser(description="Deeploy XDNA2 Code Generation Utility.") - args = parser.parse_args() + parser = TestGeneratorArgumentParser(tiling_arguments=True, + description="Deeploy XDNA2 Code Generation Utility.") + args, _ = parser.parse_known_args() if args.platform != 'XDNA2': parser.error(f"This script is for the XDNA2 platform. Got: {args.platform}") diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 572df44be1..a259c93ad7 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -27,10 +27,10 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent - if config.tiling: - generation_script = script_dir / "testMVP.py" - elif config.platform == "XDNA2": + if config.platform == "XDNA2": generation_script = script_dir / "generateNetwork_xdna2.py" + elif config.tiling: + generation_script = script_dir / "testMVP.py" else: generation_script = script_dir / "generateNetwork.py" diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 78d5ff9cd6..d8b76668dc 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -139,6 +139,12 @@ def __init__(self, type = int, default = 1024000, help = 'L2 size in bytes\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'L3 size in bytes\n') self.add_argument('--randomizedMemoryScheduler', action = "store_true", help = 'Enable randomized memory scheduler\n') @@ -221,6 +227,8 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append(f"--l1={args.l1}") if hasattr(args, 'l2') and args.l2 and args.l2 != 1024000: gen_args_list.append(f"--l2={args.l2}") + if hasattr(args, 'l3') and args.l3: + gen_args_list.append(f"--l3={args.l3}") if hasattr(args, 'randomizedMemoryScheduler') and 
args.randomizedMemoryScheduler: gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 28425393cb..eaa9b2503f 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -30,7 +30,8 @@ from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer -from Deeploy.Targets.XDNA2.Platform import XDNA2Optimizer, XDNA2Platform +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, XDNA2Optimizer, \ + XDNA2Platform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] _NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] @@ -278,7 +279,7 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, XDNA2Platform): + elif isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): if loweringOptimizer is None: loweringOptimizer = XDNA2Optimizer diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..e233cc9b1d 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -61,7 +61,7 @@ def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int class TestGeneratorArgumentParser(argparse.ArgumentParser): - def __init__(self, description = None): + def __init__(self, tiling_arguments: bool = False, description = None): formatter = _ArgumentDefaultMetavarTypeFormatter @@ -70,6 +70,8 @@ def __init__(self, description = None): else: super().__init__(description = description, formatter_class 
= formatter) + self.tiling_arguments = tiling_arguments + self.add_argument('-t', metavar = '', dest = 'dir', @@ -90,6 +92,27 @@ def __init__(self, description = None): help = 'Set the output dump folder\n') self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n') + # Tiling-related arguments (for XDNA2 and other tiled platforms) + if self.tiling_arguments: + self.add_argument('--l1', + metavar = '', + dest = 'l1', + type = int, + default = None, + help = 'Set L1 memory size in bytes (enables tiling if specified).\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'Set L3 memory size in bytes.\n') + self.add_argument('--defaultMemLevel', + metavar = '', + dest = 'defaultMemLevel', + type = str, + default = "L3", + help = 'Set default memory level (default: L3)\n') + self.args = None def parse_args(self, args = None, namespace = None) -> argparse.Namespace: From 9f7db2667f4644465c0c3a0bf2d911cbabcf625e Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 14:37:48 +0100 Subject: [PATCH 06/16] Move data movement in passes. 
Template represent for loop and acquire/release --- Deeploy/MLIRDataTypes.py | 140 ++++++++++ Deeploy/Targets/XDNA2/Bindings.py | 28 ++- .../MLIRObjectFifoPass.py | 143 +++++++++++ .../MLIRRuntimeSequencePass.py | 93 +++++++ .../CodeTransformationPasses/__init__.py | 6 + Deeploy/Targets/XDNA2/Deployer.py | 133 ++++++---- .../Targets/XDNA2/Templates/AddTemplate.py | 233 +++++------------- 7 files changed, 537 insertions(+), 239 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 2091307858..642fb6fef1 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -1,22 +1,30 @@ # SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""Base class for MLIR-emitting node templates. - -This module provides :class:`MLIRNodeTemplate`, a :class:`NodeTemplate` -subclass whose ``generate()`` method produces an MLIR string instead of C -code. Concrete subclasses override :meth:`emit` to populate an -``mlir.ir.Module`` using dialect-specific Python bindings (e.g. -``aie.dialects`` for the XDNA2 backend). - -The class is intentionally dialect-agnostic so that future MLIR-based -backends (NVGPU, Linalg, …) can reuse the same base. +"""Base classes for MLIR-emitting node templates and code transformations. + +This module provides: + +* :class:`MLIRNodeTemplate` — a :class:`NodeTemplate` subclass whose + ``emit()`` method populates an ``mlir.ir.Module`` instead of rendering C. +* :class:`MLIRExecutionBlock` — MLIR-specific execution state replacing the + C-oriented :class:`ExecutionBlock` (code-snippet deque) with MLIR builder + state (tile references, ObjectFifo handles, tiling parameters). 
+* :class:`MLIRCodeTransformationPass` — base class for MLIR code + transformation passes that operate on an :class:`MLIRExecutionBlock`. +* :class:`MLIRCodeTransformation` — two-phase pass container + (``devicePasses`` + ``runtimeSequencePasses``) that the deployer + orchestrates inside ``@aie_d.device`` and ``@aiex_d.runtime_sequence`` + regions respectively. + +All classes are intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse them. """ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING +from typing import Any, Dict, List, Optional, TYPE_CHECKING, Tuple from Deeploy.DeeployTypes import NodeTemplate @@ -24,6 +32,116 @@ from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +# ====================================================================== +# MLIRExecutionBlock +# ====================================================================== + +class MLIRExecutionBlock: + """MLIR-specific execution state for a single operator. + + Replaces the C-oriented :class:`ExecutionBlock` (which holds a deque of + :class:`CodeSnippet` objects) with fields that carry MLIR builder state + through the code-transformation pipeline. + + Passes populate fields progressively: + + 1. The deployer sets ``computeTile``, ``shimTile``, + ``operatorRepresentation``, and ``patternMemoryConstraint``. + 2. A device-phase pass (e.g. ``MLIRObjectFifoPass``) fills + ``fifoMap``, ``fifoTypes``, ``tileSize``, ``numTiles``, + ``kernelFuncName``, and ``kernelObjFile``. + 3. The deployer sets ``runtimeSequenceArgs`` before the runtime- + sequence phase. + 4. A runtime-sequence pass (e.g. ``MLIRRuntimeSequencePass``) reads + all of the above to emit DMA configuration. 
+ """ + + def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: + # MLIR tile references (set by deployer) + self.computeTile: Any = computeTile + self.shimTile: Any = shimTile + + # Operator metadata (set by deployer from parser) + self.operatorRepresentation: OperatorRepresentation = {} + + # Tiling constraint from midend solver (may be None) + self.patternMemoryConstraint: Any = None + + # Populated by device-phase passes (e.g. MLIRObjectFifoPass) + self.fifoMap: Dict[str, str] = {} # tensor name → FIFO name + self.fifoTypes: Dict[str, Any] = {} # tensor name → MemRefType + self.tileSize: int = 0 + self.numTiles: int = 0 + self.numElements: int = 0 + self.kernelFuncName: Optional[str] = None + self.kernelObjFile: Optional[str] = None + + # Set by deployer before runtime-sequence phase + self.runtimeSequenceArgs: List[Any] = [] + + # Input / output tensor name lists (set by deployer from parser) + self.inputNames: List[str] = [] + self.outputNames: List[str] = [] + + +# ====================================================================== +# MLIRCodeTransformationPass / MLIRCodeTransformation +# ====================================================================== + +class MLIRCodeTransformationPass: + """Base class for passes that transform an :class:`MLIRExecutionBlock`. + + Subclasses override :meth:`apply` to read / mutate the block's fields + and optionally emit MLIR operations into the current insertion point. + """ + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + return ctxt, mlirBlock + + +class MLIRCodeTransformation: + """Two-phase pass container for MLIR code transformations. + + *devicePasses* run inside an ``@aie_d.device(...)`` region (ObjectFifo + creation, external-kernel declarations, …). + + *runtimeSequencePasses* run inside an ``@aiex_d.runtime_sequence`` + block (DMA configuration, token await, …). 
+ + The deployer calls :meth:`applyDevicePasses` and + :meth:`applyRuntimeSequencePasses` at the appropriate points. + """ + + def __init__(self, + devicePasses: Optional[List[MLIRCodeTransformationPass]] = None, + runtimeSequencePasses: Optional[List[MLIRCodeTransformationPass]] = None) -> None: + self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] + self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] + + def applyDevicePasses(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.devicePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + def applyRuntimeSequencePasses(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.runtimeSequencePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + +# ====================================================================== +# MLIRNodeTemplate +# ====================================================================== + class MLIRNodeTemplate(NodeTemplate): """NodeTemplate subclass that emits MLIR instead of C code. 
diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index 68d7672787..e30bbc6646 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -4,14 +4,32 @@ from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import bfloat16_t -from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass from Deeploy.Targets.XDNA2.Templates import AddTemplate from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker -# XDNA2 does not use the standard C code transformation pipeline. -# The deployer generates a holistic MLIR module, not per-node C snippets. -# An empty CodeTransformation is used as a placeholder. 
-XDNA2Transformer = CodeTransformation([]) +_ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] +_ADD_OUTPUT_KEYS = ['data_out'] + +XDNA2Transformer = MLIRCodeTransformation( + devicePasses = [ + MLIRObjectFifoPass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + kernelFuncName = "eltwise_add_bf16_vector", + kernelObjFile = "add.o", + ), + ], + runtimeSequencePasses = [ + MLIRRuntimeSequencePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], +) XDNA2AddBindings = [ NodeBinding( diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py new file mode 100644 index 0000000000..be6b492906 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that creates ObjectFifos and declares external kernels. + +Given an :class:`MLIRExecutionBlock` with ``computeTile``, ``shimTile``, +``operatorRepresentation``, and (optionally) ``patternMemoryConstraint``, +this pass: + +1. Derives ``tileSize`` and ``numTiles`` (from tiling solver or fallback). +2. Creates one ``aie_d.object_fifo`` per input tensor (shim → compute) + and one per output tensor (compute → shim), all with depth 2 + (double-buffering). +3. Declares the external kernel via ``aie_d.external_func``. +4. Stores FIFO names, types, and kernel metadata on the block for + downstream passes and the compute template. + +The pass is operator-agnostic — it only needs the tensor names and a +tile-size derivation function. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import numpy as np + +from aie.dialects import aie as aie_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +MAX_TILE_SIZE = 1024 + + +def _deriveTileSize(numElements: int, patternMemoryConstraint) -> int: + """Extract tile size from the tiling solution, or fall back to MAX_TILE_SIZE.""" + tileSize = min(numElements, MAX_TILE_SIZE) + + if patternMemoryConstraint is not None: + try: + nodeConstraint = patternMemoryConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tileSize = int(np.prod(l1Constraint.shape)) + except (AttributeError, IndexError, KeyError): + pass + + # Ensure tile_size evenly divides num_elements + if numElements % tileSize != 0: + tileSize = max(d for d in range(1, tileSize + 1) if numElements % d == 0) + + return tileSize + + +class MLIRObjectFifoPass(MLIRCodeTransformationPass): + """Create ObjectFifos and declare the external kernel. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors + (e.g. ``['data_in_1', 'data_in_2']``). + outputTensorKeys : list of str + Keys that name output tensors (e.g. ``['data_out']``). + kernelFuncName : str + Symbol name of the external AIE kernel function. + kernelObjFile : str + Object file to link with the AIE core (e.g. ``"add.o"``). + kernelArgTypes : callable, optional + A callable ``(tile_memref_type) -> list[ir.Type]`` that returns + the kernel's argument types. 
Defaults to + ``[tile_ty, tile_ty, tile_ty, i32]`` (suitable for binary + elementwise ops). + fifoDepth : int + ObjectFifo depth (default 2 for double-buffering). + """ + + def __init__(self, + inputTensorKeys: list, + outputTensorKeys: list, + kernelFuncName: str, + kernelObjFile: str, + kernelArgTypes=None, + fifoDepth: int = 2) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + self.kernelFuncName = kernelFuncName + self.kernelObjFile = kernelObjFile + self._kernelArgTypes = kernelArgTypes + self.fifoDepth = fifoDepth + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + opRepr = mlirBlock.operatorRepresentation + numElements = int(opRepr['size']) + tileSize = _deriveTileSize(numElements, mlirBlock.patternMemoryConstraint) + numTiles = numElements // tileSize + + mlirBlock.tileSize = tileSize + mlirBlock.numTiles = numTiles + mlirBlock.numElements = numElements + mlirBlock.kernelFuncName = self.kernelFuncName + mlirBlock.kernelObjFile = self.kernelObjFile + + tileTy = ir.MemRefType.get((tileSize,), ir.BF16Type.get()) + computeTile = mlirBlock.computeTile + shimTile = mlirBlock.shimTile + + # Create input ObjectFifos (shim → compute) + for idx, key in enumerate(self.inputTensorKeys): + fifoName = f"in{idx + 1}_0" + aie_d.object_fifo(fifoName, shimTile, [computeTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Create output ObjectFifos (compute → shim) + for idx, key in enumerate(self.outputTensorKeys): + fifoName = f"out_{idx}" + aie_d.object_fifo(fifoName, computeTile, [shimTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Declare external kernel + i32 = ir.IntegerType.get_signless(32) + if self._kernelArgTypes is not None: + argTypes = self._kernelArgTypes(tileTy) + else: + # Default: binary elementwise (in1, in2, out, 
size) + argTypes = [tileTy, tileTy, tileTy, i32] + aie_d.external_func(self.kernelFuncName, argTypes) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py new file mode 100644 index 0000000000..18a4607328 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that configures shim DMA for L3 ↔ L1 transfers. + +Given an :class:`MLIRExecutionBlock` whose device-phase passes have already +populated ``fifoMap``, ``numElements``, and ``runtimeSequenceArgs``, this +pass emits ``aiex_d.dma_configure_task_for`` / ``dma_start_task`` / +``dma_await_task`` / ``dma_free_task`` operations directly into the current +``@aiex_d.runtime_sequence`` insertion point. + +The pass is operator-agnostic — it iterates over the FIFO map and +runtime-sequence arguments to configure DMA for every input and output +tensor. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRRuntimeSequencePass(MLIRCodeTransformationPass): + """Emit DMA configuration inside a ``runtime_sequence`` block. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + numElements = mlirBlock.numElements + seqArgs = mlirBlock.runtimeSequenceArgs + + dims = [ + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=numElements, stride=1), + ] + + # Build ordered list of (fifoName, seqArg, isOutput) + transfers = [] + allKeys = self.inputTensorKeys + self.outputTensorKeys + for idx, key in enumerate(allKeys): + fifoName = mlirBlock.fifoMap[key] + isOutput = key in self.outputTensorKeys + transfers.append((fifoName, seqArgs[idx], isOutput)) + + inputTasks = [] + outputTasks = [] + + for fifoName, seqArg, isOutput in transfers: + if isOutput: + task = aiex_d.dma_configure_task_for(fifoName, issue_token=True) + else: + task = aiex_d.dma_configure_task_for(fifoName) + block = task.body.blocks.append() + with ir.InsertionPoint(block): + aie_d.dma_bd(seqArg, offset=0, len=numElements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task) + + if isOutput: + outputTasks.append(task) + else: + inputTasks.append(task) + + # Await output tasks, then free input tasks + for task in outputTasks: + aiex_d.dma_await_task(task) + for task in inputTasks: + aiex_d.dma_free_task(task) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000000..aae227155a --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from 
Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 34fd1b52b5..16cda89891 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -6,6 +6,15 @@ Unlike other Deeploy deployers that generate C code via Mako templates, this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations and returns the verified MLIR text. + +MLIR generation is split into two phases orchestrated by +:class:`MLIRCodeTransformation`: + +1. **Device phase** — inside ``@aie_d.device(npu2)``: for each operator, + run ``devicePasses`` (ObjectFifo creation, external-kernel + declaration) then call ``template.emit()`` (compute core only). +2. **Runtime-sequence phase** — inside ``@aiex_d.runtime_sequence``: + for each operator, run ``runtimeSequencePasses`` (DMA configuration). """ from __future__ import annotations @@ -23,15 +32,20 @@ from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log -from Deeploy.MLIRDataTypes import MLIRNodeTemplate -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.MLIRDataTypes import MLIRCodeTransformation, MLIRExecutionBlock, MLIRNodeTemplate class XDNA2Deployer(SignPropDeployer): """Deployer for the XDNA2 (AIE2p) platform. - Generates an mlir-aie MLIR module by calling :meth:`emit` / - :meth:`emitRuntimeSequence` on each bound :class:`MLIRNodeTemplate`. + Generates an mlir-aie MLIR module via two-phase code transformation: + + * **Device phase**: ``MLIRObjectFifoPass`` creates ObjectFifos and + declares external kernels; the bound ``MLIRNodeTemplate`` emits + the compute core. 
+ * **Runtime-sequence phase**: ``MLIRRuntimeSequencePass`` configures + shim DMA for L3 ↔ L1 transfers. + The module is verified via MLIR's built-in verifier before being returned as a string. """ @@ -65,13 +79,16 @@ def __init__(self, def generateMLIR(self) -> str: """Generate an mlir-aie MLIR module for the prepared graph. - Iterates over bound layers, calls each template's ``emit()`` - to construct AIE operations, adds a ``runtime_sequence`` for - host-side DMA, verifies the module, and returns the MLIR text. - - If tiling is enabled (patternMemoryConstraint available), passes - tiling information to templates to generate tiled transfers and - compute kernels. + Iterates over bound layers in two phases: + + 1. **Device phase** — for each node, creates an + :class:`MLIRExecutionBlock`, runs device-phase code- + transformation passes (ObjectFifo creation, kernel + declaration), then calls ``template.emit()`` (compute core). + 2. **Runtime-sequence phase** — opens an + ``@aiex_d.runtime_sequence`` block, sets + ``runtimeSequenceArgs`` on each block, then runs + runtime-sequence passes (DMA configuration). 
Returns ------- @@ -80,60 +97,86 @@ def generateMLIR(self) -> str: """ assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" - # Collect templates and their operator representations + # Collect per-node info from the bound layers nodes = [] - for node_name, layer in self.layerBinding.items(): + for nodeName, layer in self.layerBinding.items(): mapper = layer.mapper - template = mapper.binder.template - op_repr = mapper.parser.operatorRepresentation - - # Check if tiling is enabled by looking for patternMemoryConstraint - executionBlock = mapper.binder.executionBlock + binder = mapper.binder + template = binder.template + opRepr = mapper.parser.operatorRepresentation + codeTransformer = binder.codeTransformer + + # Tiling constraint from the midend solver (may be None) + executionBlock = binder.executionBlock tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( - f"Node '{node_name}' has no MLIRNodeTemplate — " + f"Node '{nodeName}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") + if not isinstance(codeTransformer, MLIRCodeTransformation): + raise RuntimeError( + f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") - nodes.append((node_name, template, op_repr, tilingConstraint)) + nodes.append({ + 'nodeName': nodeName, + 'template': template, + 'opRepr': opRepr, + 'codeTransformer': codeTransformer, + 'tilingConstraint': tilingConstraint, + }) if not nodes: raise RuntimeError("No bound layers found — cannot generate MLIR.") # Build the MLIR module + mlirBlocks = [] + with mlir_mod_ctx() as ctx: @aie_d.device(aie_d.AIEDevice.npu2) def _device(): - compute_tile = aie_d.tile(0, 2) # JUNGVI: This will have to change when we deploy on the whole array - shim_tile = aie_d.tile(0, 0) - - # Emit each node's operations (ObjectFifos, core, kernel decls) - 
for node_name, template, op_repr, tilingConstraint in nodes: - log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" + - (" with tiling" if tilingConstraint else "")) - template.emit(op_repr, - compute_tile=compute_tile, - shim_tile=shim_tile, - tilingConstraint=tilingConstraint) # Pass tiling info - - # Runtime sequence: collect tensor types from all nodes' I/O - # For now (single-node), derive from the first node. - _, first_template, first_op_repr, first_tilingConstraint = nodes[0] - params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint) - num_elements = params['num_elements'] - tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) - - @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + computeTile = aie_d.tile(0, 2) # TODO: generalize to full array + shimTile = aie_d.tile(0, 0) + + # === Device phase === + for node in nodes: + # Create MLIRExecutionBlock with deployer-level state + eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) + eb.operatorRepresentation = node['opRepr'] + eb.patternMemoryConstraint = node['tilingConstraint'] + + log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + + (" (tiled)" if node['tilingConstraint'] else "")) + + # Run device-phase passes (ObjectFifo creation, kernel decl) + self.ctxt, eb = node['codeTransformer'].applyDevicePasses( + self.ctxt, eb, node['nodeName']) + + # Emit compute core (template reads FIFOs etc. 
from eb) + node['template'].emit(node['opRepr'], executionBlock=eb) + + mlirBlocks.append((node, eb)) + + # === Runtime-sequence phase === + # Derive tensor type from the first node's numElements + _, firstEb = mlirBlocks[0] + numElements = firstEb.numElements + tensorTy = ir.MemRefType.get((numElements,), ir.BF16Type.get()) + + @aiex_d.runtime_sequence(tensorTy, tensorTy, tensorTy) def _seq(*args): - for _, template, op_repr, tilingConstraint in nodes: - template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint) + for node, eb in mlirBlocks: + eb.runtimeSequenceArgs = list(args) + log.info(f"[XDNA2] Runtime-sequence phase for '{node['nodeName']}'") + self.ctxt, eb = node['codeTransformer'].applyRuntimeSequencePasses( + self.ctxt, eb, node['nodeName']) module = ctx.module assert module.operation.verify(), \ "[XDNA2] Generated MLIR module failed verification" - mlir_str = str(module) - log.info(f"[XDNA2] MLIR module generated ({len(mlir_str)} bytes)") - return mlir_str + mlirStr = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlirStr)} bytes)") + return mlirStr diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 3a62d6f757..ab0b72be77 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,21 +1,21 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""XDNA2 MLIR template for BF16 elementwise Add. - -Uses ``aie.dialects`` (from the pip-installed ``mlir-aie`` package) to emit -verified MLIR operations into an existing module context provided by the -:class:`XDNA2Deployer`. +"""XDNA2 MLIR template for BF16 elementwise Add — compute kernel only. + +This template emits only the AIE core compute logic (FIFO +acquire → kernel call → FIFO release). 
ObjectFifo creation, external +kernel declaration, and DMA runtime-sequence configuration are handled +by :class:`MLIRObjectFifoPass` and :class:`MLIRRuntimeSequencePass` +respectively, which populate the :class:`MLIRExecutionBlock` before +this template's :meth:`emit` is called. """ from __future__ import annotations from typing import TYPE_CHECKING -import numpy as np - from aie.dialects import aie as aie_d -from aie.dialects import aiex as aiex_d from aie.dialects import arith as arith_d from aie.dialects import func as func_d from aie.dialects import scf as scf_d @@ -28,197 +28,74 @@ class XDNA2AddTemplate(MLIRNodeTemplate): - """MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + """Compute-only MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). - The :meth:`emit` method constructs a single-core AIE program with: + Emits an ``@aie_d.core`` block containing nested loops that acquire + input/output ObjectFifo elements and call the vectorised + ``eltwise_add_bf16_vector`` kernel. - * Two input ObjectFifos and one output ObjectFifo (depth 2 for - double-buffering). - * A compute core that loops, acquiring / releasing FIFO elements and - calling the vectorised ``eltwise_add_bf16_vector`` kernel. - * A runtime sequence that configures shim DMA for L3 ↔ L1 transfers. - - Parameters are extracted from the *operatorRepresentation* populated - by the parser (``size`` = total number of BF16 elements). + All ObjectFifo creation and DMA configuration is performed by + upstream :class:`MLIRCodeTransformationPass` instances. This + template reads FIFO names, tile size, and kernel metadata from the + :class:`MLIRExecutionBlock` passed through ``kwargs['executionBlock']``. 
""" - KERNEL_FN = "eltwise_add_bf16_vector" - KERNEL_OBJ = "add.o" - MAX_TILE_SIZE = 1024 - def __init__(self): super().__init__() # ------------------------------------------------------------------ - # Parameter helpers - # ------------------------------------------------------------------ - - def getAIEParams(self, operatorRepresentation: OperatorRepresentation, - tilingConstraint=None) -> dict: - """Extract AIE parameters from the operator representation. - - If tilingConstraint is available (tiling enabled), use information - from it. Otherwise fall back to fixed tile sizes. - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Parsed operator representation containing 'size' (total elements). - tilingConstraint : PatternMemoryConstraints, optional - Tiling solution from the solver. If provided, tile size is derived - from the tiling solution. - - Returns - ------- - dict - ``num_elements``, ``tile_size`` (from tiling solution if available, - otherwise clamped to MAX_TILE_SIZE). 
- """ - num_elements = int(operatorRepresentation['size']) - - # If tiling is enabled, extract tile size from the tiling solution - if tilingConstraint is not None: - # tilingConstraint is a PatternMemoryConstraints with nodeConstraints - nodeConstraint = tilingConstraint.nodeConstraints[0] - outputConstraints = nodeConstraint.outputTensorMemoryConstraints - if outputConstraints: - # Get the first output tensor's L1 memory constraint (tile shape) - firstOutputName = list(outputConstraints.keys())[0] - tensorConstraint = outputConstraints[firstOutputName] - # Use L1 constraint which holds the tile shape for the AIE core - if "L1" in tensorConstraint.memoryConstraints: - l1Constraint = tensorConstraint.memoryConstraints["L1"] - if l1Constraint.shape is not None: - tile_size = int(np.prod(l1Constraint.shape)) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - - if num_elements % tile_size != 0: - # Round down to the largest divisor of num_elements that fits - tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0) - - return { - 'num_elements': num_elements, - 'tile_size': tile_size, - } - - # ------------------------------------------------------------------ - # MLIR emission + # MLIR emission — compute kernel only # ------------------------------------------------------------------ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: - """Add AIE operations for a BF16 Add node into the current device context. - - Must be called inside an ``@aie_d.device(...)`` region (the deployer - sets this up). 
The following keyword arguments are expected: - - * ``compute_tile`` — result of ``aie_d.tile(col, row)`` - * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` - * ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Parsed operator representation with 'size' and other attributes - **kwargs - compute_tile, shim_tile, tilingConstraint (optional) + """Emit the AIE core compute block for a BF16 Add node. + + Must be called inside an ``@aie_d.device(...)`` region **after** + the device-phase code-transformation passes have populated the + :class:`MLIRExecutionBlock`. + + Expected keyword arguments + -------------------------- + executionBlock : MLIRExecutionBlock + Carries ``computeTile``, ``fifoMap``, ``fifoTypes``, + ``tileSize``, ``numTiles``, ``kernelFuncName``, and + ``kernelObjFile`` — all set by ``MLIRObjectFifoPass``. """ - tilingConstraint = kwargs.get('tilingConstraint', None) - params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) - num_elements = params['num_elements'] - tile_size = params['tile_size'] - num_tiles = num_elements // tile_size + eb = kwargs['executionBlock'] - compute_tile = kwargs['compute_tile'] - shim_tile = kwargs['shim_tile'] + computeTile = eb.computeTile + tileSize = eb.tileSize + numTiles = eb.numTiles + kernelFn = eb.kernelFuncName + kernelObj = eb.kernelObjFile - # MemRef types - tile_ty = ir.MemRefType.get((tile_size,), ir.BF16Type.get()) + # MemRef / scalar types + tileTy = eb.fifoTypes[list(eb.fifoTypes.keys())[0]] i32 = ir.IntegerType.get_signless(32) - # ObjectFifos (depth 2 for double-buffering) - aie_d.object_fifo("in1_0", shim_tile, [compute_tile], 2, tile_ty) - aie_d.object_fifo("in2_0", shim_tile, [compute_tile], 2, tile_ty) - aie_d.object_fifo("out_0", compute_tile, [shim_tile], 2, tile_ty) + # FIFO names (populated by MLIRObjectFifoPass) + in1Fifo = eb.fifoMap['data_in_1'] + in2Fifo = 
eb.fifoMap['data_in_2'] + outFifo = eb.fifoMap['data_out'] - # External kernel declaration - aie_d.external_func(self.KERNEL_FN, [tile_ty, tile_ty, tile_ty, i32]) - - # Compute core - @aie_d.core(compute_tile, link_with=self.KERNEL_OBJ) + @aie_d.core(computeTile, link_with=kernelObj) def _core(): - subview_ty = aie_d.ObjectFifoSubviewType.get(tile_ty) + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): - for _ in scf_d.for_(0, num_tiles, 1): - acq_in1 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in1_0", 1) - elem_in1 = aie_d.objectfifo_subview_access(tile_ty, acq_in1, 0) - acq_in2 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in2_0", 1) - elem_in2 = aie_d.objectfifo_subview_access(tile_ty, acq_in2, 0) - acq_out = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Produce, "out_0", 1) - elem_out = aie_d.objectfifo_subview_access(tile_ty, acq_out, 0) - size_val = arith_d.constant(i32, tile_size) - func_d.call([], self.KERNEL_FN, [elem_in1, elem_in2, elem_out, size_val]) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in1_0", 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in2_0", 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, "out_0", 1) + for _ in scf_d.for_(0, numTiles, 1): + acqIn1 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in1Fifo, 1) + elemIn1 = aie_d.objectfifo_subview_access(tileTy, acqIn1, 0) + acqIn2 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in2Fifo, 1) + elemIn2 = aie_d.objectfifo_subview_access(tileTy, acqIn2, 0) + acqOut = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, outFifo, 1) + elemOut = aie_d.objectfifo_subview_access(tileTy, acqOut, 0) + sizeVal = arith_d.constant(i32, tileSize) + func_d.call([], kernelFn, [elemIn1, elemIn2, elemOut, sizeVal]) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in1Fifo, 1) + 
aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in2Fifo, 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, outFifo, 1) scf_d.yield_([]) scf_d.yield_([]) - def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, - seq_args: list, tilingConstraint=None) -> None: - """Emit DMA configuration inside a runtime_sequence block. - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Node representation (used to extract ``num_elements``). - seq_args : list - Block arguments of the runtime_sequence (memref values for - in1, in2, out — in the order matching the ONNX graph I/O). - tilingConstraint : NodeMemoryConstraint, optional - Tiling solution from the solver (currently ignored, for future use). - """ - params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) - num_elements = params['num_elements'] - - dims = [ - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=num_elements, stride=1), - ] - - in1, in2, out = seq_args[0], seq_args[1], seq_args[2] - - task_in1 = aiex_d.dma_configure_task_for("in1_0") - block_in1 = task_in1.body.blocks.append() - with ir.InsertionPoint(block_in1): - aie_d.dma_bd(in1, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_in1) - - task_in2 = aiex_d.dma_configure_task_for("in2_0") - block_in2 = task_in2.body.blocks.append() - with ir.InsertionPoint(block_in2): - aie_d.dma_bd(in2, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_in2) - - task_out = aiex_d.dma_configure_task_for("out_0", issue_token=True) - block_out = task_out.body.blocks.append() - with ir.InsertionPoint(block_out): - aie_d.dma_bd(out, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_out) - aiex_d.dma_await_task(task_out) - 
aiex_d.dma_free_task(task_in1) - aiex_d.dma_free_task(task_in2) - referenceTemplate = XDNA2AddTemplate() From 14f3ced77de9fabc630be440d483896df4102abe Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 14:47:33 +0100 Subject: [PATCH 07/16] Template is agnostic of tiling and data movement that are handled by code transformation passes --- Deeploy/MLIRDataTypes.py | 4 + Deeploy/Targets/XDNA2/Bindings.py | 5 + .../MLIRComputeCorePass.py | 116 ++++++++++++++++++ .../CodeTransformationPasses/__init__.py | 1 + Deeploy/Targets/XDNA2/Deployer.py | 9 +- .../Targets/XDNA2/Templates/AddTemplate.py | 97 +++++---------- 6 files changed, 159 insertions(+), 73 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 642fb6fef1..1f8a2be446 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -76,6 +76,10 @@ def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: self.kernelFuncName: Optional[str] = None self.kernelObjFile: Optional[str] = None + # The MLIRNodeTemplate for this node (set by deployer, called by + # MLIRComputeCorePass to emit the kernel call inside the core block) + self.template: Optional[Any] = None + # Set by deployer before runtime-sequence phase self.runtimeSequenceArgs: List[Any] = [] diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index e30bbc6646..14b8b0317a 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -6,6 +6,7 @@ from Deeploy.CommonExtensions.DataTypes import bfloat16_t from Deeploy.DeeployTypes import NodeBinding from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import MLIRComputeCorePass from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass from 
Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass from Deeploy.Targets.XDNA2.Templates import AddTemplate @@ -22,6 +23,10 @@ kernelFuncName = "eltwise_add_bf16_vector", kernelObjFile = "add.o", ), + MLIRComputeCorePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), ], runtimeSequencePasses = [ MLIRRuntimeSequencePass( diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py new file mode 100644 index 0000000000..7d06fab241 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits the AIE core block with tiling loops. + +This pass constructs the structural MLIR around the compute kernel: + +1. Opens an ``@aie_d.core`` block linked to the kernel object file. +2. Opens an infinite outer ``scf.for`` loop (streaming). +3. Opens an inner ``scf.for`` tiling loop (``numTiles`` iterations). +4. Acquires input/output ObjectFifo elements. +5. Builds a modified ``operatorRepresentation`` where tensor keys + (e.g. ``data_in_1``) are replaced with the acquired MLIR memref + values and ``size`` is replaced with the tile size — mirroring + how ``TilingVariableReplacement`` rewrites buffer names for C + backends. +6. Calls ``template.emit(modifiedOpRepr)`` — the template only emits + its ``func_d.call`` using values from ``operatorRepresentation``. +7. Releases all FIFO elements and closes loops. + +The pass is operator-agnostic: it only needs the tensor key lists and +reads everything else from the :class:`MLIRExecutionBlock` populated by +prior passes (e.g. :class:`MLIRObjectFifoPass`). 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import scf as scf_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRComputeCorePass(MLIRCodeTransformationPass): + """Emit ``@aie_d.core`` with tiling loops and FIFO acquire/release. + + The template stored on ``mlirBlock.template`` is called inside the + inner loop with a *modified* ``operatorRepresentation`` whose tensor + entries point to acquired MLIR memref values instead of buffer name + strings. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. + """ + + def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + kernelObj = mlirBlock.kernelObjFile + tileSize = mlirBlock.tileSize + numTiles = mlirBlock.numTiles + opRepr = mlirBlock.operatorRepresentation + template = mlirBlock.template + + # Use the first tensor's type as representative tile memref type + firstKey = self.inputTensorKeys[0] + tileTy = mlirBlock.fifoTypes[firstKey] + + @aie_d.core(computeTile, link_with=kernelObj) + def _core(): + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, numTiles, 1): + # Acquire all input FIFO elements + acquiredElements = {} + for key in self.inputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire( + subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = 
aie_d.objectfifo_subview_access( + tileTy, acq, 0) + + # Acquire all output FIFO elements + for key in self.outputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire( + subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access( + tileTy, acq, 0) + + # Build modified opRepr: replace tensor names with MLIR + # values, replace size with tile size. This mirrors the + # C backend's TilingVariableReplacement pass. + modifiedOpRepr = {**opRepr, 'size': tileSize, **acquiredElements} + + # Call the template — it only emits func_d.call() + template.emit(modifiedOpRepr) + + # Release all inputs + for key in self.inputTensorKeys: + aie_d.objectfifo_release( + aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + # Release all outputs + for key in self.outputTensorKeys: + aie_d.objectfifo_release( + aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + + scf_d.yield_([]) + scf_d.yield_([]) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py index aae227155a..fe25ee3fdf 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -3,4 +3,5 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 16cda89891..f4a0e0a365 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -146,17 +146,18 @@ def _device(): eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) eb.operatorRepresentation = node['opRepr'] eb.patternMemoryConstraint = 
node['tilingConstraint'] + eb.template = node['template'] log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + (" (tiled)" if node['tilingConstraint'] else "")) - # Run device-phase passes (ObjectFifo creation, kernel decl) + # Run device-phase passes: + # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel + # 2. MLIRComputeCorePass — opens core + loops, calls + # template.emit() with acquired FIFO elements in opRepr self.ctxt, eb = node['codeTransformer'].applyDevicePasses( self.ctxt, eb, node['nodeName']) - # Emit compute core (template reads FIFOs etc. from eb) - node['template'].emit(node['opRepr'], executionBlock=eb) - mlirBlocks.append((node, eb)) # === Runtime-sequence phase === diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index ab0b72be77..7a13b0625f 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,24 +1,25 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""XDNA2 MLIR template for BF16 elementwise Add — compute kernel only. - -This template emits only the AIE core compute logic (FIFO -acquire → kernel call → FIFO release). ObjectFifo creation, external -kernel declaration, and DMA runtime-sequence configuration are handled -by :class:`MLIRObjectFifoPass` and :class:`MLIRRuntimeSequencePass` -respectively, which populate the :class:`MLIRExecutionBlock` before -this template's :meth:`emit` is called. +"""XDNA2 MLIR template for BF16 elementwise Add — pure compute primitive. + +This template emits **only** a ``func_d.call`` to the vectorised +``eltwise_add_bf16_vector`` kernel. It receives its operands (acquired +ObjectFifo element memrefs) and tile size through +``operatorRepresentation``, exactly like a C Mako template receives +buffer-name strings. 
+ +All structural MLIR (``@aie_d.core``, loops, FIFO acquire/release, +ObjectFifo creation, DMA configuration) is handled by +:class:`MLIRCodeTransformationPass` instances upstream. """ from __future__ import annotations from typing import TYPE_CHECKING -from aie.dialects import aie as aie_d from aie.dialects import arith as arith_d from aie.dialects import func as func_d -from aie.dialects import scf as scf_d import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRNodeTemplate @@ -28,74 +29,32 @@ class XDNA2AddTemplate(MLIRNodeTemplate): - """Compute-only MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + """Pure compute-primitive for BF16 elementwise Add on XDNA2. - Emits an ``@aie_d.core`` block containing nested loops that acquire - input/output ObjectFifo elements and call the vectorised - ``eltwise_add_bf16_vector`` kernel. + ``emit()`` is called by :class:`MLIRComputeCorePass` inside an + already-open ``@aie_d.core`` + tiling-loop context, with + ``operatorRepresentation`` entries replaced by live MLIR values: - All ObjectFifo creation and DMA configuration is performed by - upstream :class:`MLIRCodeTransformationPass` instances. This - template reads FIFO names, tile size, and kernel metadata from the - :class:`MLIRExecutionBlock` passed through ``kwargs['executionBlock']``. + * ``data_in_1``, ``data_in_2``, ``data_out`` — acquired memref + elements (from ObjectFifo acquire). + * ``size`` — tile size (Python int). """ + KERNEL_FN = "eltwise_add_bf16_vector" + def __init__(self): super().__init__() - # ------------------------------------------------------------------ - # MLIR emission — compute kernel only - # ------------------------------------------------------------------ - def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: - """Emit the AIE core compute block for a BF16 Add node. 
- - Must be called inside an ``@aie_d.device(...)`` region **after** - the device-phase code-transformation passes have populated the - :class:`MLIRExecutionBlock`. - - Expected keyword arguments - -------------------------- - executionBlock : MLIRExecutionBlock - Carries ``computeTile``, ``fifoMap``, ``fifoTypes``, - ``tileSize``, ``numTiles``, ``kernelFuncName``, and - ``kernelObjFile`` — all set by ``MLIRObjectFifoPass``. - """ - eb = kwargs['executionBlock'] - - computeTile = eb.computeTile - tileSize = eb.tileSize - numTiles = eb.numTiles - kernelFn = eb.kernelFuncName - kernelObj = eb.kernelObjFile - - # MemRef / scalar types - tileTy = eb.fifoTypes[list(eb.fifoTypes.keys())[0]] + """Emit a single ``func.call`` to the vectorised Add kernel.""" i32 = ir.IntegerType.get_signless(32) - - # FIFO names (populated by MLIRObjectFifoPass) - in1Fifo = eb.fifoMap['data_in_1'] - in2Fifo = eb.fifoMap['data_in_2'] - outFifo = eb.fifoMap['data_out'] - - @aie_d.core(computeTile, link_with=kernelObj) - def _core(): - subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) - for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): - for _ in scf_d.for_(0, numTiles, 1): - acqIn1 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in1Fifo, 1) - elemIn1 = aie_d.objectfifo_subview_access(tileTy, acqIn1, 0) - acqIn2 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in2Fifo, 1) - elemIn2 = aie_d.objectfifo_subview_access(tileTy, acqIn2, 0) - acqOut = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, outFifo, 1) - elemOut = aie_d.objectfifo_subview_access(tileTy, acqOut, 0) - sizeVal = arith_d.constant(i32, tileSize) - func_d.call([], kernelFn, [elemIn1, elemIn2, elemOut, sizeVal]) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in1Fifo, 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in2Fifo, 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, outFifo, 1) - scf_d.yield_([]) - scf_d.yield_([]) + sizeVal = 
arith_d.constant(i32, int(operatorRepresentation['size'])) + func_d.call([], self.KERNEL_FN, [ + operatorRepresentation['data_in_1'], + operatorRepresentation['data_in_2'], + operatorRepresentation['data_out'], + sizeVal, + ]) referenceTemplate = XDNA2AddTemplate() From b850b23d20597d8574aadbd5bacf54da02e665a9 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 09:42:55 +0100 Subject: [PATCH 08/16] Add CI on self hosted runner --- .github/workflows/_runner-xdna2.yml | 47 +++++++++++++++++++++++++ .github/workflows/ci-platform-xdna2.yml | 31 ++++++++++++++++ README_XDNA.md | 29 ++++++++++++--- 3 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/_runner-xdna2.yml create mode 100644 .github/workflows/ci-platform-xdna2.yml diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml new file mode 100644 index 0000000000..f48f99c932 --- /dev/null +++ b/.github/workflows/_runner-xdna2.yml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-xdna2 + +"on": + workflow_call: + inputs: + pytest-marker: + required: true + type: string + docker-image: + required: false + type: string + default: "deeploy-xdna:local" + +jobs: + test-runner-xdna2: + runs-on: xdna2-npu + # NOTE: We cannot use the `container:` directive here because + # GitHub Actions does not support `--device` flags required for + # NPU access (/dev/accel/accel0). Instead we use explicit + # `docker run` commands. + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Run Tests in Docker + shell: bash + run: | + docker run --rm \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /opt/xilinx:/opt/xilinx \ + -v "${{ github.workspace }}":/app/Deeploy \ + -w /app/Deeploy \ + ${{ inputs.docker-image }} \ + bash -c " + pip install -e . 
&& + pip install -r requirements-dev.txt && + cd DeeployTest && + pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' + " diff --git a/.github/workflows/ci-platform-xdna2.yml b/.github/workflows/ci-platform-xdna2.yml new file mode 100644 index 0000000000..ccf455edf7 --- /dev/null +++ b/.github/workflows/ci-platform-xdna2.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • XDNA2 + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image: + description: "XDNA2 Docker image (must be pre-built on the runner)" + required: false + default: "deeploy-xdna:local" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + xdna2-kernels: + uses: ./.github/workflows/_runner-xdna2.yml + with: + pytest-marker: "kernels" + docker-image: ${{ inputs.docker_image || 'deeploy-xdna:local' }} diff --git a/README_XDNA.md b/README_XDNA.md index a96a3550c8..56cfcb1225 100644 --- a/README_XDNA.md +++ b/README_XDNA.md @@ -12,8 +12,6 @@ You need to have XRT installed on your host, once installed it is present in `/o docker run -it \ --device /dev/accel/accel0 \ --ulimit memlock=-1 \ - -v /scratch/jungvi/IRON:/opt/IRON \ - -e IRON_OPERATORS_DIR=/opt/IRON/iron/operators \ -v "$(pwd)":/app/Deeploy \ -v /opt/xilinx:/opt/xilinx \ --name deeploy_dev \ @@ -22,9 +20,32 @@ docker run -it \ Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. 
-Once the container is started you can a simple Add node, from ONNX to execution with: +Once the container is started you can run a simple Add node, from ONNX to execution with: ``` pip install -e ./ && \ cd DeeployTest && \ python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ -``` \ No newline at end of file +``` + +## CI with a Self-Hosted Runner + +XDNA2 tests run on a self-hosted GitHub Actions runner with NPU access. +The Docker image is built locally on the runner (not distributed via GHCR). + +### One-time setup on the runner machine + +1. Build the Docker image: + ``` + docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . + ``` + +2. Register the GitHub Actions runner (Settings → Actions → Runners → New self-hosted runner). + Use the label **`xdna2-npu`** and install as a service: + ``` + ./svc.sh install && ./svc.sh start + ``` + +3. Make sure the runner user has access to `/dev/accel/accel0` (e.g. is in the `render` group). + +Once the runner is registered, pushes and PRs automatically trigger the +`CI • XDNA2` workflow defined in `.github/workflows/ci-platform-xdna2.yml`. \ No newline at end of file From d79e36fba00359f13885144eaad990ca0f96fea8 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 09:52:53 +0100 Subject: [PATCH 09/16] Remove unecessary install --- .github/workflows/_runner-xdna2.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml index f48f99c932..0463c54137 100644 --- a/.github/workflows/_runner-xdna2.yml +++ b/.github/workflows/_runner-xdna2.yml @@ -41,7 +41,6 @@ jobs: ${{ inputs.docker-image }} \ bash -c " pip install -e . 
&& - pip install -r requirements-dev.txt && cd DeeployTest && pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' " From 7c995bca034b29d012825a5fa803db159caeea82 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 10:00:47 +0100 Subject: [PATCH 10/16] Add cleanup step before checkout to fix permission --- .github/workflows/_runner-xdna2.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml index 0463c54137..d9ba66d8a7 100644 --- a/.github/workflows/_runner-xdna2.yml +++ b/.github/workflows/_runner-xdna2.yml @@ -24,6 +24,14 @@ jobs: # NPU access (/dev/accel/accel0). Instead we use explicit # `docker run` commands. steps: + - name: Fix workspace permissions + shell: bash + run: | + docker run --rm \ + -v "${{ github.workspace }}":/workspace \ + ${{ inputs.docker-image }} \ + chown -R $(id -u):$(id -g) /workspace || true + - name: Checkout Repo uses: actions/checkout@v4 with: From fc2b364c03708c49bc110a38e5918c9c874c9153 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:00:53 +0100 Subject: [PATCH 11/16] aie import is optional to not enforce mlir-aie and llvm-aie package installation --- DeeployTest/testUtils/platformMapping.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index eaa9b2503f..58e2e1c396 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -29,9 +29,6 @@ from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform -from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer -from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, XDNA2Optimizer, \ - 
XDNA2Platform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] _NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] @@ -80,6 +77,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: Platform = ChimeraPlatform() elif platformName == "XDNA2": + from Deeploy.Targets.XDNA2.Platform import XDNA2Platform Platform = XDNA2Platform() else: @@ -279,7 +277,18 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + else: + # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms + try: + from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer + from Deeploy.Targets.XDNA2.Platform import (MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, + XDNA2Optimizer, XDNA2Platform) + except ImportError: + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if not isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + if loweringOptimizer is None: loweringOptimizer = XDNA2Optimizer @@ -295,7 +304,4 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - else: - raise RuntimeError(f"Deployer for platform {platform} is not implemented") - return deployer From 1865530ba68770b70e510fca25d2b81908dcd298 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:01:22 +0100 Subject: [PATCH 12/16] Decouple xdna requirements from dev requirements --- Container/Dockerfile.deeploy-xdna | 11 +++-------- requirements-dev.txt | 7 ------- requirements-xdna.txt | 10 ++++++++++ 3 files changed, 13 insertions(+), 15 deletions(-) create mode 100644 requirements-xdna.txt diff --git 
a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna index fd62657740..16907402df 100644 --- a/Container/Dockerfile.deeploy-xdna +++ b/Container/Dockerfile.deeploy-xdna @@ -40,17 +40,12 @@ ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib WORKDIR /app -COPY pyproject.toml ./ +COPY pyproject.toml requirements-xdna.txt ./ RUN pip install toml-to-requirements && \ toml-to-req --toml-file pyproject.toml && \ pip install -r requirements.txt && \ - rm -f requirements.txt pyproject.toml - -RUN pip install \ - --extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 \ - --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ - "mlir_aie==v1.2.1" \ - llvm-aie + pip install -r requirements-xdna.txt && \ + rm -f requirements.txt pyproject.toml requirements-xdna.txt ENV MLIR_AIE_PYTHON=/usr/bin/python3 diff --git a/requirements-dev.txt b/requirements-dev.txt index 5cbdc0ef64..6d047b4957 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,13 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 ---extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 ---extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly ---extra-index-url https://pypi.org/simple - -mlir_aie==v1.2.1 -llvm-aie - # Quality of life netron debugpy diff --git a/requirements-xdna.txt b/requirements-xdna.txt new file mode 100644 index 0000000000..21204f5987 --- /dev/null +++ b/requirements-xdna.txt @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie==v1.2.1 +llvm-aie From de6f9616d7034f6bf472845e3a23af3a23469a06 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 
17:02:14 +0100 Subject: [PATCH 13/16] Format --- DeeployTest/testUtils/platformMapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 58e2e1c396..9155ed77ae 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -281,8 +281,8 @@ def mapDeployer(platform: DeploymentPlatform, # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms try: from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer - from Deeploy.Targets.XDNA2.Platform import (MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, - XDNA2Optimizer, XDNA2Platform) + from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, \ + XDNA2Optimizer, XDNA2Platform except ImportError: raise RuntimeError(f"Deployer for platform {platform} is not implemented") From 01d458bfcf7bde410aaf3481e53fcbba20ae1c03 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:02:32 +0100 Subject: [PATCH 14/16] Format --- Deeploy/MLIRDataTypes.py | 20 +- .../MLIRComputeCorePass.py | 24 +- .../MLIRObjectFifoPass.py | 9 +- .../MLIRRuntimeSequencePass.py | 18 +- .../CodeTransformationPasses/__init__.py | 2 +- Deeploy/Targets/XDNA2/Deployer.py | 20 +- Deeploy/Targets/XDNA2/Platform.py | 27 +- .../Targets/XDNA2/Templates/AddTemplate.py | 2 +- Deeploy/Targets/XDNA2/Tiler.py | 7 +- DeeployTest/Platforms/XDNA2/main.cpp | 318 +++++++++--------- DeeployTest/deeployRunner_xdna2.py | 3 +- DeeployTest/generateNetwork_xdna2.py | 28 +- TargetLibraries/XDNA2/kernels/add.cc | 66 ++-- 13 files changed, 261 insertions(+), 283 deletions(-) diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 1f8a2be446..8305c26a04 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -24,18 +24,18 @@ from __future__ import annotations from abc import abstractmethod -from typing import Any, Dict, List, Optional, 
TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from Deeploy.DeeployTypes import NodeTemplate if TYPE_CHECKING: from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation - # ====================================================================== # MLIRExecutionBlock # ====================================================================== + class MLIRExecutionBlock: """MLIR-specific execution state for a single operator. @@ -92,6 +92,7 @@ def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: # MLIRCodeTransformationPass / MLIRCodeTransformation # ====================================================================== + class MLIRCodeTransformationPass: """Base class for passes that transform an :class:`MLIRExecutionBlock`. @@ -99,9 +100,7 @@ class MLIRCodeTransformationPass: and optionally emit MLIR operations into the current insertion point. """ - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: return ctxt, mlirBlock @@ -125,17 +124,13 @@ def __init__(self, self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] - def applyDevicePasses(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def applyDevicePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: for _pass in self.devicePasses: ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) return ctxt, mlirBlock - def applyRuntimeSequencePasses(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def applyRuntimeSequencePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: for _pass in self.runtimeSequencePasses: ctxt, 
mlirBlock = _pass.apply(ctxt, mlirBlock, name) @@ -146,6 +141,7 @@ def applyRuntimeSequencePasses(self, # MLIRNodeTemplate # ====================================================================== + class MLIRNodeTemplate(NodeTemplate): """NodeTemplate subclass that emits MLIR instead of C code. @@ -188,7 +184,7 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None # NodeTemplate overrides # ------------------------------------------------------------------ - def generate(self, operatorRepresentation={}, **kwargs) -> str: + def generate(self, operatorRepresentation = {}, **kwargs) -> str: """Generate an MLIR string for this node. This default implementation is a thin wrapper: it delegates to diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py index 7d06fab241..2f58acc852 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -56,9 +56,7 @@ def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> N self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: computeTile = mlirBlock.computeTile kernelObj = mlirBlock.kernelObjFile @@ -71,7 +69,7 @@ def apply(self, firstKey = self.inputTensorKeys[0] tileTy = mlirBlock.fifoTypes[firstKey] - @aie_d.core(computeTile, link_with=kernelObj) + @aie_d.core(computeTile, link_with = kernelObj) def _core(): subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): @@ -80,18 +78,14 @@ def _core(): acquiredElements = {} for key in self.inputTensorKeys: fifoName = mlirBlock.fifoMap[key] - acq = 
aie_d.objectfifo_acquire( - subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) - acquiredElements[key] = aie_d.objectfifo_subview_access( - tileTy, acq, 0) + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) # Acquire all output FIFO elements for key in self.outputTensorKeys: fifoName = mlirBlock.fifoMap[key] - acq = aie_d.objectfifo_acquire( - subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) - acquiredElements[key] = aie_d.objectfifo_subview_access( - tileTy, acq, 0) + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) # Build modified opRepr: replace tensor names with MLIR # values, replace size with tile size. This mirrors the @@ -103,12 +97,10 @@ def _core(): # Release all inputs for key in self.inputTensorKeys: - aie_d.objectfifo_release( - aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) # Release all outputs for key in self.outputTensorKeys: - aie_d.objectfifo_release( - aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) scf_d.yield_([]) scf_d.yield_([]) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py index be6b492906..d49b0e4c03 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -23,10 +23,9 @@ from typing import TYPE_CHECKING, Tuple +import aie.ir as ir import numpy as np - from aie.dialects import aie as aie_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock @@ -89,7 +88,7 @@ def __init__(self, 
outputTensorKeys: list, kernelFuncName: str, kernelObjFile: str, - kernelArgTypes=None, + kernelArgTypes = None, fifoDepth: int = 2) -> None: self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys @@ -98,9 +97,7 @@ def __init__(self, self._kernelArgTypes = kernelArgTypes self.fifoDepth = fifoDepth - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: opRepr = mlirBlock.operatorRepresentation numElements = int(opRepr['size']) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py index 18a4607328..6331bd0914 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -18,9 +18,9 @@ from typing import TYPE_CHECKING, Tuple +import aie.ir as ir from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock @@ -43,18 +43,16 @@ def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: numElements = mlirBlock.numElements seqArgs = mlirBlock.runtimeSequenceArgs dims = [ - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=numElements, stride=1), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + 
aie_d.bd_dim_layout(size = numElements, stride = 1), ] # Build ordered list of (fifoName, seqArg, isOutput) @@ -70,12 +68,12 @@ def apply(self, for fifoName, seqArg, isOutput in transfers: if isOutput: - task = aiex_d.dma_configure_task_for(fifoName, issue_token=True) + task = aiex_d.dma_configure_task_for(fifoName, issue_token = True) else: task = aiex_d.dma_configure_task_for(fifoName) block = task.body.blocks.append() with ir.InsertionPoint(block): - aie_d.dma_bd(seqArg, offset=0, len=numElements, dimensions=dims, burst_length=0) + aie_d.dma_bd(seqArg, offset = 0, len = numElements, dimensions = dims, burst_length = 0) aie_d.end() aiex_d.dma_start_task(task) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py index fe25ee3fdf..f7843db7b3 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index f4a0e0a365..0ea3e2491d 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -21,12 +21,11 @@ from typing import Callable, Dict, Optional, Type +import aie.ir as ir import onnx_graphsurgeon as gs - -from aie.extras.context import mlir_mod_ctx from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d -import aie.ir as ir +from aie.extras.context import mlir_mod_ctx from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer 
@@ -111,13 +110,11 @@ def generateMLIR(self) -> str: tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): - raise RuntimeError( - f"Node '{nodeName}' has no MLIRNodeTemplate — " - f"only BF16 Add is supported in this release.") + raise RuntimeError(f"Node '{nodeName}' has no MLIRNodeTemplate — " + f"only BF16 Add is supported in this release.") if not isinstance(codeTransformer, MLIRCodeTransformation): - raise RuntimeError( - f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " - f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") + raise RuntimeError(f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") nodes.append({ 'nodeName': nodeName, @@ -143,7 +140,7 @@ def _device(): # === Device phase === for node in nodes: # Create MLIRExecutionBlock with deployer-level state - eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) + eb = MLIRExecutionBlock(computeTile = computeTile, shimTile = shimTile) eb.operatorRepresentation = node['opRepr'] eb.patternMemoryConstraint = node['tilingConstraint'] eb.template = node['template'] @@ -155,8 +152,7 @@ def _device(): # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel # 2. 
MLIRComputeCorePass — opens core + loops, calls # template.emit() with acquired FIFO elements in opRepr - self.ctxt, eb = node['codeTransformer'].applyDevicePasses( - self.ctxt, eb, node['nodeName']) + self.ctxt, eb = node['codeTransformer'].applyDevicePasses(self.ctxt, eb, node['nodeName']) mlirBlocks.append((node, eb)) diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py index 4a186aca7c..b54ce8acb9 100644 --- a/Deeploy/Targets/XDNA2/Platform.py +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -9,15 +9,15 @@ from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Layers import AddLayer -from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings # Standard mapper for non-tiled deployment XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) -# Tiling-ready mapper for tiled deployment +# Tiling-ready mapper for tiled deployment XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) # Standard mapping (used when tiling is disabled) @@ -64,8 +64,7 @@ class XDNA2StructBuffer(StructBuffer): class XDNA2Engine(DeploymentEngine): - def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", - includeList = None) -> None: + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", includeList = None) -> None: if includeList is None: includeList = [] super().__init__(name, Mapping, initCode, includeList) @@ -73,13 +72,17 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = class 
XDNA2AIECoreEngine(DeploymentEngine): """AIE core execution engine with L1 local memory as preferred memory level. - + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. Data is transferred from L3 (shared memory) to L1 as needed. """ - def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "", - includeList = None, preferredMemoryLevel: str = "L1") -> None: + def __init__(self, + name: str = "XDNA2_AIE_Core", + Mapping = XDNA2Mapping, + initCode: str = "", + includeList = None, + preferredMemoryLevel: str = "L1") -> None: if includeList is None: includeList = [] super().__init__(name, Mapping, initCode, includeList) @@ -101,7 +104,7 @@ def __init__(self, class MemoryXDNA2Platform(MemoryPlatform): """XDNA2 platform with memory hierarchy support for tiling. - + Defines the memory hierarchy: - L1: 8KB per AIE core (local memory) - L3: Shared memory for entire AIE array @@ -122,7 +125,7 @@ def __init__(self, def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: """Get the target memory level for a tensor in a given node. - + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). Otherwise use the default target memory level (typically L3). 
""" @@ -131,14 +134,14 @@ def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkCont engine = node._engine_assignment if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): return engine.preferredMemoryLevel - + return self.defaultTargetMemoryLevel.name class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): """Wrapper for XDNA2Platform with memory-level support.""" - def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel): assert isinstance(platform, XDNA2Platform), \ f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}" @@ -150,5 +153,5 @@ def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkCont engine = node._engine_assignment if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): return engine.preferredMemoryLevel - + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 7a13b0625f..6c526a9e38 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -18,9 +18,9 @@ from typing import TYPE_CHECKING +import aie.ir as ir from aie.dialects import arith as arith_d from aie.dialects import func as func_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRNodeTemplate diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py index 9754aa0688..b2282c34b0 100644 --- a/Deeploy/Targets/XDNA2/Tiler.py +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" from 
Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint @@ -10,7 +9,5 @@ # For Add operator, reuse the generic BOP (Binary Operator) tile constraint # which handles equal-dimension binary operations -XDNA2AddTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings=XDNA2AddBindings, - tileConstraint=AddTileConstraint() -) +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = XDNA2AddBindings, + tileConstraint = AddTileConstraint()) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 046384e4db..20d748265d 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -22,181 +22,185 @@ #include "xrt/xrt_kernel.h" // Generated by Deeploy's generateNetwork_xdna2.py: -// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} defines -// testoutputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_OUTPUT{i} defines +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} +// defines testoutputs.h – uint16_t arrays of BF16 bit patterns + +// N_ELEMENTS_OUTPUT{i} defines #include "testinputs.h" #include "testoutputs.h" // --------------------------------------------------------------------------- // BF16 helpers // --------------------------------------------------------------------------- -static float bf16_to_float(uint16_t bf16) -{ - uint32_t f32_bits = static_cast(bf16) << 16; - float f; - std::memcpy(&f, &f32_bits, sizeof(f)); - return f; +static float bf16_to_float(uint16_t bf16) { + uint32_t f32_bits = static_cast(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; } -static bool bf16_nearly_equal(uint16_t a, uint16_t b, - float rtol = 0.0f, float atol = 0.0f) -{ - // Default: allow 1 BF16 ULP difference to account for hardware rounding. - // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. 
- float fa = bf16_to_float(a); - float fb = bf16_to_float(b); - float diff = std::fabs(fa - fb); - - // Compute 1 ULP for the reference value's magnitude - uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) - float ulp; - if (ref_exp == 0) - ulp = std::ldexp(1.0f, -133); // subnormal ULP - else - ulp = std::ldexp(1.0f, static_cast(ref_exp) - 127 - 7); // 7 mantissa bits - - float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); - return diff <= tol; +static bool bf16_nearly_equal(uint16_t a, uint16_t b, float rtol = 0.0f, + float atol = 0.0f) { + // Default: allow 1 BF16 ULP difference to account for hardware rounding. + // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, + static_cast(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; } // --------------------------------------------------------------------------- // Read the NPU instruction binary produced by aiecc.py // --------------------------------------------------------------------------- -static std::vector read_instr_binary(const std::string &path) -{ - std::ifstream file(path, std::ios::binary); - if (!file.is_open()) { - throw std::runtime_error("Cannot open instruction file: " + path); - } - file.seekg(0, std::ios::end); - size_t byte_size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector instr(byte_size / sizeof(uint32_t)); - file.read(reinterpret_cast(instr.data()), byte_size); - return instr; +static std::vector read_instr_binary(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw 
std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast(instr.data()), byte_size); + return instr; } -int main(int argc, char **argv) -{ - // Paths to the compiled artefacts: default to the directory containing - // this binary so the test works regardless of the working directory or - // whether it is run inside a container. - std::string bin_dir; - { - std::string argv0(argv[0]); - auto sep = argv0.rfind('/'); - bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); +int main(int argc, char **argv) { + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; } - std::string xclbin_path = bin_dir + "/network.xclbin"; - std::string instr_path = bin_dir + "/npu_insts.bin"; - - bool verbose = false; - for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (arg == "-v" || arg == "--verbose" || arg == "-vv") { - verbose = true; - } + } + if (argc >= 2 && argv[1][0] != '-') + xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') + instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. 
Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. + // ----------------------------------------------------------------------- + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. 
Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), + n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast(n_instr), bo_in0, + bo_in1, bo_out); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) << " (0x" << std::hex + << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) << " (0x" + << std::hex << golden_out[i] << std::dec << ")" + << " diff=" + << std::fabs(bf16_to_float(hw_out[i]) - + bf16_to_float(golden_out[i])) + << "\n"; + } } - if (argc >= 2 && argv[1][0] != '-') xclbin_path = argv[1]; - if (argc >= 3 && argv[2][0] != '-') instr_path = argv[2]; - - // ----------------------------------------------------------------------- - // 1. Open XRT device, register xclbin, create hw_context - // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) - // ----------------------------------------------------------------------- - auto device = xrt::device(0); - auto xclbin = xrt::xclbin(xclbin_path); - device.register_xclbin(xclbin); - xrt::hw_context context(device, xclbin.get_uuid()); - auto kernel = xrt::kernel(context, "MLIR_AIE"); - - // ----------------------------------------------------------------------- - // 2. Read NPU instruction binary - // ----------------------------------------------------------------------- - std::vector instr_v = read_instr_binary(instr_path); - size_t n_instr = instr_v.size(); - - // ----------------------------------------------------------------------- - // 3. Derive element counts from the testinputs/testoutputs header defines. - // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set - // by generateNetwork_xdna2.py. 
- // ----------------------------------------------------------------------- - static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, - "Input 0 and input 1 must have the same number of elements"); - static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, - "Inputs and output must have the same number of elements"); - - const size_t n_elem = N_ELEMENTS_OUTPUT0; - const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes - const size_t buf_bytes = n_elem * elem_size; - - // ----------------------------------------------------------------------- - // 4. Allocate XRT buffer objects - // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) - // ----------------------------------------------------------------------- - auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_in0 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_in1 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); - - // ----------------------------------------------------------------------- - // 5. Copy data into device buffers - // ----------------------------------------------------------------------- - std::memcpy(bo_instr.map(), instr_v.data(), n_instr * sizeof(uint32_t)); - std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); - std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ----------------------------------------------------------------------- - // 6. 
Launch kernel and wait for completion - // opcode 3 = execute NPU instruction stream - // ----------------------------------------------------------------------- - unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, static_cast(n_instr), - bo_in0, bo_in1, bo_out); - run.wait(); - - // ----------------------------------------------------------------------- - // 7. Sync output back and compare against golden reference - // ----------------------------------------------------------------------- - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - const uint16_t *hw_out = bo_out.map(); - const uint16_t *golden_out = testOutputVector0; - - int errors = 0; - for (size_t i = 0; i < n_elem; ++i) { - bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); - if (!match) { - ++errors; - if (errors <= 10) { - std::cerr << " Mismatch at index " << i - << ": hw=" << bf16_to_float(hw_out[i]) - << " (0x" << std::hex << hw_out[i] << std::dec << ")" - << " ref=" << bf16_to_float(golden_out[i]) - << " (0x" << std::hex << golden_out[i] << std::dec << ")" - << " diff=" << std::fabs(bf16_to_float(hw_out[i]) - bf16_to_float(golden_out[i])) - << "\n"; - } - } - if (verbose) { - float hw_f = bf16_to_float(hw_out[i]); - float ref_f = bf16_to_float(golden_out[i]); - std::cout << "[" << i << "] hw=" << hw_f - << " ref=" << ref_f - << " diff=" << std::fabs(hw_f - ref_f) - << (match ? "" : " *** MISMATCH") - << "\n"; - } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") << "\n"; } + } - // Output format required by testUtils/core/output_parser.py - std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; - return (errors == 0) ? 
0 : 1; + return (errors == 0) ? 0 : 1; } diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py index 82be139d46..2fd1a40418 100644 --- a/DeeployTest/deeployRunner_xdna2.py +++ b/DeeployTest/deeployRunner_xdna2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. Usage (from DeeployTest/): @@ -14,4 +13,4 @@ from testUtils.deeployRunner import main if __name__ == '__main__': - sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True)) + sys.exit(main(default_platform = "XDNA2", default_simulator = "host", tiling_enabled = True)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 995eaabbb7..43fd941926 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """XDNA2 network generation script. 
JUNGVI: TODO: Move this script to ONNX4Deeploy @@ -16,12 +15,10 @@ """ import os -import struct import numpy as np import onnx import onnx_graphsurgeon as gs - from testUtils.platformMapping import mapDeployer from testUtils.testRunner import TestGeneratorArgumentParser @@ -149,8 +146,8 @@ def generateNetworkXDNA2(args): log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") - l1_level = MemoryLevel("L1", neighbourNames=["L3"], size=l1_size) - l3_level = MemoryLevel("L3", neighbourNames=["L1"], size=l3_size) + l1_level = MemoryLevel("L1", neighbourNames = ["L3"], size = l1_size) + l3_level = MemoryLevel("L3", neighbourNames = ["L1"], size = l3_size) memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 @@ -158,24 +155,23 @@ def generateNetworkXDNA2(args): # defaultTargetMemoryLevel=L1 tells the tiling framework that computation # targets L1, so it must tile data from L3 into L1-sized chunks. mem_platform = MemoryXDNA2Platform( - memoryHierarchy=memory_hierarchy, - defaultTargetMemoryLevel=l1_level, - engines=[XDNA2AIECoreEngine(Mapping=XDNA2TilingMapping, preferredMemoryLevel="L1")] - ) + memoryHierarchy = memory_hierarchy, + defaultTargetMemoryLevel = l1_level, + engines = [XDNA2AIECoreEngine(Mapping = XDNA2TilingMapping, preferredMemoryLevel = "L1")]) # Create base deployer with memory platform deployer = mapDeployer(mem_platform, graph, inputTypes, - scheduler=_tilingScheduler, - deeployStateDir=_DEEPLOYSTATEDIR, - inputOffsets=inputOffsets) + scheduler = _tilingScheduler, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) # Wrap with MemoryDeployerWrapper (adds memory level annotation) deployer = MemoryDeployerWrapper(deployer) # Wrap with TilerDeployerWrapper (adds tiling) - deployer = TilerDeployerWrapper(deployer, workDir=_DEEPLOYSTATEDIR) + deployer = TilerDeployerWrapper(deployer, workDir = _DEEPLOYSTATEDIR) # frontEnd() parses the graph; 
bind() triggers tiling via wrappers deployer.frontEnd() @@ -184,7 +180,7 @@ def generateNetworkXDNA2(args): log.info("[XDNA2] Tiling completed, proceeding with MLIR generation") # Create output directory - os.makedirs(args.dumpdir, exist_ok=True) + os.makedirs(args.dumpdir, exist_ok = True) # Write testinputs.h (raw BF16 bit patterns as uint16_t) testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) @@ -215,8 +211,8 @@ def generateNetworkXDNA2(args): if __name__ == '__main__': - parser = TestGeneratorArgumentParser(tiling_arguments=True, - description="Deeploy XDNA2 Code Generation Utility.") + parser = TestGeneratorArgumentParser(tiling_arguments = True, + description = "Deeploy XDNA2 Code Generation Utility.") args, _ = parser.parse_known_args() if args.platform != 'XDNA2': diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc index 1a53e47398..13b8b54637 100644 --- a/TargetLibraries/XDNA2/kernels/add.cc +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All +// rights reserved. 
SPDX-License-Identifier: Apache-2.0 #define NOCPP @@ -10,45 +10,45 @@ #include #include -template void eltwise_add(T_in *a, T_in *b, T_out *c, int size) -{ - for (int i = 0; i < size; i++) { - c[i] = a[i] + b[i]; - } +template +void eltwise_add(T_in *a, T_in *b, T_out *c, int size) { + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } } -template void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) -{ - constexpr int vec_factor = 16; - event0(); - T_in *__restrict pA1 = a; - T_in *__restrict pB1 = b; - T_out *__restrict pC1 = c; - const int F = size / vec_factor; - AIE_PREPARE_FOR_PIPELINING - AIE_LOOP_MIN_ITERATION_COUNT(16) - for (int i = 0; i < F; i++) { - aie::vector A0 = aie::load_v(pA1); - pA1 += vec_factor; - aie::vector B0 = aie::load_v(pB1); - pB1 += vec_factor; - aie::vector cout = aie::add(A0, B0); - aie::store_v(pC1, cout); - pC1 += vec_factor; - } - event1(); +template +void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) { + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); } extern "C" { -void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) -{ - eltwise_add(a_in, b_in, c_out, size); +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_add(a_in, b_in, c_out, size); } -void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) -{ - eltwise_vadd(a_in, b_in, c_out, size); +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_vadd(a_in, b_in, c_out, size); } } 
// extern "C" From 4427f5a7b0ac73777a75a30e973c0ea66a7715eb Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 26 Mar 2026 11:49:10 +0100 Subject: [PATCH 15/16] Add general todos for future refactoring --- CMakeLists.txt | 3 --- Deeploy/Targets/XDNA2/Bindings.py | 2 ++ DeeployTest/Platforms/XDNA2/main.cpp | 3 +++ DeeployTest/generateNetwork_xdna2.py | 12 +++++------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c23ccca7b..ffc4d64085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,9 +311,6 @@ if(platform STREQUAL XDNA2) message(STATUS "==============================================================================") message(STATUS "") - # XDNA2 uses its own CMakeLists.txt in DeeployTest/Platforms/XDNA2/ - # which handles the two-step build: xclbin -> host binary. - # AIE kernel compilation is in TargetLibraries/XDNA2/. add_subdirectory(TargetLibraries/XDNA2) add_subdirectory(DeeployTest/Platforms/XDNA2) diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index 14b8b0317a..1f0e7f7587 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -15,6 +15,8 @@ _ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] _ADD_OUTPUT_KEYS = ['data_out'] +# JUNGVI: TODO: This logic should not be boiled down for 1 operator but should be applied on every nodes of the network +# Likewise the kernelName and object file name should be specified in the node template of each operator. XDNA2Transformer = MLIRCodeTransformation( devicePasses = [ MLIRObjectFifoPass( diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 20d748265d..0cb5186f38 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -122,6 +122,7 @@ int main(int argc, char **argv) { // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set // by generateNetwork_xdna2.py. 
// ----------------------------------------------------------------------- + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs (with respect to the amount of bo available) static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, "Input 0 and input 1 must have the same number of elements"); static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, @@ -160,6 +161,8 @@ int main(int argc, char **argv) { // 6. Launch kernel and wait for completion // opcode 3 = execute NPU instruction stream // ----------------------------------------------------------------------- + // JUNGVI: TODO: Collect runtime and display it + // JUNGVI: TODO: Enable warmup iterations unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, static_cast(n_instr), bo_in0, bo_in1, bo_out); diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 43fd941926..969c41200f 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -3,8 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 """XDNA2 network generation script. -JUNGVI: TODO: Move this script to ONNX4Deeploy - Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. Instead of emitting C code it: @@ -32,7 +30,6 @@ def _tilingScheduler(graph: gs.Graph): - """Scheduler that returns List[List[gs.Node]] as required by the tiling framework.""" return [[node] for node in graph.nodes] @@ -135,13 +132,15 @@ def generateNetworkXDNA2(args): # Force bfloat16_t — BF16 test data stored as float32 in npz would be # inferred as float32_t by minimalFloatType, but the XDNA2 kernel # requires bfloat16_t inputs. + # JUNGVI: TODO: Align minimalFloatType to properly handle bf16 and don't force types. 
inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) inputOffsets[f"input_{index}"] = 0 _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + # JUNGVI: TODO: Extend with the whole NPU array # Define memory hierarchy: L1 (AIE core local) and L3 (shared) - l1_size = int(getattr(args, 'l1', None) or 8192) # 8KB default + l1_size = int(getattr(args, 'l1', None) or 64000) # 64KB default l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") @@ -151,9 +150,7 @@ def generateNetworkXDNA2(args): memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 - # Create memory-aware platform with AIE core engine - # defaultTargetMemoryLevel=L1 tells the tiling framework that computation - # targets L1, so it must tile data from L3 into L1-sized chunks. + # Create memory-aware platform with AIE core engines mem_platform = MemoryXDNA2Platform( memoryHierarchy = memory_hierarchy, defaultTargetMemoryLevel = l1_level, @@ -187,6 +184,7 @@ def generateNetworkXDNA2(args): with open(f'{args.dumpdir}/testinputs.h', 'w') as f: f.write(testInputStr) + # JUNGVI: TODO: Move this in ONNX4Deeploy # Recompute golden outputs from the actual BF16 inputs the hardware will # see. The original outputs.npz may have been computed in float32 # precision, which can differ by several BF16 ULPs. 
From a82fd526a15fe817dee6e27b4b5589096d6af03c Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 26 Mar 2026 11:49:34 +0100 Subject: [PATCH 16/16] Format --- DeeployTest/Platforms/XDNA2/main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 0cb5186f38..7984ef8130 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -122,7 +122,8 @@ int main(int argc, char **argv) { // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set // by generateNetwork_xdna2.py. // ----------------------------------------------------------------------- - // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs (with respect to the amount of bo available) + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs + // (with respect to the amount of bo available) static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, "Input 0 and input 1 must have the same number of elements"); static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0,