diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml index 14e80631d1..c642bfe6d2 100644 --- a/.github/workflows/_runner-chimera.yml +++ b/.github/workflows/_runner-chimera.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml index 3fbdf0ee16..c6be8af465 100644 --- a/.github/workflows/_runner-cortexm.yml +++ b/.github/workflows/_runner-cortexm.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml index a5c8b3ac98..6934014447 100644 --- a/.github/workflows/_runner-gap9-tiled.yml +++ b/.github/workflows/_runner-gap9-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml index e1d6e452a6..cc790d3d33 100644 --- a/.github/workflows/_runner-gap9.yml +++ b/.github/workflows/_runner-gap9.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml index 6681cbac96..b44b47f73d 100644 --- a/.github/workflows/_runner-generic.yml +++ b/.github/workflows/_runner-generic.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ 
inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-mempool.yml b/.github/workflows/_runner-mempool.yml index deb4809330..b2f0ae4f7a 100644 --- a/.github/workflows/_runner-mempool.yml +++ b/.github/workflows/_runner-mempool.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml index b1f5f2fcb3..664d5f01be 100644 --- a/.github/workflows/_runner-siracusa-neureka-tiled.yml +++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index ea9c8989af..cc09f234e0 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml index ea8fe5d405..1c51333f7a 100644 --- a/.github/workflows/_runner-siracusa.yml +++ b/.github/workflows/_runner-siracusa.yml @@ -25,6 +25,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git 
a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml index fbd5195b08..bcdd58a166 100644 --- a/.github/workflows/_runner-snitch-tiled-sequential.yml +++ b/.github/workflows/_runner-snitch-tiled-sequential.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml index bc599e4fe7..48130ea26a 100644 --- a/.github/workflows/_runner-snitch.yml +++ b/.github/workflows/_runner-snitch.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml index b067664f40..2624cbe15d 100644 --- a/.github/workflows/_runner-softhier.yml +++ b/.github/workflows/_runner-softhier.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml new file mode 100644 index 0000000000..2c08f1bf46 --- /dev/null +++ b/.github/workflows/_runner-xdna2.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-xdna2 + +"on": + workflow_call: + inputs: + pytest-marker: + required: true + type: string + docker-image: + required: false + type: string + default: "deeploy-xdna:local" + +jobs: + test-runner-xdna2: + runs-on: xdna2-npu + # NOTE: We cannot use the `container:` directive here because + # GitHub Actions does 
not support `--device` flags required for + # NPU access (/dev/accel/accel0). Instead we use explicit + # `docker run` commands. + steps: + - name: Fix workspace permissions + shell: bash + run: | + docker run --rm \ + -v "${{ github.workspace }}":/workspace \ + ${{ inputs.docker-image }} \ + chown -R $(id -u):$(id -g) /workspace || true + + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' + + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Run Tests in Docker + shell: bash + run: | + docker run --rm \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /opt/xilinx:/opt/xilinx \ + -v "${{ github.workspace }}":/app/Deeploy \ + -w /app/Deeploy \ + ${{ inputs.docker-image }} \ + bash -c " + pip install -e . && + cd DeeployTest && + pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' + " diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml index fc468306b1..84f2779e4c 100644 --- a/.github/workflows/ci-deeploy.yml +++ b/.github/workflows/ci-deeploy.yml @@ -35,6 +35,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: @@ -49,6 +51,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/ci-platform-xdna2.yml b/.github/workflows/ci-platform-xdna2.yml new file mode 100644 index 0000000000..ccf455edf7 --- /dev/null +++ b/.github/workflows/ci-platform-xdna2.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • XDNA2 + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + 
workflow_dispatch: + inputs: + docker_image: + description: "XDNA2 Docker image (must be pre-built on the runner)" + required: false + default: "deeploy-xdna:local" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + xdna2-kernels: + uses: ./.github/workflows/_runner-xdna2.yml + with: + pytest-marker: "kernels" + docker-image: ${{ inputs.docker_image || 'deeploy-xdna:local' }} diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml index 038789ce40..e6f382c5ca 100644 --- a/.github/workflows/infra-generate-ccache-gap9.yml +++ b/.github/workflows/infra-generate-ccache-gap9.yml @@ -23,6 +23,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:latest' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml index e4d00ea911..1fdba01512 100644 --- a/.github/workflows/infra-generate-ccache.yml +++ b/.github/workflows/infra-generate-ccache.yml @@ -22,6 +22,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.gitignore b/.gitignore index d9e4faace3..a9993aac54 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,7 @@ CHANGELOG_GEN.md # Container Artifacts .pyusbip/ .cache/ + +# Claude context file +CLAUDE.md +Container/xrt-debs/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..ffc4d64085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) 
message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL XDNA2) + message(STATUS "Building for platform 'XDNA2'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,20 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL XDNA2) + + project(${TESTNAME} LANGUAGES CXX) + + message(STATUS "============================= XDNA2 Configuration ============================") + message(STATUS "[cMake ] GENERATED_SOURCE = " ${GENERATED_SOURCE}) + message(STATUS "[cMake ] TESTNAME = " ${TESTNAME}) + message(STATUS "==============================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/XDNA2) + add_subdirectory(DeeployTest/Platforms/XDNA2) + +endif() + print_simulation_config() diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna new file mode 100644 index 0000000000..16907402df --- /dev/null +++ b/Container/Dockerfile.deeploy-xdna @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +ENV LLVM_INSTALL_DIR="nope" + +RUN apt-get update && apt-get install -y \ + software-properties-common \ + && add-apt-repository -y ppa:amd-team/xrt \ + && apt-get update && apt-get install -y \ + cmake \ + ninja-build \ + g++ \ + git \ + git-lfs \ + python3 \ + python3-pip \ + python-is-python3 \ + uuid-dev \ + wget \ + curl \ + ccache \ + libxrt2 \ + libxrt-npu2 \ + libxrt-dev \ + libxrt-utils \ + libxrt-utils-npu \ + && rm -rf /var/lib/apt/lists/* + +ENV XILINX_XRT=/opt/xilinx/xrt +ENV PATH=${XILINX_XRT}/bin:${PATH} +ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib + + +WORKDIR /app +COPY pyproject.toml requirements-xdna.txt ./ +RUN pip install toml-to-requirements && \ + toml-to-req --toml-file pyproject.toml && \ + pip 
install -r requirements.txt && \ + pip install -r requirements-xdna.txt && \ + rm -f requirements.txt pyproject.toml requirements-xdna.txt + +ENV MLIR_AIE_PYTHON=/usr/bin/python3 + +WORKDIR /app/Deeploy diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py new file mode 100644 index 0000000000..9fd92757a1 --- /dev/null +++ b/Deeploy/MLIRDataTypes.py @@ -0,0 +1,206 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Base classes for MLIR-emitting node templates and code transformations. + +This module provides: + +* :class:`MLIRNodeTemplate` — a :class:`NodeTemplate` subclass whose + ``emit()`` method populates an ``mlir.ir.Module`` instead of rendering C. +* :class:`MLIRExecutionBlock` — MLIR-specific execution state replacing the + C-oriented :class:`ExecutionBlock` (code-snippet deque) with MLIR builder + state (tile references, ObjectFifo handles, tiling parameters). +* :class:`MLIRCodeTransformationPass` — base class for MLIR code + transformation passes that operate on an :class:`MLIRExecutionBlock`. +* :class:`MLIRCodeTransformation` — two-phase pass container + (``devicePasses`` + ``runtimeSequencePasses``) that the deployer + orchestrates inside ``@aie_d.device`` and ``@aiex_d.runtime_sequence`` + regions respectively. + +All classes are intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse them. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from Deeploy.DeeployTypes import NodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation + +# ====================================================================== +# MLIRExecutionBlock +# ====================================================================== + + +class MLIRExecutionBlock: + """MLIR-specific execution state for a single operator. 
+ + Replaces the C-oriented :class:`ExecutionBlock` (which holds a deque of + :class:`CodeSnippet` objects) with fields that carry MLIR builder state + through the code-transformation pipeline. + + Passes populate fields progressively: + + 1. The deployer sets ``computeTile``, ``shimTile``, + ``operatorRepresentation``, and ``patternMemoryConstraint``. + 2. A device-phase pass (e.g. ``MLIRObjectFifoPass``) fills + ``fifoMap``, ``fifoTypes``, ``tileSize``, ``numTiles``, + ``kernelFuncName``, and ``kernelObjFile``. + 3. The deployer sets ``runtimeSequenceArgs`` before the runtime- + sequence phase. + 4. A runtime-sequence pass (e.g. ``MLIRRuntimeSequencePass``) reads + all of the above to emit DMA configuration. + """ + + def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: + # MLIR tile references (set by deployer) + self.computeTile: Any = computeTile + self.shimTile: Any = shimTile + + # Operator metadata (set by deployer from parser) + self.operatorRepresentation: OperatorRepresentation = {} + + # Tiling constraint from midend solver (may be None) + self.patternMemoryConstraint: Any = None + + # Populated by device-phase passes (e.g. 
MLIRObjectFifoPass) + self.fifoMap: Dict[str, str] = {} # tensor name → FIFO name + self.fifoTypes: Dict[str, Any] = {} # tensor name → MemRefType + self.tileSize: int = 0 + self.numTiles: int = 0 + self.numElements: int = 0 + self.kernelFuncName: Optional[str] = None + self.kernelObjFile: Optional[str] = None + + # The MLIRNodeTemplate for this node (set by deployer, called by + # MLIRComputeCorePass to emit the kernel call inside the core block) + self.template: Optional[Any] = None + + # Set by deployer before runtime-sequence phase + self.runtimeSequenceArgs: List[Any] = [] + + # Input / output tensor name lists (set by deployer from parser) + self.inputNames: List[str] = [] + self.outputNames: List[str] = [] + + # Trace support (populated by device-phase trace passes, read by + # runtime-sequence trace pass) + self.traceConfigs: List[str] = [] + self.traceBufferSize: int = 0 + + +# ====================================================================== +# MLIRCodeTransformationPass / MLIRCodeTransformation +# ====================================================================== + + +class MLIRCodeTransformationPass: + """Base class for passes that transform an :class:`MLIRExecutionBlock`. + + Subclasses override :meth:`apply` to read / mutate the block's fields + and optionally emit MLIR operations into the current insertion point. + """ + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + return ctxt, mlirBlock + + +class MLIRCodeTransformation: + """Two-phase pass container for MLIR code transformations. + + *devicePasses* run inside an ``@aie_d.device(...)`` region (ObjectFifo + creation, external-kernel declarations, …). + + *runtimeSequencePasses* run inside an ``@aiex_d.runtime_sequence`` + block (DMA configuration, token await, …). + + The deployer calls :meth:`applyDevicePasses` and + :meth:`applyRuntimeSequencePasses` at the appropriate points. 
+ """ + + def __init__(self, + devicePasses: Optional[List[MLIRCodeTransformationPass]] = None, + runtimeSequencePasses: Optional[List[MLIRCodeTransformationPass]] = None) -> None: + self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] + self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] + + def applyDevicePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.devicePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + def applyRuntimeSequencePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.runtimeSequencePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + +# ====================================================================== +# MLIRNodeTemplate +# ====================================================================== + + +class MLIRNodeTemplate(NodeTemplate): + """NodeTemplate subclass that emits MLIR instead of C code. + + Subclasses must override :meth:`emit` to add dialect operations to an + ``mlir.ir.Module`` (or region / insertion point provided via *kwargs*). + + ``generate()`` is overridden as a convenience that constructs a + standalone module, calls :meth:`emit`, and returns the MLIR text. + The base-class ``alignToContext`` / ``hoistTransientBuffers`` hooks are + retained and work unchanged. + """ + + def __init__(self): + # Empty Mako template — no C code is generated. + super().__init__("") + + # ------------------------------------------------------------------ + # Subclass API + # ------------------------------------------------------------------ + + @abstractmethod + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Populate an MLIR module with the operations for this node. 
+ + The caller (typically the deployer) sets up an ``mlir.ir.Module`` + with the appropriate device wrapper and passes dialect-specific + context through *kwargs* (e.g. insertion point, tile references, + ObjectFifo handles). + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + The parser's node representation (buffer names, sizes, types …). + **kwargs + Dialect-specific context provided by the deployer. + """ + ... + + # ------------------------------------------------------------------ + # NodeTemplate overrides + # ------------------------------------------------------------------ + + def generate(self, operatorRepresentation = {}, **kwargs) -> str: + """Generate an MLIR string for this node. + + This default implementation is a thin wrapper: it delegates to + :meth:`emit`. Deployers that need to build a single module from + multiple nodes should call :meth:`emit` directly with the shared + module context and then stringify the complete module themselves. + + Returns + ------- + str + MLIR text (printable module or fragment). 
+ """ + self.emit(operatorRepresentation, **kwargs) + return "" diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py new file mode 100644 index 0000000000..1f0e7f7587 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import MLIRComputeCorePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass +from Deeploy.Targets.XDNA2.Templates import AddTemplate +from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker + +_ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] +_ADD_OUTPUT_KEYS = ['data_out'] + +# JUNGVI: TODO: This logic should not be boiled down for 1 operator but should be applied on every nodes of the network +# Likewise the kernelName and object file name should be specified in the node template of each operator. 
+XDNA2Transformer = MLIRCodeTransformation( + devicePasses = [ + MLIRObjectFifoPass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + kernelFuncName = "eltwise_add_bf16_vector", + kernelObjFile = "add.o", + ), + MLIRComputeCorePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], + runtimeSequencePasses = [ + MLIRRuntimeSequencePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], +) + +XDNA2AddBindings = [ + NodeBinding( + XDNA2AddChecker([PointerClass(bfloat16_t), PointerClass(bfloat16_t)], [PointerClass(bfloat16_t)]), + AddTemplate.referenceTemplate, + XDNA2Transformer, + ) +] diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py new file mode 100644 index 0000000000..7d98b30d6e --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits the AIE core block with tiling loops. + +This pass constructs the structural MLIR around the compute kernel: + +1. Opens an ``@aie_d.core`` block linked to the kernel object file. +2. Opens an infinite outer ``scf.for`` loop (streaming). +3. Opens an inner ``scf.for`` tiling loop (``numTiles`` iterations). +4. Acquires input/output ObjectFifo elements. +5. Builds a modified ``operatorRepresentation`` where tensor keys + (e.g. ``data_in_1``) are replaced with the acquired MLIR memref + values and ``size`` is replaced with the tile size — mirroring + how ``TilingVariableReplacement`` rewrites buffer names for C + backends. +6. Calls ``template.emit(modifiedOpRepr)`` — the template only emits + its ``func_d.call`` using values from ``operatorRepresentation``. +7. Releases all FIFO elements and closes loops. 
+ +The pass is operator-agnostic: it only needs the tensor key lists and +reads everything else from the :class:`MLIRExecutionBlock` populated by +prior passes (e.g. :class:`MLIRObjectFifoPass`). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import scf as scf_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRComputeCorePass(MLIRCodeTransformationPass): + """Emit ``@aie_d.core`` with tiling loops and FIFO acquire/release. + + The template stored on ``mlirBlock.template`` is called inside the + inner loop with a *modified* ``operatorRepresentation`` whose tensor + entries point to acquired MLIR memref values instead of buffer name + strings. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + kernelObj = mlirBlock.kernelObjFile + tileSize = mlirBlock.tileSize + numTiles = mlirBlock.numTiles + opRepr = mlirBlock.operatorRepresentation + template = mlirBlock.template + + # Use the first tensor's type as representative tile memref type + firstKey = self.inputTensorKeys[0] + tileTy = mlirBlock.fifoTypes[firstKey] + + @aie_d.core(computeTile) + def _core(): + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, numTiles, 1): + # Acquire all input FIFO elements + acquiredElements = {} + for key in self.inputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) + + # Acquire all output FIFO elements + for key in self.outputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) + + # Build modified opRepr: replace tensor names with MLIR + # values, replace size with tile size. This mirrors the + # C backend's TilingVariableReplacement pass. 
+ modifiedOpRepr = {**opRepr, 'size': tileSize, **acquiredElements} + + # Call the template — it only emits func_d.call() + template.emit(modifiedOpRepr) + + # Release all inputs + for key in self.inputTensorKeys: + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + # Release all outputs + for key in self.outputTensorKeys: + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + + scf_d.yield_([]) + scf_d.yield_([]) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py new file mode 100644 index 0000000000..789a9a68cf --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits a core trace configuration. + +Emits an ``aie.trace`` block on the compute tile that captures core +instruction events, stall events, and port-monitoring events. The +configuration name is appended to ``mlirBlock.traceConfigs`` so that the +runtime-sequence trace pass can activate it. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional, Tuple + +from aie.dialects.aie import DMAChannelDir, TraceMode, TracePacketType, WireBundle, trace, trace_event, trace_mode, \ + trace_packet, trace_port, trace_start, trace_stop + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +_DEFAULT_CORE_EVENTS = [ + "INSTR_EVENT_0", + "INSTR_EVENT_1", + "INSTR_VECTOR", + "MEMORY_STALL", + "STREAM_STALL", + "LOCK_STALL", + "PORT_RUNNING_0", + "PORT_RUNNING_1", +] + +_DEFAULT_CORE_PORTS = [ + (0, WireBundle.DMA, 0, DMAChannelDir.S2MM), + (1, WireBundle.DMA, 0, DMAChannelDir.MM2S), +] + + +class MLIRCoreTracePass(MLIRCodeTransformationPass): + """Emit a core trace configuration on the compute tile. + + Parameters + ---------- + packetId : int + Trace packet ID (default 1). + events : list of str, optional + Event names to capture (max 8). Defaults to the reference set of + instruction / stall / port-running events. + ports : list of tuple, optional + ``(slot, WireBundle, channel, DMAChannelDir)`` tuples for + port-monitoring event slots. + startBroadcast : int + Broadcast channel that starts the trace (default 15). + stopBroadcast : int + Broadcast channel that stops the trace (default 14). 
+ """ + + def __init__( + self, + packetId: int = 1, + events: Optional[List[str]] = None, + ports: Optional[List[tuple]] = None, + startBroadcast: int = 15, + stopBroadcast: int = 14, + ) -> None: + self.packetId = packetId + self.events = events if events is not None else list(_DEFAULT_CORE_EVENTS) + self.ports = ports if ports is not None else list(_DEFAULT_CORE_PORTS) + self.startBroadcast = startBroadcast + self.stopBroadcast = stopBroadcast + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + configName = f"core_trace_{name}" + + @trace(computeTile, configName) + def _core_trace(): + trace_mode(TraceMode.EventTime) + trace_packet(self.packetId, TracePacketType.Core) + for event in self.events: + trace_event(event) + for slot, port, channel, direction in self.ports: + trace_port(slot, port, channel, direction) + trace_start(broadcast = self.startBroadcast) + trace_stop(broadcast = self.stopBroadcast) + + mlirBlock.traceConfigs.append(configName) + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py new file mode 100644 index 0000000000..3e2757c3cf --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits a memory trace configuration. + +Emits an ``aie.trace`` block on the compute tile that captures DMA +start/finish/starvation events from the memory module trace unit. The +configuration name is appended to ``mlirBlock.traceConfigs`` so that the +runtime-sequence trace pass can activate it. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional, Tuple + +from aie.dialects.aie import TracePacketType, trace, trace_event, trace_packet, trace_start, trace_stop + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +_DEFAULT_MEM_EVENTS = [ + "DMA_S2MM_0_START_TASK", + "DMA_S2MM_1_START_TASK", + "DMA_MM2S_0_START_TASK", + "DMA_S2MM_0_FINISHED_TASK", + "DMA_S2MM_1_FINISHED_TASK", + "DMA_MM2S_0_FINISHED_TASK", + "DMA_S2MM_0_STREAM_STARVATION", + "DMA_S2MM_1_STREAM_STARVATION", +] + + +class MLIRMemTracePass(MLIRCodeTransformationPass): + """Emit a memory trace configuration on the compute tile. + + Parameters + ---------- + packetId : int + Trace packet ID (default 3). + events : list of str, optional + Event names to capture (max 8). Defaults to DMA start / finish / + starvation events matching the reference example. + startEvent : str + Event that starts the trace (default ``"BROADCAST_15"``). + stopEvent : str + Event that stops the trace (default ``"BROADCAST_14"``). 
+ """ + + def __init__( + self, + packetId: int = 3, + events: Optional[List[str]] = None, + startEvent: str = "BROADCAST_15", + stopEvent: str = "BROADCAST_14", + ) -> None: + self.packetId = packetId + self.events = events if events is not None else list(_DEFAULT_MEM_EVENTS) + self.startEvent = startEvent + self.stopEvent = stopEvent + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + configName = f"mem_trace_{name}" + + @trace(computeTile, configName) + def _mem_trace(): + trace_packet(self.packetId, TracePacketType.Mem) + for event in self.events: + trace_event(event) + trace_start(event = self.startEvent) + trace_stop(event = self.stopEvent) + + mlirBlock.traceConfigs.append(configName) + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py new file mode 100644 index 0000000000..9d86f4d834 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that creates ObjectFifos and declares external kernels. + +Given an :class:`MLIRExecutionBlock` with ``computeTile``, ``shimTile``, +``operatorRepresentation``, and (optionally) ``patternMemoryConstraint``, +this pass: + +1. Derives ``tileSize`` and ``numTiles`` (from tiling solver or fallback). +2. Creates one ``aie_d.object_fifo`` per input tensor (shim → compute) + and one per output tensor (compute → shim), all with depth 2 + (double-buffering). +3. Declares the external kernel via ``aie_d.external_func``. +4. Stores FIFO names, types, and kernel metadata on the block for + downstream passes and the compute template. 
+ +The pass is operator-agnostic — it only needs the tensor names and a +tile-size derivation function. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import aie.ir as ir +import numpy as np +from aie.dialects import aie as aie_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +MAX_TILE_SIZE = 1024 + + +def _deriveTileSize(numElements: int, patternMemoryConstraint) -> int: + """Extract tile size from the tiling solution, or fall back to MAX_TILE_SIZE.""" + tileSize = min(numElements, MAX_TILE_SIZE) + + if patternMemoryConstraint is not None: + try: + nodeConstraint = patternMemoryConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tileSize = int(np.prod(l1Constraint.shape)) + except (AttributeError, IndexError, KeyError): + pass + + # Ensure tile_size evenly divides num_elements + if numElements % tileSize != 0: + tileSize = max(d for d in range(1, tileSize + 1) if numElements % d == 0) + + return tileSize + + +class MLIRObjectFifoPass(MLIRCodeTransformationPass): + """Create ObjectFifos and declare the external kernel. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors + (e.g. ``['data_in_1', 'data_in_2']``). + outputTensorKeys : list of str + Keys that name output tensors (e.g. ``['data_out']``). + kernelFuncName : str + Symbol name of the external AIE kernel function. + kernelObjFile : str + Object file to link with the AIE core (e.g. ``"add.o"``). 
+ kernelArgTypes : callable, optional + A callable ``(tile_memref_type) -> list[ir.Type]`` that returns + the kernel's argument types. Defaults to + ``[tile_ty, tile_ty, tile_ty, i32]`` (suitable for binary + elementwise ops). + fifoDepth : int + ObjectFifo depth (default 2 for double-buffering). + """ + + def __init__(self, + inputTensorKeys: list, + outputTensorKeys: list, + kernelFuncName: str, + kernelObjFile: str, + kernelArgTypes = None, + fifoDepth: int = 2) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + self.kernelFuncName = kernelFuncName + self.kernelObjFile = kernelObjFile + self._kernelArgTypes = kernelArgTypes + self.fifoDepth = fifoDepth + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + opRepr = mlirBlock.operatorRepresentation + numElements = int(opRepr['size']) + tileSize = _deriveTileSize(numElements, mlirBlock.patternMemoryConstraint) + numTiles = numElements // tileSize + + mlirBlock.tileSize = tileSize + mlirBlock.numTiles = numTiles + mlirBlock.numElements = numElements + mlirBlock.kernelFuncName = self.kernelFuncName + mlirBlock.kernelObjFile = self.kernelObjFile + + tileTy = ir.MemRefType.get((tileSize,), ir.BF16Type.get()) + computeTile = mlirBlock.computeTile + shimTile = mlirBlock.shimTile + + # Create input ObjectFifos (shim → compute) + for idx, key in enumerate(self.inputTensorKeys): + fifoName = f"in{idx + 1}_0" + aie_d.object_fifo(fifoName, shimTile, [computeTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Create output ObjectFifos (compute → shim) + for idx, key in enumerate(self.outputTensorKeys): + fifoName = f"out_{idx}" + aie_d.object_fifo(fifoName, computeTile, [shimTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Declare external kernel + i32 = ir.IntegerType.get_signless(32) + if 
self._kernelArgTypes is not None: + argTypes = self._kernelArgTypes(tileTy) + else: + # Default: binary elementwise (in1, in2, out, size) + argTypes = [tileTy, tileTy, tileTy, i32] + aie_d.external_func(self.kernelFuncName, argTypes, link_with = self.kernelObjFile) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py new file mode 100644 index 0000000000..6331bd0914 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that configures shim DMA for L3 ↔ L1 transfers. + +Given an :class:`MLIRExecutionBlock` whose device-phase passes have already +populated ``fifoMap``, ``numElements``, and ``runtimeSequenceArgs``, this +pass emits ``aiex_d.dma_configure_task_for`` / ``dma_start_task`` / +``dma_await_task`` / ``dma_free_task`` operations directly into the current +``@aiex_d.runtime_sequence`` insertion point. + +The pass is operator-agnostic — it iterates over the FIFO map and +runtime-sequence arguments to configure DMA for every input and output +tensor. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import aie.ir as ir +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRRuntimeSequencePass(MLIRCodeTransformationPass): + """Emit DMA configuration inside a ``runtime_sequence`` block. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + numElements = mlirBlock.numElements + seqArgs = mlirBlock.runtimeSequenceArgs + + dims = [ + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = numElements, stride = 1), + ] + + # Build ordered list of (fifoName, seqArg, isOutput) + transfers = [] + allKeys = self.inputTensorKeys + self.outputTensorKeys + for idx, key in enumerate(allKeys): + fifoName = mlirBlock.fifoMap[key] + isOutput = key in self.outputTensorKeys + transfers.append((fifoName, seqArgs[idx], isOutput)) + + inputTasks = [] + outputTasks = [] + + for fifoName, seqArg, isOutput in transfers: + if isOutput: + task = aiex_d.dma_configure_task_for(fifoName, issue_token = True) + else: + task = aiex_d.dma_configure_task_for(fifoName) + block = task.body.blocks.append() + with ir.InsertionPoint(block): + aie_d.dma_bd(seqArg, offset = 0, len = numElements, dimensions = dims, burst_length = 0) + aie_d.end() + aiex_d.dma_start_task(task) + + if isOutput: + outputTasks.append(task) + else: + inputTasks.append(task) + + # Await output tasks, then free input tasks + for task in outputTasks: + aiex_d.dma_await_task(task) + for task in inputTasks: + aiex_d.dma_free_task(task) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py new file mode 100644 index 0000000000..b429c00eb2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# 
SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that activates trace configurations. + +Emits ``aie.trace.host_config`` to set up the host-side trace buffer and +``aie.trace.start_config`` for each trace configuration registered by +device-phase passes on the :class:`MLIRExecutionBlock`. + +This pass must run **before** the DMA configuration pass +(:class:`MLIRRuntimeSequencePass`) inside the ``runtime_sequence`` block. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +from aie.dialects.aie import trace_host_config, trace_start_config + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRTraceRuntimePass(MLIRCodeTransformationPass): + """Emit trace host configuration and activate trace configs. + + Reads ``mlirBlock.traceConfigs`` (populated by device-phase trace + passes such as :class:`MLIRCoreTracePass` / :class:`MLIRMemTracePass`) + and ``mlirBlock.traceBufferSize`` (set by the deployer). If there are + no trace configs, this pass is a no-op. 
+ """ + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + + if not mlirBlock.traceConfigs: + return ctxt, mlirBlock + + trace_host_config(buffer_size = mlirBlock.traceBufferSize) + + for configName in mlirBlock.traceConfigs: + trace_start_config(configName) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000000..85db5baffa --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRCoreTracePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRMemTracePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRTraceRuntimePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py new file mode 100644 index 0000000000..565749c5d2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -0,0 +1,202 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 deployer — generates mlir-aie MLIR using ``aie.dialects``. + +Unlike other Deeploy deployers that generate C code via Mako templates, +this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations +and returns the verified MLIR text. + +MLIR generation is split into two phases orchestrated by +:class:`MLIRCodeTransformation`: + +1. 
**Device phase** — inside ``@aie_d.device(npu2)``: for each operator, + run ``devicePasses`` (ObjectFifo creation, external-kernel + declaration) then call ``template.emit()`` (compute core only). +2. **Runtime-sequence phase** — inside ``@aiex_d.runtime_sequence``: + for each operator, run ``runtimeSequencePasses`` (DMA configuration). +""" + +from __future__ import annotations + +import copy +from typing import Callable, Dict, Optional, Type + +import aie.ir as ir +import onnx_graphsurgeon as gs +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +from aie.extras.context import mlir_mod_ctx + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MLIRDataTypes import MLIRCodeTransformation, MLIRExecutionBlock, MLIRNodeTemplate +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRCoreTracePass import MLIRCoreTracePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRMemTracePass import MLIRMemTracePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRTraceRuntimePass import MLIRTraceRuntimePass + + +class XDNA2Deployer(SignPropDeployer): + """Deployer for the XDNA2 (AIE2p) platform. + + Generates an mlir-aie MLIR module via two-phase code transformation: + + * **Device phase**: ``MLIRObjectFifoPass`` creates ObjectFifos and + declares external kernels; the bound ``MLIRNodeTemplate`` emits + the compute core. + * **Runtime-sequence phase**: ``MLIRRuntimeSequencePass`` configures + shim DMA for L3 ↔ L1 transfers. + + The module is verified via MLIR's built-in verifier before being + returned as a string. 
+ """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first: bool = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Optional[Dict[str, int]] = None, + enableTrace: bool = False, + traceBufferSize: int = 8192): + super().__init__( + graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets if inputOffsets is not None else {}, + ) + self.enableTrace = enableTrace + self.traceBufferSize = traceBufferSize + + # ------------------------------------------------------------------ + # MLIR generation + # ------------------------------------------------------------------ + + def generateMLIR(self) -> str: + """Generate an mlir-aie MLIR module for the prepared graph. + + Iterates over bound layers in two phases: + + 1. **Device phase** — for each node, creates an + :class:`MLIRExecutionBlock`, runs device-phase code- + transformation passes (ObjectFifo creation, kernel + declaration), then calls ``template.emit()`` (compute core). + 2. **Runtime-sequence phase** — opens an + ``@aiex_d.runtime_sequence`` block, sets + ``runtimeSequenceArgs`` on each block, then runs + runtime-sequence passes (DMA configuration). + + Returns + ------- + str + Verified MLIR module string. 
+ """ + assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" + + # Collect per-node info from the bound layers + nodes = [] + for nodeName, layer in self.layerBinding.items(): + mapper = layer.mapper + binder = mapper.binder + template = binder.template + opRepr = mapper.parser.operatorRepresentation + codeTransformer = binder.codeTransformer + + # Tiling constraint from the midend solver (may be None) + executionBlock = binder.executionBlock + tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) + + if not isinstance(template, MLIRNodeTemplate): + raise RuntimeError(f"Node '{nodeName}' has no MLIRNodeTemplate — " + f"only BF16 Add is supported in this release.") + if not isinstance(codeTransformer, MLIRCodeTransformation): + raise RuntimeError(f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") + + # When tracing is enabled, shallow-copy the code transformer + # to inject trace passes without mutating the shared singleton + # (XDNA2Transformer in Bindings.py). 
+ if self.enableTrace: + codeTransformer = copy.copy(codeTransformer) + codeTransformer.devicePasses = list(codeTransformer.devicePasses) + [ + MLIRCoreTracePass(), + MLIRMemTracePass(), + ] + codeTransformer.runtimeSequencePasses = [ + MLIRTraceRuntimePass(), + ] + list(codeTransformer.runtimeSequencePasses) + + nodes.append({ + 'nodeName': nodeName, + 'template': template, + 'opRepr': opRepr, + 'codeTransformer': codeTransformer, + 'tilingConstraint': tilingConstraint, + }) + + if not nodes: + raise RuntimeError("No bound layers found — cannot generate MLIR.") + + # Build the MLIR module + mlirBlocks = [] + + with mlir_mod_ctx() as ctx: + + @aie_d.device(aie_d.AIEDevice.npu2) + def _device(): + computeTile = aie_d.tile(0, 2) # TODO: generalize to full array + shimTile = aie_d.tile(0, 0) + + # === Device phase === + for node in nodes: + # Create MLIRExecutionBlock with deployer-level state + eb = MLIRExecutionBlock(computeTile = computeTile, shimTile = shimTile) + eb.operatorRepresentation = node['opRepr'] + eb.patternMemoryConstraint = node['tilingConstraint'] + eb.template = node['template'] + if self.enableTrace: + eb.traceBufferSize = self.traceBufferSize + + log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + + (" (tiled)" if node['tilingConstraint'] else "")) + + # Run device-phase passes: + # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel + # 2. 
MLIRComputeCorePass — opens core + loops, calls + # template.emit() with acquired FIFO elements in opRepr + self.ctxt, eb = node['codeTransformer'].applyDevicePasses(self.ctxt, eb, node['nodeName']) + + mlirBlocks.append((node, eb)) + + # === Runtime-sequence phase === + # Derive tensor type from the first node's numElements + _, firstEb = mlirBlocks[0] + numElements = firstEb.numElements + tensorTy = ir.MemRefType.get((numElements,), ir.BF16Type.get()) + + @aiex_d.runtime_sequence(tensorTy, tensorTy, tensorTy) + def _seq(*args): + for node, eb in mlirBlocks: + eb.runtimeSequenceArgs = list(args) + log.info(f"[XDNA2] Runtime-sequence phase for '{node['nodeName']}'") + self.ctxt, eb = node['codeTransformer'].applyRuntimeSequencePasses( + self.ctxt, eb, node['nodeName']) + + module = ctx.module + assert module.operation.verify(), \ + "[XDNA2] Generated MLIR module failed verification" + + mlirStr = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlirStr)} bytes)") + return mlirStr diff --git a/Deeploy/Targets/XDNA2/Parsers.py b/Deeploy/Targets/XDNA2/Parsers.py new file mode 100644 index 0000000000..c665312dbd --- /dev/null +++ b/Deeploy/Targets/XDNA2/Parsers.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# XDNA2 reuses the Generic AddParser (see Platform.py). +# Add any XDNA2-specific parsers here as the platform grows. 
diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py new file mode 100644 index 0000000000..b54ce8acb9 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -0,0 +1,157 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ + NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.Generic.Layers import AddLayer +from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings + +# Standard mapper for non-tiled deployment +XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) + +# Tiling-ready mapper for tiled deployment +XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) + +# Standard mapping (used when tiling is disabled) +XDNA2Mapping = { + 'Add': AddLayer([XDNA2AddMapper]), +} + +# Tiling-ready mapping (used when tiling is enabled) +XDNA2TilingMapping = { + 'Add': AddLayer([XDNA2AddTilableMapper]), +} + +# Buffer classes reuse Generic templates since XDNA2Deployer manages its own +# output format (MLIR + test headers) and these templates are never rendered. 
+ + +class XDNA2VariableBuffer(VariableBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2TransientBuffer(TransientBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2ConstantBuffer(ConstantBuffer): + initTemplate = AllocateTemplate.referenceGlobalInitTemplate + allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate + deallocTemplate = FreeTemplate.referenceGlobalTemplate + + +class XDNA2StructBuffer(StructBuffer): + initTemplate = AllocateTemplate.referenceStructInitTemplate + allocTemplate = AllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +# No topology optimization passes needed for the initial Add-only platform. +XDNA2Optimizer = TopologyOptimizer([], name = "XDNA2Optimizer") + + +class XDNA2Engine(DeploymentEngine): + + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", includeList = None) -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + + +class XDNA2AIECoreEngine(DeploymentEngine): + """AIE core execution engine with L1 local memory as preferred memory level. + + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. + Data is transferred from L3 (shared memory) to L1 as needed. 
+ """ + + def __init__(self, + name: str = "XDNA2_AIE_Core", + Mapping = XDNA2Mapping, + initCode: str = "", + includeList = None, + preferredMemoryLevel: str = "L1") -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + self.preferredMemoryLevel = preferredMemoryLevel + + +class XDNA2Platform(DeploymentPlatform): + + def __init__(self, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer): + if engines is None: + engines = [XDNA2Engine()] + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryXDNA2Platform(MemoryPlatform): + """XDNA2 platform with memory hierarchy support for tiling. + + Defines the memory hierarchy: + - L1: 8KB per AIE core (local memory) + - L3: Shared memory for entire AIE array + """ + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer) -> None: + if engines is None: + engines = [XDNA2AIECoreEngine()] + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node. + + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). + Otherwise use the default target memory level (typically L3). 
+ """ + # Check if node has an engine assignment + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name + + +class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): + """Wrapper for XDNA2Platform with memory-level support.""" + + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel): + assert isinstance(platform, XDNA2Platform), \ + f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node.""" + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py new file mode 100644 index 0000000000..6c526a9e38 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 MLIR template for BF16 elementwise Add — pure compute primitive. + +This template emits **only** a ``func_d.call`` to the vectorised +``eltwise_add_bf16_vector`` kernel. It receives its operands (acquired +ObjectFifo element memrefs) and tile size through +``operatorRepresentation``, exactly like a C Mako template receives +buffer-name strings. 
+ +All structural MLIR (``@aie_d.core``, loops, FIFO acquire/release, +ObjectFifo creation, DMA configuration) is handled by +:class:`MLIRCodeTransformationPass` instances upstream. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import aie.ir as ir +from aie.dialects import arith as arith_d +from aie.dialects import func as func_d + +from Deeploy.MLIRDataTypes import MLIRNodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import OperatorRepresentation + + +class XDNA2AddTemplate(MLIRNodeTemplate): + """Pure compute-primitive for BF16 elementwise Add on XDNA2. + + ``emit()`` is called by :class:`MLIRComputeCorePass` inside an + already-open ``@aie_d.core`` + tiling-loop context, with + ``operatorRepresentation`` entries replaced by live MLIR values: + + * ``data_in_1``, ``data_in_2``, ``data_out`` — acquired memref + elements (from ObjectFifo acquire). + * ``size`` — tile size (Python int). + """ + + KERNEL_FN = "eltwise_add_bf16_vector" + + def __init__(self): + super().__init__() + + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Emit a single ``func.call`` to the vectorised Add kernel.""" + i32 = ir.IntegerType.get_signless(32) + sizeVal = arith_d.constant(i32, int(operatorRepresentation['size'])) + func_d.call([], self.KERNEL_FN, [ + operatorRepresentation['data_in_1'], + operatorRepresentation['data_in_2'], + operatorRepresentation['data_out'], + sizeVal, + ]) + + +referenceTemplate = XDNA2AddTemplate() diff --git a/Deeploy/Targets/XDNA2/Templates/__init__.py b/Deeploy/Targets/XDNA2/Templates/__init__.py new file mode 100644 index 0000000000..4694b67df5 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py new file mode 100644 index 0000000000..b2282c34b0 --- 
/dev/null +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" + +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +# For Add operator, reuse the generic BOP (Binary Operator) tile constraint +# which handles equal-dimension binary operations +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = XDNA2AddBindings, + tileConstraint = AddTileConstraint()) diff --git a/Deeploy/Targets/XDNA2/TypeCheckers.py b/Deeploy/Targets/XDNA2/TypeCheckers.py new file mode 100644 index 0000000000..cb9c98fd39 --- /dev/null +++ b/Deeploy/Targets/XDNA2/TypeCheckers.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, Sequence, Type + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer + + +class XDNA2AddChecker(SignPropTypeChecker): + """Type checker for BF16 elementwise Add on XDNA2. + + Both inputs and the output are bfloat16_t pointers. + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: + # Float types do not have a meaningful nLevels — return 1 as a neutral value. 
+ return [1] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: + # BF16 is a signed floating-point type. + return [True] diff --git a/DeeployTest/Platforms/XDNA2/CMakeLists.txt b/DeeployTest/Platforms/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..b96bb5d092 --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/CMakeLists.txt @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) testbench CMake configuration +# +# Included via add_subdirectory() by the top-level CMakeLists.txt when +# -Dplatform=XDNA2 +# is passed. It orchestrates two build steps: +# +# 1. Compile network.mlir to network.xclbin + npu_insts.bin with aiecc.py. +# 2. Compile the XRT host binary (main.cpp) with the system compiler. +# +# AIE kernel compilation is handled by TargetLibraries/XDNA2/CMakeLists.txt. 
+# +# Required variables (set via environment or CMake cache): +# MLIR_AIE_INSTALL_DIR – path to the mlir-aie installation +# (auto-resolved from aie.utils.config or env) +# LLVM_AIE_INSTALL_DIR – path to the llvm-aie installation +# (auto-resolved from aie.utils.config or env) +# XRT_INSTALL_DIR – path to the XRT installation +# (default: $ENV{XILINX_XRT} or /opt/xilinx/xrt) +# GENERATED_SOURCE – directory containing network.mlir, testinputs.h, testoutputs.h +# (set by the Deeploy test runner) +# TESTNAME – name of the test target (set by the Deeploy test runner) +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Resolve toolchain and runtime paths +# --------------------------------------------------------------------------- +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- llvm-aie (Peano) install dir (needed for --peano flag) --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Set LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") + endif() +endif() + +# --- mlir-aie install dir (needed for aiecc.py) --- +set(MLIR_AIE_INSTALL_DIR "$ENV{MLIR_AIE_INSTALL_DIR}" CACHE PATH "mlir-aie install dir") +if(NOT MLIR_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.root_path());" + OUTPUT_VARIABLE MLIR_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT MLIR_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find mlir-aie install dir. 
" + "Set MLIR_AIE_INSTALL_DIR or install the mlir-aie wheel.") + endif() +endif() + +# --- XRT install dir --- +if(NOT XRT_INSTALL_DIR) + if(DEFINED ENV{XILINX_XRT}) + set(XRT_INSTALL_DIR $ENV{XILINX_XRT}) + else() + set(XRT_INSTALL_DIR "/opt/xilinx/xrt") + endif() +endif() + +set(AIECC_PY "${MLIR_AIE_INSTALL_DIR}/bin/aiecc.py") + +# Deeploy-generated sources +set(NETWORK_MLIR "${GENERATED_SOURCE}/network.mlir") + +message(STATUS "[XDNA2] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] MLIR_AIE_INSTALL_DIR = ${MLIR_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] XRT_INSTALL_DIR = ${XRT_INSTALL_DIR}") +message(STATUS "[XDNA2] GENERATED_SOURCE = ${GENERATED_SOURCE}") +message(STATUS "[XDNA2] TESTNAME = ${TESTNAME}") + +# --------------------------------------------------------------------------- +# Step 1: Compile MLIR -> xclbin + npu_insts.bin +# --------------------------------------------------------------------------- +set(XCLBIN "${CMAKE_CURRENT_BINARY_DIR}/network.xclbin") +set(NPU_INSTS "${CMAKE_CURRENT_BINARY_DIR}/npu_insts.bin") + +add_custom_command( + OUTPUT "${XCLBIN}" "${NPU_INSTS}" + # Copy kernel objects into aiecc.py working dir so the linker scripts + # generated by aiecc.py can find them via INPUT(kernel.o). 
+ COMMAND ${CMAKE_COMMAND} -E copy ${XDNA2_KERNEL_OBJECTS} "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${CMAKE_COMMAND} -E env + "PATH=${MLIR_AIE_INSTALL_DIR}/bin:$ENV{PATH}" + "python" "${AIECC_PY}" + --no-aiesim + --no-xchesscc + --no-xbridge + --peano "${LLVM_AIE_INSTALL_DIR}" + --aie-generate-cdo + --aie-generate-npu-insts + --npu-insts-name npu_insts.bin + --aie-generate-xclbin + --dump-intermediates + --xclbin-kernel-name=MLIR_AIE + --xclbin-name network.xclbin + "${NETWORK_MLIR}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS "${NETWORK_MLIR}" ${XDNA2_KERNEL_OBJECTS} xdna2_kernels + COMMENT "[XDNA2] Compiling MLIR -> network.xclbin + npu_insts.bin" + VERBATIM +) +add_custom_target(xdna2_xclbin DEPENDS "${XCLBIN}" "${NPU_INSTS}") + +# --------------------------------------------------------------------------- +# Step 2: Compile XRT host binary +# --------------------------------------------------------------------------- +add_executable("${TESTNAME}" + "${CMAKE_CURRENT_LIST_DIR}/main.cpp" +) + +target_include_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/include" + "${GENERATED_SOURCE}" +) + +target_link_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/lib" +) + +target_link_libraries("${TESTNAME}" PRIVATE + xrt_coreutil + uuid + dl + pthread +) + +target_compile_features("${TESTNAME}" PRIVATE cxx_std_17) + +# The xclbin and npu_insts must be available at runtime in the same directory +# as the binary. Add a post-build step to copy them. 
+add_custom_command(TARGET "${TESTNAME}" POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${XCLBIN}" "$<TARGET_FILE_DIR:${TESTNAME}>/network.xclbin" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${NPU_INSTS}" "$<TARGET_FILE_DIR:${TESTNAME}>/npu_insts.bin" + COMMENT "[XDNA2] Copying xclbin and npu_insts to binary directory" +) + +add_dependencies("${TESTNAME}" xdna2_xclbin) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp new file mode 100644 index 0000000000..7fa3c4aefc --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -0,0 +1,255 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +// XRT C++ testbench for the XDNA2 (AIE2p) platform. +// Loads network.xclbin produced by aiecc.py, runs the MLIR_AIE kernel, +// reads back outputs and compares against golden reference values. +// Output format: "Errors: X out of Y" (required by output_parser.py). + +#include <cmath> +#include <cstdint> +#include <cstring> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <stdexcept> +#include <string> +#include <vector> + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_hw_context.h" +#include "xrt/xrt_kernel.h" + +// Generated by Deeploy's generateNetwork_xdna2.py: +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} +// defines testoutputs.h – uint16_t arrays of BF16 bit patterns + +// N_ELEMENTS_OUTPUT{i} defines +#include "testinputs.h" +#include "testoutputs.h" + +// --------------------------------------------------------------------------- +// BF16 helpers +// --------------------------------------------------------------------------- +static float bf16_to_float(uint16_t bf16) { + uint32_t f32_bits = static_cast<uint32_t>(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; +} + +static bool bf16_nearly_equal(uint16_t a, uint16_t b, float rtol = 0.0f, + float atol = 0.0f) { + // Default: allow 1 BF16 ULP difference to account for hardware rounding. 
+ // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, + static_cast<int>(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; +} + +// --------------------------------------------------------------------------- +// Read the NPU instruction binary produced by aiecc.py +// --------------------------------------------------------------------------- +static std::vector<uint32_t> read_instr_binary(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector<uint32_t> instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast<char *>(instr.data()), byte_size); + return instr; +} + +int main(int argc, char **argv) { + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." 
: argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; + } + } + if (argc >= 2 && argv[1][0] != '-') + xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') + instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector<uint32_t> instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. 
+ // ----------------------------------------------------------------------- + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs + // (with respect to the amount of bo available) + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, + // 3:in0, 4:in1, 5:out, 6:ctrlpkts, 7:trace) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // Control packets buffer (required by the kernel ABI) + auto bo_ctrlpkts = + xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6)); + + // Trace buffer: allocated at 4x the requested size (hardware requirement). + // When TRACE_BUFFER_SIZE == 0 (no tracing), allocate a minimal 1-byte + // placeholder so the kernel call signature stays the same. + constexpr size_t trace_alloc = + TRACE_BUFFER_SIZE > 0 ? 
TRACE_BUFFER_SIZE * 4 : 1; + auto bo_trace = + xrt::bo(device, trace_alloc, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7)); + + // Zero-initialise trace buffer + if constexpr (TRACE_BUFFER_SIZE > 0) { + std::memset(bo_trace.map(), 0, trace_alloc); + bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), + n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + // JUNGVI: TODO: Collect runtime and display it + // JUNGVI: TODO: Enable warmup iterations + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast<uint32_t>(n_instr), bo_in0, + bo_in1, bo_out, bo_ctrlpkts, bo_trace); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map<uint16_t *>(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) << " (0x" << std::hex + << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) << " (0x" + << std::hex << golden_out[i] << std::dec << ")" + << " diff=" + << std::fabs(bf16_to_float(hw_out[i]) - + bf16_to_float(golden_out[i])) + << "\n"; + } + } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") << "\n"; + } + } + + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + + // ----------------------------------------------------------------------- + // 8. 
Read back trace data and write to trace.txt + // ----------------------------------------------------------------------- + if constexpr (TRACE_BUFFER_SIZE > 0) { + bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint32_t *trace_data = bo_trace.map<uint32_t *>(); + size_t trace_words = TRACE_BUFFER_SIZE / sizeof(uint32_t); + + std::string trace_path = bin_dir + "/trace.txt"; + std::ofstream trace_file(trace_path); + if (trace_file.is_open()) { + for (size_t i = 0; i < trace_words; ++i) { + if (trace_data[i] != 0) { + trace_file << std::hex << std::setfill('0') << std::setw(8) + << trace_data[i] << "\n"; + } + } + trace_file.close(); + std::cout << "Trace written to " << trace_path << "\n"; + } else { + std::cerr << "Warning: could not open " << trace_path << " for writing\n"; + } + } + + return (errors == 0) ? 0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz new file mode 100644 index 0000000000..816afb2bc8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx b/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx new file mode 100644 index 0000000000..acefe12a69 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz new file mode 100644 index 0000000000..1304bf2845 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz differ diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index c7077067d9..e37ddcf99b 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -68,6 +68,7 @@ def pytest_configure(config: pytest.Config) -> None: "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 
platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") + config.addinivalue_line("markers", "xdna2: mark test as an XDNA2 (AIE2p) platform test") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") config.addinivalue_line("markers", "models: mark test as a model test (full networks)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py new file mode 100644 index 0000000000..a8f6b78694 --- /dev/null +++ b/DeeployTest/deeployRunner_xdna2.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. + +Usage (from DeeployTest/): + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular [--skipsim] [-v] + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular --trace [--trace-buffer-size 16384] +""" + +import json +import os +import sys +from glob import glob + +from testUtils.deeployRunner import main + + +def _add_xdna2_args(parser): + """Register XDNA2-specific CLI arguments.""" + parser.add_argument('--trace', + action = 'store_true', + default = False, + help = 'Enable execution tracing in the generated MLIR') + parser.add_argument('--trace-buffer-size', + type = int, + default = 8192, + help = 'Trace buffer size in bytes (default: 8192)') + + +def _add_xdna2_gen_args(args, gen_args_list): + """Forward XDNA2-specific arguments to the generation script.""" + if getattr(args, 'trace', False): + gen_args_list.append('--trace') + trace_buffer_size = getattr(args, 'trace_buffer_size', 8192) + if trace_buffer_size != 8192: + gen_args_list.append(f'--trace-buffer-size={trace_buffer_size}') + + +def _xdna2_post_sim(config, result, args): + """Parse trace.txt 
into a Perfetto-compatible trace.json after simulation.""" + if not getattr(args, 'trace', False): + return + + build_dir = config.build_dir + trace_txt = os.path.join(build_dir, "bin", "trace.txt") + if not os.path.isfile(trace_txt): + print(f"Warning: --trace enabled but {trace_txt} not found; skipping trace parsing.") + return + + # Find the MLIR with lowered NpuWrite32 ops (trace event register config). + # aiecc.py produces this when invoked with --dump-intermediates. + prj_pattern = os.path.join(build_dir, "DeeployTest", "Platforms", "XDNA2", "network.mlir.prj", + "main_physical_with_elfs.mlir") + candidates = glob(prj_pattern) + if not candidates: + print(f"Warning: lowered MLIR not found at {prj_pattern}; skipping trace parsing.") + return + lowered_mlir = candidates[0] + + trace_json = os.path.join(build_dir, "bin", "trace.json") + + try: + from aie.utils.trace.parse import align_column_start_index, check_for_valid_trace, convert_commands_to_json, \ + convert_to_byte_stream, convert_to_commands, parse_mlir_trace_events, setup_trace_metadata, \ + trace_pkts_de_interleave, trim_trace_pkts + + with open(trace_txt, "r") as f: + trace_pkts = f.read().split("\n") + + with open(lowered_mlir, "r") as f: + mlir_str = f.read() + + pid_events, events_module = parse_mlir_trace_events(mlir_str) + + if not check_for_valid_trace(trace_txt, trace_pkts): + print(f"Warning: trace data in {trace_txt} appears invalid; skipping trace parsing.") + return + + trimmed = trim_trace_pkts(trace_pkts) + sorted_pkts = trace_pkts_de_interleave(trimmed) + byte_streams = convert_to_byte_stream(sorted_pkts) + commands = convert_to_commands(byte_streams, False) + + pid_events = align_column_start_index(pid_events, commands) + + trace_events = [] + setup_trace_metadata(trace_events, pid_events, events_module) + convert_commands_to_json(trace_events, commands, pid_events, events_module) + + with open(trace_json, "w") as f: + json.dump(trace_events, f) + + print(f"Trace parsed: {trace_json} 
({len(trace_events)} events)") + + except SystemExit: + print(f"Warning: trace parsing failed (mlir-aie parser error). " + f"Ensure the build was done with --trace enabled.") + except Exception as e: + print(f"Warning: trace parsing failed: {e}") + + +if __name__ == '__main__': + sys.exit( + main(default_platform = "XDNA2", + default_simulator = "host", + tiling_enabled = True, + parser_setup_callback = _add_xdna2_args, + gen_args_callback = _add_xdna2_gen_args, + post_sim_callback = _xdna2_post_sim)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py new file mode 100644 index 0000000000..789d37716c --- /dev/null +++ b/DeeployTest/generateNetwork_xdna2.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 network generation script. + +Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. +Instead of emitting C code it: + +1. Loads the ONNX model and npz test-data. +2. Prepares the XDNA2Deployer (type checking + graph binding). +3. Emits ``testinputs.h`` and ``testoutputs.h`` with raw BF16 uint16_t arrays. +4. Calls ``deployer.generateMLIR()`` and writes ``network.mlir``. 
+""" + +import os + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.platformMapping import mapDeployer +from testUtils.testRunner import TestGeneratorArgumentParser + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, XDNA2AIECoreEngine, XDNA2TilingMapping +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def _tilingScheduler(graph: gs.Graph): + return [[node] for node in graph.nodes] + + +def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: + """Convert a float32 numpy array to an array of BF16 bit patterns (uint16_t). + + Uses round-to-nearest-even (the standard IEEE 754 rounding mode). + """ + f32 = arr.astype(np.float32) + raw = f32.view(np.uint32) + # Standard round-to-nearest-even: add 0x7FFF + BF16_LSB to the full word, + # then truncate. The 0x7FFF biases values just below the midpoint to + # round down, while adding the BF16 LSB provides tie-breaking to even. 
+ bf16_lsb = (raw >> 16) & 1 + raw = raw + np.uint32(0x7FFF) + bf16_lsb + bf16 = (raw >> 16).astype(np.uint16) + return bf16 + + +def _bf16_to_float32(bf16: np.ndarray) -> np.ndarray: + """Convert an array of BF16 uint16 bit patterns back to float32.""" + f32_bits = bf16.astype(np.uint32) << 16 + return f32_bits.view(np.float32) + + +def _generate_xdna2_inputs_header(input_arrays: list) -> str: + """Generate testinputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include <stdint.h>") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(input_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testInputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_INPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testInputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def _generate_xdna2_outputs_header(output_arrays: list) -> str: + """Generate testoutputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include <stdint.h>") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(output_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = 
f"testOutputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_OUTPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testOutputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def generateNetworkXDNA2(args): + log.debug("Arguments: %s", args) + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs_npz = np.load(f'{args.dir}/inputs.npz') + outputs_npz = np.load(f'{args.dir}/outputs.npz') + + test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] + test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] + + inputTypes = {} + inputOffsets = {} + + for index, (name, values) in enumerate(zip(inputs_npz.files, test_inputs_f32)): + if np.prod(values.shape) == 0: + continue + # Force bfloat16_t — BF16 test data stored as float32 in npz would be + # inferred as float32_t by minimalFloatType, but the XDNA2 kernel + # requires bfloat16_t inputs. + # JUNGVI: TODO: Align minimalFloatType to properly handle bf16 and don't force types. 
+ inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) + inputOffsets[f"input_{index}"] = 0 + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + # JUNGVI: TODO: Extend with the whole NPU array + # Define memory hierarchy: L1 (AIE core local) and L3 (shared) + l1_size = int(getattr(args, 'l1', None) or 64000) # 64KB default + l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default + + log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") + + l1_level = MemoryLevel("L1", neighbourNames = ["L3"], size = l1_size) + l3_level = MemoryLevel("L3", neighbourNames = ["L1"], size = l3_size) + memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) + memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 + + # Create memory-aware platform with AIE core engines + mem_platform = MemoryXDNA2Platform( + memoryHierarchy = memory_hierarchy, + defaultTargetMemoryLevel = l1_level, + engines = [XDNA2AIECoreEngine(Mapping = XDNA2TilingMapping, preferredMemoryLevel = "L1")]) + + # Create base deployer with memory platform + deployer = mapDeployer(mem_platform, + graph, + inputTypes, + scheduler = _tilingScheduler, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + # Wrap with MemoryDeployerWrapper (adds memory level annotation) + deployer = MemoryDeployerWrapper(deployer) + + # Wrap with TilerDeployerWrapper (adds tiling) + deployer = TilerDeployerWrapper(deployer, workDir = _DEEPLOYSTATEDIR) + + # Enable tracing if requested + enableTrace = getattr(args, 'trace', False) + if enableTrace: + traceBufferSize = int(getattr(args, 'trace_buffer_size', None) or 8192) + deployer.enableTrace = True + deployer.traceBufferSize = traceBufferSize + log.info(f"[XDNA2] Tracing enabled (buffer_size={traceBufferSize})") + + # frontEnd() parses the graph; bind() triggers tiling via wrappers + deployer.frontEnd() + deployer.bind() + deployer.prepared = True + log.info("[XDNA2] Tiling completed, 
proceeding with MLIR generation") + + # Create output directory + os.makedirs(args.dumpdir, exist_ok = True) + + # Write testinputs.h (raw BF16 bit patterns as uint16_t) + testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) + # Append trace buffer size define so the host binary knows whether to + # allocate a trace buffer and how large it should be. + if enableTrace: + testInputStr += f"#define TRACE_BUFFER_SIZE {traceBufferSize}u\n" + else: + testInputStr += "#define TRACE_BUFFER_SIZE 0u\n" + with open(f'{args.dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # JUNGVI: TODO: Move this in ONNX4Deeploy + # Recompute golden outputs from the actual BF16 inputs the hardware will + # see. The original outputs.npz may have been computed in float32 + # precision, which can differ by several BF16 ULPs. + bf16_inputs = [_float32_to_bf16_uint16(a.flatten()) for a in test_inputs_f32] + bf16_input_f32 = [_bf16_to_float32(b) for b in bf16_inputs] + golden_f32 = bf16_input_f32[0] + for inp in bf16_input_f32[1:]: + golden_f32 = golden_f32 + inp + test_outputs_bf16 = [golden_f32.reshape(arr.shape) for arr in test_outputs_f32] + + # Write testoutputs.h (raw BF16 bit patterns as uint16_t) + testOutputStr = _generate_xdna2_outputs_header(test_outputs_bf16) + with open(f'{args.dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # Write network.mlir + mlir_str = deployer.generateMLIR() + with open(f'{args.dumpdir}/network.mlir', 'w') as f: + f.write(mlir_str) + + log.info(f"[XDNA2] Generated: testinputs.h, testoutputs.h, network.mlir -> {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(tiling_arguments = True, + description = "Deeploy XDNA2 Code Generation Utility.") + parser.add_argument('--trace', + action = 'store_true', + default = False, + help = 'Enable execution tracing in the generated MLIR') + parser.add_argument('--trace-buffer-size', + type = int, + default = 8192, + help = 'Trace buffer size in bytes 
(default: 8192)') + args, _ = parser.parse_known_args() + + if args.platform != 'XDNA2': + parser.error(f"This script is for the XDNA2 platform. Got: {args.platform}") + + generateNetworkXDNA2(args) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..a259c93ad7 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -27,7 +27,9 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent - if config.tiling: + if config.platform == "XDNA2": + generation_script = script_dir / "generateNetwork_xdna2.py" + elif config.tiling: generation_script = script_dir / "testMVP.py" else: generation_script = script_dir / "generateNetwork.py" @@ -166,6 +168,9 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: # Run binary directly binary_path = Path(config.build_dir) / "bin" / config.test_name cmd = [str(binary_path)] + # Propagate verbosity to the host binary (e.g. 
XDNA2 main.cpp uses -v) + if config.verbose >= 1: + cmd.append("-v") else: # Run via CMake target cmake_cmd = os.environ.get("CMAKE", "cmake") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..87a6db3c0b 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -139,6 +139,12 @@ def __init__(self, type = int, default = 1024000, help = 'L2 size in bytes\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'L3 size in bytes\n') self.add_argument('--randomizedMemoryScheduler', action = "store_true", help = 'Enable randomized memory scheduler\n') @@ -181,7 +187,8 @@ def create_config_from_args(args: argparse.Namespace, platform: str, simulator: str, tiling: bool, - platform_specific_cmake_args: Optional[list] = None) -> DeeployTestConfig: + platform_specific_cmake_args: Optional[list] = None, + gen_args_callback = None) -> DeeployTestConfig: script_path = Path(__file__).resolve() base_dir = script_path.parent.parent @@ -221,6 +228,8 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append(f"--l1={args.l1}") if hasattr(args, 'l2') and args.l2 and args.l2 != 1024000: gen_args_list.append(f"--l2={args.l2}") + if hasattr(args, 'l3') and args.l3: + gen_args_list.append(f"--l3={args.l3}") if hasattr(args, 'randomizedMemoryScheduler') and args.randomizedMemoryScheduler: gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: @@ -235,6 +244,10 @@ def create_config_from_args(args: argparse.Namespace, if not tiling and getattr(args, 'profileUntiled', False): gen_args_list.append("--profileUntiled") + # Allow platform-specific runners to append their own generation args + if gen_args_callback: + gen_args_callback(args, gen_args_list) + config = DeeployTestConfig( test_name = test_name, test_dir = test_dir_abs, @@ -313,7 +326,9 @@ def 
main(default_platform: Optional[str] = None, tiling_enabled: bool = False, platform_specific_cmake_args: Optional[list] = None, parsed_args: Optional[argparse.Namespace] = None, - parser_setup_callback = None): + parser_setup_callback = None, + gen_args_callback = None, + post_sim_callback = None): """ Main entry point for Deeploy test runners. @@ -324,6 +339,10 @@ def main(default_platform: Optional[str] = None, platform_specific_cmake_args: Additional CMake arguments for platform-specific configurations parsed_args: Pre-parsed arguments (if None, will parse from sys.argv) parser_setup_callback: Optional callback to configure parser before parsing (receives parser as arg) + gen_args_callback: Optional callback ``(args, gen_args_list) -> None`` to append + platform-specific generation arguments after the base args are collected. + post_sim_callback: Optional callback ``(config, result, args) -> None`` invoked + after simulation completes (e.g. for trace post-processing). """ if parsed_args is None: @@ -348,6 +367,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "xdna2": "XDNA2", } if args.platform: @@ -405,13 +425,17 @@ def main(default_platform: Optional[str] = None, if hasattr(args, 'num_clusters'): platform_specific_cmake_args.append(f"-DNUM_CLUSTERS={args.num_clusters}") - config = create_config_from_args(args, platform, simulator, tiling_enabled, platform_specific_cmake_args) + config = create_config_from_args(args, platform, simulator, tiling_enabled, platform_specific_cmake_args, + gen_args_callback) print_configuration(config) try: result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + if post_sim_callback and not args.skipsim: + post_sim_callback(config, result, args) + print_colored_result(result, config.test_name) return 0 if result.success else 1 diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 
9d526906f9..9155ed77ae 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -31,7 +31,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +76,10 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "XDNA2": + from Deeploy.Targets.XDNA2.Platform import XDNA2Platform + Platform = XDNA2Platform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -274,6 +278,30 @@ def mapDeployer(platform: DeploymentPlatform, deeployStateDir = deeployStateDir) else: - raise RuntimeError(f"Deployer for platform {platform} is not implemented") + # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms + try: + from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer + from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, \ + XDNA2Optimizer, XDNA2Platform + except ImportError: + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if not isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if loweringOptimizer is None: + loweringOptimizer = XDNA2Optimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = XDNA2Deployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = 
deeployStateDir) return deployer diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..e233cc9b1d 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -61,7 +61,7 @@ def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int class TestGeneratorArgumentParser(argparse.ArgumentParser): - def __init__(self, description = None): + def __init__(self, tiling_arguments: bool = False, description = None): formatter = _ArgumentDefaultMetavarTypeFormatter @@ -70,6 +70,8 @@ def __init__(self, description = None): else: super().__init__(description = description, formatter_class = formatter) + self.tiling_arguments = tiling_arguments + self.add_argument('-t', metavar = '', dest = 'dir', @@ -90,6 +92,27 @@ def __init__(self, description = None): help = 'Set the output dump folder\n') self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n') + # Tiling-related arguments (for XDNA2 and other tiled platforms) + if self.tiling_arguments: + self.add_argument('--l1', + metavar = '', + dest = 'l1', + type = int, + default = None, + help = 'Set L1 memory size in bytes (enables tiling if specified).\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'Set L3 memory size in bytes.\n') + self.add_argument('--defaultMemLevel', + metavar = '', + dest = 'defaultMemLevel', + type = str, + default = "L3", + help = 'Set default memory level (default: L3)\n') + self.args = None def parse_args(self, args = None, namespace = None) -> argparse.Namespace: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..dca5c7b7cc 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -42,6 +42,7 @@ from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS from test_softhier_config import KERNEL_TESTS as 
SOFTHIER_KERNEL_TESTS from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS +from test_xdna2_config import KERNEL_TESTS as XDNA2_KERNEL_TESTS from testUtils.pytestRunner import create_test_config, run_and_assert_test @@ -117,6 +118,11 @@ def param_id(param): "model_tests": GAP9_MODEL_TESTS, "default_num_cores": GAP9_DEFAULT_NUM_CORES, }, + "xdna2": { + "platform": "XDNA2", + "simulator": "host", + "kernel_tests": XDNA2_KERNEL_TESTS, + }, } ### Markers summary ### @@ -987,3 +993,21 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.xdna2 +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", XDNA2_KERNEL_TESTS, ids = XDNA2_KERNEL_TESTS) +def test_xdna2_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["xdna2"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_xdna2_config.py b/DeeployTest/test_xdna2_config.py new file mode 100644 index 0000000000..7988aa09b1 --- /dev/null +++ b/DeeployTest/test_xdna2_config.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# Test list for the XDNA2 platform. +# Each entry is a relative path under DeeployTest/Tests/. 
+ +KERNEL_TESTS = [ + "Kernels/BF16/Add/Regular", +] diff --git a/README_XDNA.md b/README_XDNA.md new file mode 100644 index 0000000000..56cfcb1225 --- /dev/null +++ b/README_XDNA.md @@ -0,0 +1,51 @@ +# How to use Deeploy on the XDNA2 NPU + +A dockerfile containing everything required to run on XDNA2 is available to build with the dockerfile at `Container/Dockerfile.deeploy-xdna`. + +You can build it locally on Ubuntu 24.04 with: +``` +docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . +``` + +You need to have XRT installed on your host, once installed it is present in `/opt/xilinx/xrt`. You can run the docker container previously built with: +``` +docker run -it \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v "$(pwd)":/app/Deeploy \ + -v /opt/xilinx:/opt/xilinx \ + --name deeploy_dev \ + deeploy-xdna:local +``` + +Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. + +Once the container is started you can run a simple Add node, from ONNX to execution with: +``` +pip install -e ./ && \ +cd DeeployTest && \ +python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ +``` + +## CI with a Self-Hosted Runner + +XDNA2 tests run on a self-hosted GitHub Actions runner with NPU access. +The Docker image is built locally on the runner (not distributed via GHCR). + +### One-time setup on the runner machine + +1. Build the Docker image: + ``` + docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . + ``` + +2. Register the GitHub Actions runner (Settings → Actions → Runners → New self-hosted runner). + Use the label **`xdna2-npu`** and install as a service: + ``` + ./svc.sh install && ./svc.sh start + ``` + +3. Make sure the runner user has access to `/dev/accel/accel0` (e.g. is in the `render` group). 
+ +Once the runner is registered, pushes and PRs automatically trigger the +`CI • XDNA2` workflow defined in `.github/workflows/ci-platform-xdna2.yml`. \ No newline at end of file diff --git a/TargetLibraries/XDNA2/CMakeLists.txt b/TargetLibraries/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..c2e1ffdecd --- /dev/null +++ b/TargetLibraries/XDNA2/CMakeLists.txt @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) kernel library +# +# Compiles AIE C++ kernels using the llvm-aie (Peano) cross-compiler. +# Exports a CMake target `xdna2_kernels` that other targets can depend on, +# and sets XDNA2_KERNEL_OBJECTS in the parent scope. +# --------------------------------------------------------------------------- + +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- Resolve llvm-aie (Peano) install dir --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) +endif() +if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. 
" + "Please set the environment variable LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") +endif() + +# --- Resolve mlir-aie include dir (aie_api headers) --- +if(NOT MLIR_AIE_INCLUDE_DIR) + if(DEFINED ENV{MLIR_AIE_INCLUDE_DIR}) + set(MLIR_AIE_INCLUDE_DIR $ENV{MLIR_AIE_INCLUDE_DIR}) + else() + execute_process( + COMMAND ${Python3_EXECUTABLE} + -c "import aie.utils.config; print(aie.utils.config.cxx_header_path());" + OUTPUT_VARIABLE MLIR_AIE_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _aie_cfg_result + ) + if(NOT _aie_cfg_result EQUAL 0 OR NOT MLIR_AIE_INCLUDE_DIR) + message(FATAL_ERROR "[XDNA2] Could not query aie.utils.config.cxx_header_path(). " + "Please set the environment variable MLIR_AIE_INCLUDE_DIR or install the mlir-aie wheel.") + endif() + endif() +endif() + +set(LLVM_AIE_CLANG "${LLVM_AIE_INSTALL_DIR}/bin/clang++") + +message(STATUS "[XDNA2 Kernels] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2 Kernels] MLIR_AIE_INCLUDE_DIR = ${MLIR_AIE_INCLUDE_DIR}") + +# --------------------------------------------------------------------------- +# Compile AIE kernels +# --------------------------------------------------------------------------- +file(GLOB XDNA2_KERNEL_SOURCES "${CMAKE_CURRENT_LIST_DIR}/kernels/*.cc") + +set(XDNA2_KERNEL_OBJECTS "") + +foreach(KERNEL_SRC ${XDNA2_KERNEL_SOURCES}) + get_filename_component(KERNEL_NAME ${KERNEL_SRC} NAME_WE) + set(KERNEL_OBJ "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.o") + + add_custom_command( + OUTPUT "${KERNEL_OBJ}" + COMMAND "${LLVM_AIE_CLANG}" + --target=aie2p-none-unknown-elf + "-I${MLIR_AIE_INCLUDE_DIR}" + -std=c++20 + -Wno-parentheses + -Wno-attributes + -Wno-macro-redefined + -Wno-empty-body + -O2 + -DNDEBUG + -c "${KERNEL_SRC}" + -o "${KERNEL_OBJ}" + DEPENDS "${KERNEL_SRC}" + COMMENT "[XDNA2] Compiling AIE kernel: ${KERNEL_NAME}.cc -> ${KERNEL_NAME}.o" + VERBATIM + ) + + list(APPEND XDNA2_KERNEL_OBJECTS "${KERNEL_OBJ}") +endforeach() + 
+add_custom_target(xdna2_kernels DEPENDS ${XDNA2_KERNEL_OBJECTS}) + +# Export kernel objects to parent scope so the testbench CMake can use them +set(XDNA2_KERNEL_OBJECTS "${XDNA2_KERNEL_OBJECTS}" PARENT_SCOPE) diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc new file mode 100644 index 0000000000..13b8b54637 --- /dev/null +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All +// rights reserved. SPDX-License-Identifier: Apache-2.0 + +#define NOCPP + +#include +#include +#include +#include +#include +#include + +template +void eltwise_add(T_in *a, T_in *b, T_out *c, int size) { + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } +} + +template +void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) { + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); +} + +extern "C" { + +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_add(a_in, b_in, c_out, size); +} + +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_vadd(a_in, b_in, c_out, size); +} + +} // extern "C" diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst index 0b3d97c761..c957bb22b2 100644 --- a/docs/tutorials/overview.rst +++ b/docs/tutorials/overview.rst @@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t introduction debugging + xdna2_tracing diff --git a/docs/tutorials/xdna2_tracing.md 
b/docs/tutorials/xdna2_tracing.md new file mode 100644 index 0000000000..d1b4bcb4e8 --- /dev/null +++ b/docs/tutorials/xdna2_tracing.md @@ -0,0 +1,68 @@ + + +# XDNA2 Execution Tracing + +The XDNA2 backend supports optional AIE execution tracing. +When enabled, the generated MLIR includes trace configuration for both **core +events** (instruction execution, stalls, port activity) and **memory events** +(DMA start/finish/starvation). +After execution on the NPU, the raw trace data is parsed into a JSON file +that can be visualised in [Perfetto](https://ui.perfetto.dev/). + +## Quick Start + +From the `DeeployTest/` directory: + +```bash +# Generate, build, run on the NPU, and parse the trace in one step: +python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular --trace + +# With a custom trace buffer size (default: 8192 bytes): +python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular \ + --trace --trace-buffer-size 16384 +``` + +After execution two files are produced next to the test binary +(inside `TEST_XDNA2/build_master/bin/`): + +| File | Description | +|------|-------------| +| `trace.txt` | Raw hex trace words read back from the NPU | +| `trace.json` | Parsed trace in [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU) | + +Open `trace.json` in [Perfetto UI](https://ui.perfetto.dev/) to visualise +core and memory trace timelines. + +## How It Works + +Enabling `--trace` triggers three additional MLIR code-transformation passes +during code generation: + +1. **`MLIRCoreTracePass`** — Emits an `aie.trace` block on the compute tile + configured for 8 core events (vector instructions, stalls, port activity) + with packet-based routing. +2. **`MLIRMemTracePass`** — Emits a second `aie.trace` block for 8 memory/DMA + events (S2MM/MM2S start, finish, starvation) with event-based + start/stop synchronised to the core trace via broadcast signals. +3. 
**`MLIRTraceRuntimePass`** — Adds `trace_host_config` and + `trace_start_config` calls to the runtime sequence to activate the + configured traces at execution time. + +On the host side, the XRT testbench (`main.cpp`) allocates a trace buffer +object, passes it as kernel argument 7, and writes the data back to `trace.txt` after execution. + +The post-simulation callback in `deeployRunner_xdna2.py` then invokes the +mlir-aie trace parser (`aie.utils.trace.parse`) against the lowered MLIR +(`main_physical_with_elfs.mlir`) to produce the final `trace.json`. + +## Traced Events + +- `INSTR_EVENT_0`: Emitted by the `event0();` call, usually called at the beginning of the kernels (see `TargetLibraries/XDNA2/kernels/add.cc`). +- `INSTR_EVENT_1`: Emitted by the `event1();` call, usually called at the end of the kernels. +- `INSTR_VECTOR`: Emitted every time the vector unit is used, can be useful to see how well the kernel is using the vector unit. +- `PORT_RUNNING_0`: Emitted when a DMA transfer is running on port 0. +- `PORT_RUNNING_1`: Emitted when a DMA transfer is running on port 1. diff --git a/requirements-xdna.txt b/requirements-xdna.txt new file mode 100644 index 0000000000..3cd66d39c8 --- /dev/null +++ b/requirements-xdna.txt @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-3 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie +llvm-aie