diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml index 14e80631d1..c642bfe6d2 100644 --- a/.github/workflows/_runner-chimera.yml +++ b/.github/workflows/_runner-chimera.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml index 3fbdf0ee16..c6be8af465 100644 --- a/.github/workflows/_runner-cortexm.yml +++ b/.github/workflows/_runner-cortexm.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml index a5c8b3ac98..6934014447 100644 --- a/.github/workflows/_runner-gap9-tiled.yml +++ b/.github/workflows/_runner-gap9-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml index e1d6e452a6..cc790d3d33 100644 --- a/.github/workflows/_runner-gap9.yml +++ b/.github/workflows/_runner-gap9.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml index 6681cbac96..b44b47f73d 100644 --- a/.github/workflows/_runner-generic.yml +++ b/.github/workflows/_runner-generic.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ 
inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-mempool.yml b/.github/workflows/_runner-mempool.yml index deb4809330..b2f0ae4f7a 100644 --- a/.github/workflows/_runner-mempool.yml +++ b/.github/workflows/_runner-mempool.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml index b1f5f2fcb3..664d5f01be 100644 --- a/.github/workflows/_runner-siracusa-neureka-tiled.yml +++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index ea9c8989af..cc09f234e0 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml index ea8fe5d405..1c51333f7a 100644 --- a/.github/workflows/_runner-siracusa.yml +++ b/.github/workflows/_runner-siracusa.yml @@ -25,6 +25,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git 
a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml index fbd5195b08..bcdd58a166 100644 --- a/.github/workflows/_runner-snitch-tiled-sequential.yml +++ b/.github/workflows/_runner-snitch-tiled-sequential.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml index bc599e4fe7..48130ea26a 100644 --- a/.github/workflows/_runner-snitch.yml +++ b/.github/workflows/_runner-snitch.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml index b067664f40..2624cbe15d 100644 --- a/.github/workflows/_runner-softhier.yml +++ b/.github/workflows/_runner-softhier.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml new file mode 100644 index 0000000000..2c08f1bf46 --- /dev/null +++ b/.github/workflows/_runner-xdna2.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-xdna2 + +"on": + workflow_call: + inputs: + pytest-marker: + required: true + type: string + docker-image: + required: false + type: string + default: "deeploy-xdna:local" + +jobs: + test-runner-xdna2: + runs-on: xdna2-npu + # NOTE: We cannot use the `container:` directive here because + # GitHub Actions does 
not support `--device` flags required for + # NPU access (/dev/accel/accel0). Instead we use explicit + # `docker run` commands. + steps: + - name: Fix workspace permissions + shell: bash + run: | + docker run --rm \ + -v "${{ github.workspace }}":/workspace \ + ${{ inputs.docker-image }} \ + chown -R $(id -u):$(id -g) /workspace || true + + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' + + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Run Tests in Docker + shell: bash + run: | + docker run --rm \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /opt/xilinx:/opt/xilinx \ + -v "${{ github.workspace }}":/app/Deeploy \ + -w /app/Deeploy \ + ${{ inputs.docker-image }} \ + bash -c " + pip install -e . && + cd DeeployTest && + pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' + " diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml index fc468306b1..84f2779e4c 100644 --- a/.github/workflows/ci-deeploy.yml +++ b/.github/workflows/ci-deeploy.yml @@ -35,6 +35,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: @@ -49,6 +51,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/ci-platform-xdna2.yml b/.github/workflows/ci-platform-xdna2.yml new file mode 100644 index 0000000000..ccf455edf7 --- /dev/null +++ b/.github/workflows/ci-platform-xdna2.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • XDNA2 + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + 
workflow_dispatch: + inputs: + docker_image: + description: "XDNA2 Docker image (must be pre-built on the runner)" + required: false + default: "deeploy-xdna:local" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + xdna2-kernels: + uses: ./.github/workflows/_runner-xdna2.yml + with: + pytest-marker: "kernels" + docker-image: ${{ inputs.docker_image || 'deeploy-xdna:local' }} diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml index 038789ce40..e6f382c5ca 100644 --- a/.github/workflows/infra-generate-ccache-gap9.yml +++ b/.github/workflows/infra-generate-ccache-gap9.yml @@ -23,6 +23,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:latest' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml index e4d00ea911..1fdba01512 100644 --- a/.github/workflows/infra-generate-ccache.yml +++ b/.github/workflows/infra-generate-ccache.yml @@ -22,6 +22,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.gitignore b/.gitignore index d9e4faace3..a9993aac54 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,7 @@ CHANGELOG_GEN.md # Container Artifacts .pyusbip/ .cache/ + +# Claude context file +CLAUDE.md +Container/xrt-debs/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..ffc4d64085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) 
message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL XDNA2) + message(STATUS "Building for platform 'XDNA2'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,20 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL XDNA2) + + project(${TESTNAME} LANGUAGES CXX) + + message(STATUS "============================= XDNA2 Configuration ============================") + message(STATUS "[cMake ] GENERATED_SOURCE = " ${GENERATED_SOURCE}) + message(STATUS "[cMake ] TESTNAME = " ${TESTNAME}) + message(STATUS "==============================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/XDNA2) + add_subdirectory(DeeployTest/Platforms/XDNA2) + +endif() + print_simulation_config() diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna new file mode 100644 index 0000000000..16907402df --- /dev/null +++ b/Container/Dockerfile.deeploy-xdna @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +ENV LLVM_INSTALL_DIR="nope" + +RUN apt-get update && apt-get install -y \ + software-properties-common \ + && add-apt-repository -y ppa:amd-team/xrt \ + && apt-get update && apt-get install -y \ + cmake \ + ninja-build \ + g++ \ + git \ + git-lfs \ + python3 \ + python3-pip \ + python-is-python3 \ + uuid-dev \ + wget \ + curl \ + ccache \ + libxrt2 \ + libxrt-npu2 \ + libxrt-dev \ + libxrt-utils \ + libxrt-utils-npu \ + && rm -rf /var/lib/apt/lists/* + +ENV XILINX_XRT=/opt/xilinx/xrt +ENV PATH=${XILINX_XRT}/bin:${PATH} +ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib + + +WORKDIR /app +COPY pyproject.toml requirements-xdna.txt ./ +RUN pip install toml-to-requirements && \ + toml-to-req --toml-file pyproject.toml && \ + pip 
install -r requirements.txt && \ + pip install -r requirements-xdna.txt && \ + rm -f requirements.txt pyproject.toml requirements-xdna.txt + +ENV MLIR_AIE_PYTHON=/usr/bin/python3 + +WORKDIR /app/Deeploy diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py new file mode 100644 index 0000000000..9fd92757a1 --- /dev/null +++ b/Deeploy/MLIRDataTypes.py @@ -0,0 +1,206 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Base classes for MLIR-emitting node templates and code transformations. + +This module provides: + +* :class:`MLIRNodeTemplate` — a :class:`NodeTemplate` subclass whose + ``emit()`` method populates an ``mlir.ir.Module`` instead of rendering C. +* :class:`MLIRExecutionBlock` — MLIR-specific execution state replacing the + C-oriented :class:`ExecutionBlock` (code-snippet deque) with MLIR builder + state (tile references, ObjectFifo handles, tiling parameters). +* :class:`MLIRCodeTransformationPass` — base class for MLIR code + transformation passes that operate on an :class:`MLIRExecutionBlock`. +* :class:`MLIRCodeTransformation` — two-phase pass container + (``devicePasses`` + ``runtimeSequencePasses``) that the deployer + orchestrates inside ``@aie_d.device`` and ``@aiex_d.runtime_sequence`` + regions respectively. + +All classes are intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse them. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from Deeploy.DeeployTypes import NodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation + +# ====================================================================== +# MLIRExecutionBlock +# ====================================================================== + + +class MLIRExecutionBlock: + """MLIR-specific execution state for a single operator. 
+ + Replaces the C-oriented :class:`ExecutionBlock` (which holds a deque of + :class:`CodeSnippet` objects) with fields that carry MLIR builder state + through the code-transformation pipeline. + + Passes populate fields progressively: + + 1. The deployer sets ``computeTile``, ``shimTile``, + ``operatorRepresentation``, and ``patternMemoryConstraint``. + 2. A device-phase pass (e.g. ``MLIRObjectFifoPass``) fills + ``fifoMap``, ``fifoTypes``, ``tileSize``, ``numTiles``, + ``kernelFuncName``, and ``kernelObjFile``. + 3. The deployer sets ``runtimeSequenceArgs`` before the runtime- + sequence phase. + 4. A runtime-sequence pass (e.g. ``MLIRRuntimeSequencePass``) reads + all of the above to emit DMA configuration. + """ + + def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: + # MLIR tile references (set by deployer) + self.computeTile: Any = computeTile + self.shimTile: Any = shimTile + + # Operator metadata (set by deployer from parser) + self.operatorRepresentation: OperatorRepresentation = {} + + # Tiling constraint from midend solver (may be None) + self.patternMemoryConstraint: Any = None + + # Populated by device-phase passes (e.g. 
MLIRObjectFifoPass) + self.fifoMap: Dict[str, str] = {} # tensor name → FIFO name + self.fifoTypes: Dict[str, Any] = {} # tensor name → MemRefType + self.tileSize: int = 0 + self.numTiles: int = 0 + self.numElements: int = 0 + self.kernelFuncName: Optional[str] = None + self.kernelObjFile: Optional[str] = None + + # The MLIRNodeTemplate for this node (set by deployer, called by + # MLIRComputeCorePass to emit the kernel call inside the core block) + self.template: Optional[Any] = None + + # Set by deployer before runtime-sequence phase + self.runtimeSequenceArgs: List[Any] = [] + + # Input / output tensor name lists (set by deployer from parser) + self.inputNames: List[str] = [] + self.outputNames: List[str] = [] + + # Trace support (populated by device-phase trace passes, read by + # runtime-sequence trace pass) + self.traceConfigs: List[str] = [] + self.traceBufferSize: int = 0 + + +# ====================================================================== +# MLIRCodeTransformationPass / MLIRCodeTransformation +# ====================================================================== + + +class MLIRCodeTransformationPass: + """Base class for passes that transform an :class:`MLIRExecutionBlock`. + + Subclasses override :meth:`apply` to read / mutate the block's fields + and optionally emit MLIR operations into the current insertion point. + """ + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + return ctxt, mlirBlock + + +class MLIRCodeTransformation: + """Two-phase pass container for MLIR code transformations. + + *devicePasses* run inside an ``@aie_d.device(...)`` region (ObjectFifo + creation, external-kernel declarations, …). + + *runtimeSequencePasses* run inside an ``@aiex_d.runtime_sequence`` + block (DMA configuration, token await, …). + + The deployer calls :meth:`applyDevicePasses` and + :meth:`applyRuntimeSequencePasses` at the appropriate points. 
+ """ + + def __init__(self, + devicePasses: Optional[List[MLIRCodeTransformationPass]] = None, + runtimeSequencePasses: Optional[List[MLIRCodeTransformationPass]] = None) -> None: + self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] + self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] + + def applyDevicePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.devicePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + def applyRuntimeSequencePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.runtimeSequencePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + +# ====================================================================== +# MLIRNodeTemplate +# ====================================================================== + + +class MLIRNodeTemplate(NodeTemplate): + """NodeTemplate subclass that emits MLIR instead of C code. + + Subclasses must override :meth:`emit` to add dialect operations to an + ``mlir.ir.Module`` (or region / insertion point provided via *kwargs*). + + ``generate()`` is overridden as a convenience that constructs a + standalone module, calls :meth:`emit`, and returns the MLIR text. + The base-class ``alignToContext`` / ``hoistTransientBuffers`` hooks are + retained and work unchanged. + """ + + def __init__(self): + # Empty Mako template — no C code is generated. + super().__init__("") + + # ------------------------------------------------------------------ + # Subclass API + # ------------------------------------------------------------------ + + @abstractmethod + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Populate an MLIR module with the operations for this node. 
+ + The caller (typically the deployer) sets up an ``mlir.ir.Module`` + with the appropriate device wrapper and passes dialect-specific + context through *kwargs* (e.g. insertion point, tile references, + ObjectFifo handles). + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + The parser's node representation (buffer names, sizes, types …). + **kwargs + Dialect-specific context provided by the deployer. + """ + ... + + # ------------------------------------------------------------------ + # NodeTemplate overrides + # ------------------------------------------------------------------ + + def generate(self, operatorRepresentation = {}, **kwargs) -> str: + """Generate an MLIR string for this node. + + This default implementation is a thin wrapper: it delegates to + :meth:`emit`. Deployers that need to build a single module from + multiple nodes should call :meth:`emit` directly with the shared + module context and then stringify the complete module themselves. + + Returns + ------- + str + MLIR text (printable module or fragment). 
+ """ + self.emit(operatorRepresentation, **kwargs) + return "" diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py new file mode 100644 index 0000000000..1f0e7f7587 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import MLIRComputeCorePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass +from Deeploy.Targets.XDNA2.Templates import AddTemplate +from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker + +_ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] +_ADD_OUTPUT_KEYS = ['data_out'] + +# JUNGVI: TODO: This logic should not be boiled down for 1 operator but should be applied on every nodes of the network +# Likewise the kernelName and object file name should be specified in the node template of each operator. 
+XDNA2Transformer = MLIRCodeTransformation( + devicePasses = [ + MLIRObjectFifoPass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + kernelFuncName = "eltwise_add_bf16_vector", + kernelObjFile = "add.o", + ), + MLIRComputeCorePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], + runtimeSequencePasses = [ + MLIRRuntimeSequencePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], +) + +XDNA2AddBindings = [ + NodeBinding( + XDNA2AddChecker([PointerClass(bfloat16_t), PointerClass(bfloat16_t)], [PointerClass(bfloat16_t)]), + AddTemplate.referenceTemplate, + XDNA2Transformer, + ) +] diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py new file mode 100644 index 0000000000..7d98b30d6e --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits the AIE core block with tiling loops. + +This pass constructs the structural MLIR around the compute kernel: + +1. Opens an ``@aie_d.core`` block linked to the kernel object file. +2. Opens an infinite outer ``scf.for`` loop (streaming). +3. Opens an inner ``scf.for`` tiling loop (``numTiles`` iterations). +4. Acquires input/output ObjectFifo elements. +5. Builds a modified ``operatorRepresentation`` where tensor keys + (e.g. ``data_in_1``) are replaced with the acquired MLIR memref + values and ``size`` is replaced with the tile size — mirroring + how ``TilingVariableReplacement`` rewrites buffer names for C + backends. +6. Calls ``template.emit(modifiedOpRepr)`` — the template only emits + its ``func_d.call`` using values from ``operatorRepresentation``. +7. Releases all FIFO elements and closes loops. 
+ +The pass is operator-agnostic: it only needs the tensor key lists and +reads everything else from the :class:`MLIRExecutionBlock` populated by +prior passes (e.g. :class:`MLIRObjectFifoPass`). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import scf as scf_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRComputeCorePass(MLIRCodeTransformationPass): + """Emit ``@aie_d.core`` with tiling loops and FIFO acquire/release. + + The template stored on ``mlirBlock.template`` is called inside the + inner loop with a *modified* ``operatorRepresentation`` whose tensor + entries point to acquired MLIR memref values instead of buffer name + strings. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + kernelObj = mlirBlock.kernelObjFile + tileSize = mlirBlock.tileSize + numTiles = mlirBlock.numTiles + opRepr = mlirBlock.operatorRepresentation + template = mlirBlock.template + + # Use the first tensor's type as representative tile memref type + firstKey = self.inputTensorKeys[0] + tileTy = mlirBlock.fifoTypes[firstKey] + + @aie_d.core(computeTile) + def _core(): + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, numTiles, 1): + # Acquire all input FIFO elements + acquiredElements = {} + for key in self.inputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) + + # Acquire all output FIFO elements + for key in self.outputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) + + # Build modified opRepr: replace tensor names with MLIR + # values, replace size with tile size. This mirrors the + # C backend's TilingVariableReplacement pass. 
+ modifiedOpRepr = {**opRepr, 'size': tileSize, **acquiredElements} + + # Call the template — it only emits func_d.call() + template.emit(modifiedOpRepr) + + # Release all inputs + for key in self.inputTensorKeys: + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + # Release all outputs + for key in self.outputTensorKeys: + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + + scf_d.yield_([]) + scf_d.yield_([]) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py new file mode 100644 index 0000000000..789a9a68cf --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRCoreTracePass.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits a core trace configuration. + +Emits an ``aie.trace`` block on the compute tile that captures core +instruction events, stall events, and port-monitoring events. The +configuration name is appended to ``mlirBlock.traceConfigs`` so that the +runtime-sequence trace pass can activate it. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional, Tuple + +from aie.dialects.aie import DMAChannelDir, TraceMode, TracePacketType, WireBundle, trace, trace_event, trace_mode, \ + trace_packet, trace_port, trace_start, trace_stop + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +_DEFAULT_CORE_EVENTS = [ + "INSTR_EVENT_0", + "INSTR_EVENT_1", + "INSTR_VECTOR", + "MEMORY_STALL", + "STREAM_STALL", + "LOCK_STALL", + "PORT_RUNNING_0", + "PORT_RUNNING_1", +] + +_DEFAULT_CORE_PORTS = [ + (0, WireBundle.DMA, 0, DMAChannelDir.S2MM), + (1, WireBundle.DMA, 0, DMAChannelDir.MM2S), +] + + +class MLIRCoreTracePass(MLIRCodeTransformationPass): + """Emit a core trace configuration on the compute tile. + + Parameters + ---------- + packetId : int + Trace packet ID (default 1). + events : list of str, optional + Event names to capture (max 8). Defaults to the reference set of + instruction / stall / port-running events. + ports : list of tuple, optional + ``(slot, WireBundle, channel, DMAChannelDir)`` tuples for + port-monitoring event slots. + startBroadcast : int + Broadcast channel that starts the trace (default 15). + stopBroadcast : int + Broadcast channel that stops the trace (default 14). 
+ """ + + def __init__( + self, + packetId: int = 1, + events: Optional[List[str]] = None, + ports: Optional[List[tuple]] = None, + startBroadcast: int = 15, + stopBroadcast: int = 14, + ) -> None: + self.packetId = packetId + self.events = events if events is not None else list(_DEFAULT_CORE_EVENTS) + self.ports = ports if ports is not None else list(_DEFAULT_CORE_PORTS) + self.startBroadcast = startBroadcast + self.stopBroadcast = stopBroadcast + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + configName = f"core_trace_{name}" + + @trace(computeTile, configName) + def _core_trace(): + trace_mode(TraceMode.EventTime) + trace_packet(self.packetId, TracePacketType.Core) + for event in self.events: + trace_event(event) + for slot, port, channel, direction in self.ports: + trace_port(slot, port, channel, direction) + trace_start(broadcast = self.startBroadcast) + trace_stop(broadcast = self.stopBroadcast) + + mlirBlock.traceConfigs.append(configName) + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py new file mode 100644 index 0000000000..3e2757c3cf --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRMemTracePass.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits a memory trace configuration. + +Emits an ``aie.trace`` block on the compute tile that captures DMA +start/finish/starvation events from the memory module trace unit. The +configuration name is appended to ``mlirBlock.traceConfigs`` so that the +runtime-sequence trace pass can activate it. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional, Tuple + +from aie.dialects.aie import TracePacketType, trace, trace_event, trace_packet, trace_start, trace_stop + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +_DEFAULT_MEM_EVENTS = [ + "DMA_S2MM_0_START_TASK", + "DMA_S2MM_1_START_TASK", + "DMA_MM2S_0_START_TASK", + "DMA_S2MM_0_FINISHED_TASK", + "DMA_S2MM_1_FINISHED_TASK", + "DMA_MM2S_0_FINISHED_TASK", + "DMA_S2MM_0_STREAM_STARVATION", + "DMA_S2MM_1_STREAM_STARVATION", +] + + +class MLIRMemTracePass(MLIRCodeTransformationPass): + """Emit a memory trace configuration on the compute tile. + + Parameters + ---------- + packetId : int + Trace packet ID (default 3). + events : list of str, optional + Event names to capture (max 8). Defaults to DMA start / finish / + starvation events matching the reference example. + startEvent : str + Event that starts the trace (default ``"BROADCAST_15"``). + stopEvent : str + Event that stops the trace (default ``"BROADCAST_14"``). 
+ """ + + def __init__( + self, + packetId: int = 3, + events: Optional[List[str]] = None, + startEvent: str = "BROADCAST_15", + stopEvent: str = "BROADCAST_14", + ) -> None: + self.packetId = packetId + self.events = events if events is not None else list(_DEFAULT_MEM_EVENTS) + self.startEvent = startEvent + self.stopEvent = stopEvent + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + configName = f"mem_trace_{name}" + + @trace(computeTile, configName) + def _mem_trace(): + trace_packet(self.packetId, TracePacketType.Mem) + for event in self.events: + trace_event(event) + trace_start(event = self.startEvent) + trace_stop(event = self.stopEvent) + + mlirBlock.traceConfigs.append(configName) + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py new file mode 100644 index 0000000000..9d86f4d834 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that creates ObjectFifos and declares external kernels. + +Given an :class:`MLIRExecutionBlock` with ``computeTile``, ``shimTile``, +``operatorRepresentation``, and (optionally) ``patternMemoryConstraint``, +this pass: + +1. Derives ``tileSize`` and ``numTiles`` (from tiling solver or fallback). +2. Creates one ``aie_d.object_fifo`` per input tensor (shim → compute) + and one per output tensor (compute → shim), all with depth 2 + (double-buffering). +3. Declares the external kernel via ``aie_d.external_func``. +4. Stores FIFO names, types, and kernel metadata on the block for + downstream passes and the compute template. 
+ +The pass is operator-agnostic — it only needs the tensor names and a +tile-size derivation function. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import aie.ir as ir +import numpy as np +from aie.dialects import aie as aie_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +MAX_TILE_SIZE = 1024 + + +def _deriveTileSize(numElements: int, patternMemoryConstraint) -> int: + """Extract tile size from the tiling solution, or fall back to MAX_TILE_SIZE.""" + tileSize = min(numElements, MAX_TILE_SIZE) + + if patternMemoryConstraint is not None: + try: + nodeConstraint = patternMemoryConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tileSize = int(np.prod(l1Constraint.shape)) + except (AttributeError, IndexError, KeyError): + pass + + # Ensure tile_size evenly divides num_elements + if numElements % tileSize != 0: + tileSize = max(d for d in range(1, tileSize + 1) if numElements % d == 0) + + return tileSize + + +class MLIRObjectFifoPass(MLIRCodeTransformationPass): + """Create ObjectFifos and declare the external kernel. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors + (e.g. ``['data_in_1', 'data_in_2']``). + outputTensorKeys : list of str + Keys that name output tensors (e.g. ``['data_out']``). + kernelFuncName : str + Symbol name of the external AIE kernel function. + kernelObjFile : str + Object file to link with the AIE core (e.g. ``"add.o"``). 
+ kernelArgTypes : callable, optional + A callable ``(tile_memref_type) -> list[ir.Type]`` that returns + the kernel's argument types. Defaults to + ``[tile_ty, tile_ty, tile_ty, i32]`` (suitable for binary + elementwise ops). + fifoDepth : int + ObjectFifo depth (default 2 for double-buffering). + """ + + def __init__(self, + inputTensorKeys: list, + outputTensorKeys: list, + kernelFuncName: str, + kernelObjFile: str, + kernelArgTypes = None, + fifoDepth: int = 2) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + self.kernelFuncName = kernelFuncName + self.kernelObjFile = kernelObjFile + self._kernelArgTypes = kernelArgTypes + self.fifoDepth = fifoDepth + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + opRepr = mlirBlock.operatorRepresentation + numElements = int(opRepr['size']) + tileSize = _deriveTileSize(numElements, mlirBlock.patternMemoryConstraint) + numTiles = numElements // tileSize + + mlirBlock.tileSize = tileSize + mlirBlock.numTiles = numTiles + mlirBlock.numElements = numElements + mlirBlock.kernelFuncName = self.kernelFuncName + mlirBlock.kernelObjFile = self.kernelObjFile + + tileTy = ir.MemRefType.get((tileSize,), ir.BF16Type.get()) + computeTile = mlirBlock.computeTile + shimTile = mlirBlock.shimTile + + # Create input ObjectFifos (shim → compute) + for idx, key in enumerate(self.inputTensorKeys): + fifoName = f"in{idx + 1}_0" + aie_d.object_fifo(fifoName, shimTile, [computeTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Create output ObjectFifos (compute → shim) + for idx, key in enumerate(self.outputTensorKeys): + fifoName = f"out_{idx}" + aie_d.object_fifo(fifoName, computeTile, [shimTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Declare external kernel + i32 = ir.IntegerType.get_signless(32) + if 
self._kernelArgTypes is not None: + argTypes = self._kernelArgTypes(tileTy) + else: + # Default: binary elementwise (in1, in2, out, size) + argTypes = [tileTy, tileTy, tileTy, i32] + aie_d.external_func(self.kernelFuncName, argTypes, link_with = self.kernelObjFile) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py new file mode 100644 index 0000000000..6331bd0914 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that configures shim DMA for L3 ↔ L1 transfers. + +Given an :class:`MLIRExecutionBlock` whose device-phase passes have already +populated ``fifoMap``, ``numElements``, and ``runtimeSequenceArgs``, this +pass emits ``aiex_d.dma_configure_task_for`` / ``dma_start_task`` / +``dma_await_task`` / ``dma_free_task`` operations directly into the current +``@aiex_d.runtime_sequence`` insertion point. + +The pass is operator-agnostic — it iterates over the FIFO map and +runtime-sequence arguments to configure DMA for every input and output +tensor. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import aie.ir as ir +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRRuntimeSequencePass(MLIRCodeTransformationPass): + """Emit DMA configuration inside a ``runtime_sequence`` block. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + numElements = mlirBlock.numElements + seqArgs = mlirBlock.runtimeSequenceArgs + + dims = [ + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = numElements, stride = 1), + ] + + # Build ordered list of (fifoName, seqArg, isOutput) + transfers = [] + allKeys = self.inputTensorKeys + self.outputTensorKeys + for idx, key in enumerate(allKeys): + fifoName = mlirBlock.fifoMap[key] + isOutput = key in self.outputTensorKeys + transfers.append((fifoName, seqArgs[idx], isOutput)) + + inputTasks = [] + outputTasks = [] + + for fifoName, seqArg, isOutput in transfers: + if isOutput: + task = aiex_d.dma_configure_task_for(fifoName, issue_token = True) + else: + task = aiex_d.dma_configure_task_for(fifoName) + block = task.body.blocks.append() + with ir.InsertionPoint(block): + aie_d.dma_bd(seqArg, offset = 0, len = numElements, dimensions = dims, burst_length = 0) + aie_d.end() + aiex_d.dma_start_task(task) + + if isOutput: + outputTasks.append(task) + else: + inputTasks.append(task) + + # Await output tasks, then free input tasks + for task in outputTasks: + aiex_d.dma_await_task(task) + for task in inputTasks: + aiex_d.dma_free_task(task) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py new file mode 100644 index 0000000000..b429c00eb2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRTraceRuntimePass.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# 
SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that activates trace configurations. + +Emits ``aie.trace.host_config`` to set up the host-side trace buffer and +``aie.trace.start_config`` for each trace configuration registered by +device-phase passes on the :class:`MLIRExecutionBlock`. + +This pass must run **before** the DMA configuration pass +(:class:`MLIRRuntimeSequencePass`) inside the ``runtime_sequence`` block. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +from aie.dialects.aie import trace_host_config, trace_start_config + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRTraceRuntimePass(MLIRCodeTransformationPass): + """Emit trace host configuration and activate trace configs. + + Reads ``mlirBlock.traceConfigs`` (populated by device-phase trace + passes such as :class:`MLIRCoreTracePass` / :class:`MLIRMemTracePass`) + and ``mlirBlock.traceBufferSize`` (set by the deployer). If there are + no trace configs, this pass is a no-op. 
+ """ + + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + + if not mlirBlock.traceConfigs: + return ctxt, mlirBlock + + trace_host_config(buffer_size = mlirBlock.traceBufferSize) + + for configName in mlirBlock.traceConfigs: + trace_start_config(configName) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000000..85db5baffa --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRCoreTracePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRMemTracePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRTraceRuntimePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py new file mode 100644 index 0000000000..565749c5d2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -0,0 +1,202 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 deployer — generates mlir-aie MLIR using ``aie.dialects``. + +Unlike other Deeploy deployers that generate C code via Mako templates, +this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations +and returns the verified MLIR text. + +MLIR generation is split into two phases orchestrated by +:class:`MLIRCodeTransformation`: + +1. 
**Device phase** — inside ``@aie_d.device(npu2)``: for each operator, + run ``devicePasses`` (ObjectFifo creation, external-kernel + declaration) then call ``template.emit()`` (compute core only). +2. **Runtime-sequence phase** — inside ``@aiex_d.runtime_sequence``: + for each operator, run ``runtimeSequencePasses`` (DMA configuration). +""" + +from __future__ import annotations + +import copy +from typing import Callable, Dict, Optional, Type + +import aie.ir as ir +import onnx_graphsurgeon as gs +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +from aie.extras.context import mlir_mod_ctx + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MLIRDataTypes import MLIRCodeTransformation, MLIRExecutionBlock, MLIRNodeTemplate +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRCoreTracePass import MLIRCoreTracePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRMemTracePass import MLIRMemTracePass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRTraceRuntimePass import MLIRTraceRuntimePass + + +class XDNA2Deployer(SignPropDeployer): + """Deployer for the XDNA2 (AIE2p) platform. + + Generates an mlir-aie MLIR module via two-phase code transformation: + + * **Device phase**: ``MLIRObjectFifoPass`` creates ObjectFifos and + declares external kernels; the bound ``MLIRNodeTemplate`` emits + the compute core. + * **Runtime-sequence phase**: ``MLIRRuntimeSequencePass`` configures + shim DMA for L3 ↔ L1 transfers. + + The module is verified via MLIR's built-in verifier before being + returned as a string. 
+ """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first: bool = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Optional[Dict[str, int]] = None, + enableTrace: bool = False, + traceBufferSize: int = 8192): + super().__init__( + graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets if inputOffsets is not None else {}, + ) + self.enableTrace = enableTrace + self.traceBufferSize = traceBufferSize + + # ------------------------------------------------------------------ + # MLIR generation + # ------------------------------------------------------------------ + + def generateMLIR(self) -> str: + """Generate an mlir-aie MLIR module for the prepared graph. + + Iterates over bound layers in two phases: + + 1. **Device phase** — for each node, creates an + :class:`MLIRExecutionBlock`, runs device-phase code- + transformation passes (ObjectFifo creation, kernel + declaration), then calls ``template.emit()`` (compute core). + 2. **Runtime-sequence phase** — opens an + ``@aiex_d.runtime_sequence`` block, sets + ``runtimeSequenceArgs`` on each block, then runs + runtime-sequence passes (DMA configuration). + + Returns + ------- + str + Verified MLIR module string. 
+ """ + assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" + + # Collect per-node info from the bound layers + nodes = [] + for nodeName, layer in self.layerBinding.items(): + mapper = layer.mapper + binder = mapper.binder + template = binder.template + opRepr = mapper.parser.operatorRepresentation + codeTransformer = binder.codeTransformer + + # Tiling constraint from the midend solver (may be None) + executionBlock = binder.executionBlock + tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) + + if not isinstance(template, MLIRNodeTemplate): + raise RuntimeError(f"Node '{nodeName}' has no MLIRNodeTemplate — " + f"only BF16 Add is supported in this release.") + if not isinstance(codeTransformer, MLIRCodeTransformation): + raise RuntimeError(f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") + + # When tracing is enabled, shallow-copy the code transformer + # to inject trace passes without mutating the shared singleton + # (XDNA2Transformer in Bindings.py). 
+ if self.enableTrace: + codeTransformer = copy.copy(codeTransformer) + codeTransformer.devicePasses = list(codeTransformer.devicePasses) + [ + MLIRCoreTracePass(), + MLIRMemTracePass(), + ] + codeTransformer.runtimeSequencePasses = [ + MLIRTraceRuntimePass(), + ] + list(codeTransformer.runtimeSequencePasses) + + nodes.append({ + 'nodeName': nodeName, + 'template': template, + 'opRepr': opRepr, + 'codeTransformer': codeTransformer, + 'tilingConstraint': tilingConstraint, + }) + + if not nodes: + raise RuntimeError("No bound layers found — cannot generate MLIR.") + + # Build the MLIR module + mlirBlocks = [] + + with mlir_mod_ctx() as ctx: + + @aie_d.device(aie_d.AIEDevice.npu2) + def _device(): + computeTile = aie_d.tile(0, 2) # TODO: generalize to full array + shimTile = aie_d.tile(0, 0) + + # === Device phase === + for node in nodes: + # Create MLIRExecutionBlock with deployer-level state + eb = MLIRExecutionBlock(computeTile = computeTile, shimTile = shimTile) + eb.operatorRepresentation = node['opRepr'] + eb.patternMemoryConstraint = node['tilingConstraint'] + eb.template = node['template'] + if self.enableTrace: + eb.traceBufferSize = self.traceBufferSize + + log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + + (" (tiled)" if node['tilingConstraint'] else "")) + + # Run device-phase passes: + # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel + # 2. 
MLIRComputeCorePass — opens core + loops, calls + # template.emit() with acquired FIFO elements in opRepr + self.ctxt, eb = node['codeTransformer'].applyDevicePasses(self.ctxt, eb, node['nodeName']) + + mlirBlocks.append((node, eb)) + + # === Runtime-sequence phase === + # Derive tensor type from the first node's numElements + _, firstEb = mlirBlocks[0] + numElements = firstEb.numElements + tensorTy = ir.MemRefType.get((numElements,), ir.BF16Type.get()) + + @aiex_d.runtime_sequence(tensorTy, tensorTy, tensorTy) + def _seq(*args): + for node, eb in mlirBlocks: + eb.runtimeSequenceArgs = list(args) + log.info(f"[XDNA2] Runtime-sequence phase for '{node['nodeName']}'") + self.ctxt, eb = node['codeTransformer'].applyRuntimeSequencePasses( + self.ctxt, eb, node['nodeName']) + + module = ctx.module + assert module.operation.verify(), \ + "[XDNA2] Generated MLIR module failed verification" + + mlirStr = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlirStr)} bytes)") + return mlirStr diff --git a/Deeploy/Targets/XDNA2/Parsers.py b/Deeploy/Targets/XDNA2/Parsers.py new file mode 100644 index 0000000000..c665312dbd --- /dev/null +++ b/Deeploy/Targets/XDNA2/Parsers.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# XDNA2 reuses the Generic AddParser (see Platform.py). +# Add any XDNA2-specific parsers here as the platform grows. 
diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py new file mode 100644 index 0000000000..b54ce8acb9 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -0,0 +1,157 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ + NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.Generic.Layers import AddLayer +from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings + +# Standard mapper for non-tiled deployment +XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) + +# Tiling-ready mapper for tiled deployment +XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) + +# Standard mapping (used when tiling is disabled) +XDNA2Mapping = { + 'Add': AddLayer([XDNA2AddMapper]), +} + +# Tiling-ready mapping (used when tiling is enabled) +XDNA2TilingMapping = { + 'Add': AddLayer([XDNA2AddTilableMapper]), +} + +# Buffer classes reuse Generic templates since XDNA2Deployer manages its own +# output format (MLIR + test headers) and these templates are never rendered. 
+ + +class XDNA2VariableBuffer(VariableBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2TransientBuffer(TransientBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2ConstantBuffer(ConstantBuffer): + initTemplate = AllocateTemplate.referenceGlobalInitTemplate + allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate + deallocTemplate = FreeTemplate.referenceGlobalTemplate + + +class XDNA2StructBuffer(StructBuffer): + initTemplate = AllocateTemplate.referenceStructInitTemplate + allocTemplate = AllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +# No topology optimization passes needed for the initial Add-only platform. +XDNA2Optimizer = TopologyOptimizer([], name = "XDNA2Optimizer") + + +class XDNA2Engine(DeploymentEngine): + + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", includeList = None) -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + + +class XDNA2AIECoreEngine(DeploymentEngine): + """AIE core execution engine with L1 local memory as preferred memory level. + + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. + Data is transferred from L3 (shared memory) to L1 as needed. 
+ """ + + def __init__(self, + name: str = "XDNA2_AIE_Core", + Mapping = XDNA2Mapping, + initCode: str = "", + includeList = None, + preferredMemoryLevel: str = "L1") -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + self.preferredMemoryLevel = preferredMemoryLevel + + +class XDNA2Platform(DeploymentPlatform): + + def __init__(self, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer): + if engines is None: + engines = [XDNA2Engine()] + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryXDNA2Platform(MemoryPlatform): + """XDNA2 platform with memory hierarchy support for tiling. + + Defines the memory hierarchy: + - L1: 8KB per AIE core (local memory) + - L3: Shared memory for entire AIE array + """ + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer) -> None: + if engines is None: + engines = [XDNA2AIECoreEngine()] + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node. + + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). + Otherwise use the default target memory level (typically L3). 
+ """ + # Check if node has an engine assignment + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name + + +class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): + """Wrapper for XDNA2Platform with memory-level support.""" + + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel): + assert isinstance(platform, XDNA2Platform), \ + f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node.""" + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py new file mode 100644 index 0000000000..6c526a9e38 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 MLIR template for BF16 elementwise Add — pure compute primitive. + +This template emits **only** a ``func_d.call`` to the vectorised +``eltwise_add_bf16_vector`` kernel. It receives its operands (acquired +ObjectFifo element memrefs) and tile size through +``operatorRepresentation``, exactly like a C Mako template receives +buffer-name strings. 
+ +All structural MLIR (``@aie_d.core``, loops, FIFO acquire/release, +ObjectFifo creation, DMA configuration) is handled by +:class:`MLIRCodeTransformationPass` instances upstream. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import aie.ir as ir +from aie.dialects import arith as arith_d +from aie.dialects import func as func_d + +from Deeploy.MLIRDataTypes import MLIRNodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import OperatorRepresentation + + +class XDNA2AddTemplate(MLIRNodeTemplate): + """Pure compute-primitive for BF16 elementwise Add on XDNA2. + + ``emit()`` is called by :class:`MLIRComputeCorePass` inside an + already-open ``@aie_d.core`` + tiling-loop context, with + ``operatorRepresentation`` entries replaced by live MLIR values: + + * ``data_in_1``, ``data_in_2``, ``data_out`` — acquired memref + elements (from ObjectFifo acquire). + * ``size`` — tile size (Python int). + """ + + KERNEL_FN = "eltwise_add_bf16_vector" + + def __init__(self): + super().__init__() + + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Emit a single ``func.call`` to the vectorised Add kernel.""" + i32 = ir.IntegerType.get_signless(32) + sizeVal = arith_d.constant(i32, int(operatorRepresentation['size'])) + func_d.call([], self.KERNEL_FN, [ + operatorRepresentation['data_in_1'], + operatorRepresentation['data_in_2'], + operatorRepresentation['data_out'], + sizeVal, + ]) + + +referenceTemplate = XDNA2AddTemplate() diff --git a/Deeploy/Targets/XDNA2/Templates/__init__.py b/Deeploy/Targets/XDNA2/Templates/__init__.py new file mode 100644 index 0000000000..4694b67df5 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py new file mode 100644 index 0000000000..b2282c34b0 --- 
/dev/null +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" + +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +# For Add operator, reuse the generic BOP (Binary Operator) tile constraint +# which handles equal-dimension binary operations +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = XDNA2AddBindings, + tileConstraint = AddTileConstraint()) diff --git a/Deeploy/Targets/XDNA2/TypeCheckers.py b/Deeploy/Targets/XDNA2/TypeCheckers.py new file mode 100644 index 0000000000..cb9c98fd39 --- /dev/null +++ b/Deeploy/Targets/XDNA2/TypeCheckers.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, Sequence, Type + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer + + +class XDNA2AddChecker(SignPropTypeChecker): + """Type checker for BF16 elementwise Add on XDNA2. + + Both inputs and the output are bfloat16_t pointers. + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: + # Float types do not have a meaningful nLevels — return 1 as a neutral value. 
+ return [1] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: + # BF16 is a signed floating-point type. + return [True] diff --git a/DeeployTest/Platforms/XDNA2/CMakeLists.txt b/DeeployTest/Platforms/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..b96bb5d092 --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/CMakeLists.txt @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) testbench CMake configuration +# +# Included via add_subdirectory() by the top-level CMakeLists.txt when +# -Dplatform=XDNA2 +# is passed. It orchestrates two build steps: +# +# 1. Compile network.mlir to network.xclbin + npu_insts.bin with aiecc.py. +# 2. Compile the XRT host binary (main.cpp) with the system compiler. +# +# AIE kernel compilation is handled by TargetLibraries/XDNA2/CMakeLists.txt. 
+# +# Required variables (set via environment or CMake cache): +# MLIR_AIE_INSTALL_DIR – path to the mlir-aie installation +# (auto-resolved from aie.utils.config or env) +# LLVM_AIE_INSTALL_DIR – path to the llvm-aie installation +# (auto-resolved from aie.utils.config or env) +# XRT_INSTALL_DIR – path to the XRT installation +# (default: $ENV{XILINX_XRT} or /opt/xilinx/xrt) +# GENERATED_SOURCE – directory containing network.mlir, testinputs.h, testoutputs.h +# (set by the Deeploy test runner) +# TESTNAME – name of the test target (set by the Deeploy test runner) +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Resolve toolchain and runtime paths +# --------------------------------------------------------------------------- +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- llvm-aie (Peano) install dir (needed for --peano flag) --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Set LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") + endif() +endif() + +# --- mlir-aie install dir (needed for aiecc.py) --- +set(MLIR_AIE_INSTALL_DIR "$ENV{MLIR_AIE_INSTALL_DIR}" CACHE PATH "mlir-aie install dir") +if(NOT MLIR_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.root_path());" + OUTPUT_VARIABLE MLIR_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT MLIR_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find mlir-aie install dir. 
" + "Set MLIR_AIE_INSTALL_DIR or install the mlir-aie wheel.") + endif() +endif() + +# --- XRT install dir --- +if(NOT XRT_INSTALL_DIR) + if(DEFINED ENV{XILINX_XRT}) + set(XRT_INSTALL_DIR $ENV{XILINX_XRT}) + else() + set(XRT_INSTALL_DIR "/opt/xilinx/xrt") + endif() +endif() + +set(AIECC_PY "${MLIR_AIE_INSTALL_DIR}/bin/aiecc.py") + +# Deeploy-generated sources +set(NETWORK_MLIR "${GENERATED_SOURCE}/network.mlir") + +message(STATUS "[XDNA2] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] MLIR_AIE_INSTALL_DIR = ${MLIR_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] XRT_INSTALL_DIR = ${XRT_INSTALL_DIR}") +message(STATUS "[XDNA2] GENERATED_SOURCE = ${GENERATED_SOURCE}") +message(STATUS "[XDNA2] TESTNAME = ${TESTNAME}") + +# --------------------------------------------------------------------------- +# Step 1: Compile MLIR -> xclbin + npu_insts.bin +# --------------------------------------------------------------------------- +set(XCLBIN "${CMAKE_CURRENT_BINARY_DIR}/network.xclbin") +set(NPU_INSTS "${CMAKE_CURRENT_BINARY_DIR}/npu_insts.bin") + +add_custom_command( + OUTPUT "${XCLBIN}" "${NPU_INSTS}" + # Copy kernel objects into aiecc.py working dir so the linker scripts + # generated by aiecc.py can find them via INPUT(kernel.o). 
+ COMMAND ${CMAKE_COMMAND} -E copy ${XDNA2_KERNEL_OBJECTS} "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${CMAKE_COMMAND} -E env + "PATH=${MLIR_AIE_INSTALL_DIR}/bin:$ENV{PATH}" + "python" "${AIECC_PY}" + --no-aiesim + --no-xchesscc + --no-xbridge + --peano "${LLVM_AIE_INSTALL_DIR}" + --aie-generate-cdo + --aie-generate-npu-insts + --npu-insts-name npu_insts.bin + --aie-generate-xclbin + --dump-intermediates + --xclbin-kernel-name=MLIR_AIE + --xclbin-name network.xclbin + "${NETWORK_MLIR}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS "${NETWORK_MLIR}" ${XDNA2_KERNEL_OBJECTS} xdna2_kernels + COMMENT "[XDNA2] Compiling MLIR -> network.xclbin + npu_insts.bin" + VERBATIM +) +add_custom_target(xdna2_xclbin DEPENDS "${XCLBIN}" "${NPU_INSTS}") + +# --------------------------------------------------------------------------- +# Step 2: Compile XRT host binary +# --------------------------------------------------------------------------- +add_executable("${TESTNAME}" + "${CMAKE_CURRENT_LIST_DIR}/main.cpp" +) + +target_include_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/include" + "${GENERATED_SOURCE}" +) + +target_link_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/lib" +) + +target_link_libraries("${TESTNAME}" PRIVATE + xrt_coreutil + uuid + dl + pthread +) + +target_compile_features("${TESTNAME}" PRIVATE cxx_std_17) + +# The xclbin and npu_insts must be available at runtime in the same directory +# as the binary. Add a post-build step to copy them. 
+add_custom_command(TARGET "${TESTNAME}" POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${XCLBIN}" "$<TARGET_FILE_DIR:${TESTNAME}>/network.xclbin" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${NPU_INSTS}" "$<TARGET_FILE_DIR:${TESTNAME}>/npu_insts.bin" + COMMENT "[XDNA2] Copying xclbin and npu_insts to binary directory" +) + +add_dependencies("${TESTNAME}" xdna2_xclbin) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp new file mode 100644 index 0000000000..7fa3c4aefc --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -0,0 +1,255 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +// XRT C++ testbench for the XDNA2 (AIE2p) platform. +// Loads network.xclbin produced by aiecc.py, runs the MLIR_AIE kernel, +// reads back outputs and compares against golden reference values. +// Output format: "Errors: X out of Y" (required by output_parser.py). + +#include <cmath> +#include <cstdint> +#include <cstring> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <stdexcept> +#include <string> +#include <vector> + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_hw_context.h" +#include "xrt/xrt_kernel.h" + +// Generated by Deeploy's generateNetwork_xdna2.py: +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} +// defines testoutputs.h – uint16_t arrays of BF16 bit patterns + +// N_ELEMENTS_OUTPUT{i} defines +#include "testinputs.h" +#include "testoutputs.h" + +// --------------------------------------------------------------------------- +// BF16 helpers +// --------------------------------------------------------------------------- +static float bf16_to_float(uint16_t bf16) { + uint32_t f32_bits = static_cast<uint32_t>(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; +} + +static bool bf16_nearly_equal(uint16_t a, uint16_t b, float rtol = 0.0f, + float atol = 0.0f) { + // Default: allow 1 BF16 ULP difference to account for hardware rounding. 
+ // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, + static_cast<int>(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; +} + +// --------------------------------------------------------------------------- +// Read the NPU instruction binary produced by aiecc.py +// --------------------------------------------------------------------------- +static std::vector<uint32_t> read_instr_binary(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector<uint32_t> instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast<char *>(instr.data()), byte_size); + return instr; +} + +int main(int argc, char **argv) { + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." 
: argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; + } + } + if (argc >= 2 && argv[1][0] != '-') + xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') + instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector<uint32_t> instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. 
+ // ----------------------------------------------------------------------- + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs + // (with respect to the amount of bo available) + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, + // 3:in0, 4:in1, 5:out, 6:ctrlpkts, 7:trace) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // Control packets buffer (required by the kernel ABI) + auto bo_ctrlpkts = + xrt::bo(device, 8, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6)); + + // Trace buffer: allocated at 4x the requested size (hardware requirement). + // When TRACE_BUFFER_SIZE == 0 (no tracing), allocate a minimal 1-byte + // placeholder so the kernel call signature stays the same. + constexpr size_t trace_alloc = + TRACE_BUFFER_SIZE > 0 ? 
TRACE_BUFFER_SIZE * 4 : 1; + auto bo_trace = + xrt::bo(device, trace_alloc, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7)); + + // Zero-initialise trace buffer + if constexpr (TRACE_BUFFER_SIZE > 0) { + std::memset(bo_trace.map(), 0, trace_alloc); + bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), + n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + // JUNGVI: TODO: Collect runtime and display it + // JUNGVI: TODO: Enable warmup iterations + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast<uint32_t>(n_instr), bo_in0, + bo_in1, bo_out, bo_ctrlpkts, bo_trace); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map<uint16_t *>(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) << " (0x" << std::hex + << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) << " (0x" + << std::hex << golden_out[i] << std::dec << ")" + << " diff=" + << std::fabs(bf16_to_float(hw_out[i]) - + bf16_to_float(golden_out[i])) + << "\n"; + } + } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") << "\n"; + } + } + + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + + // ----------------------------------------------------------------------- + // 8. 
Read back trace data and write to trace.txt + // ----------------------------------------------------------------------- + if constexpr (TRACE_BUFFER_SIZE > 0) { + bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint32_t *trace_data = bo_trace.map<uint32_t *>(); + size_t trace_words = TRACE_BUFFER_SIZE / sizeof(uint32_t); + + std::string trace_path = bin_dir + "/trace.txt"; + std::ofstream trace_file(trace_path); + if (trace_file.is_open()) { + for (size_t i = 0; i < trace_words; ++i) { + if (trace_data[i] != 0) { + trace_file << std::hex << std::setfill('0') << std::setw(8) + << trace_data[i] << "\n"; + } + } + trace_file.close(); + std::cout << "Trace written to " << trace_path << "\n"; + } else { + std::cerr << "Warning: could not open " << trace_path << " for writing\n"; + } + } + + return (errors == 0) ? 0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz new file mode 100644 index 0000000000..816afb2bc8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx b/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx new file mode 100644 index 0000000000..acefe12a69 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz new file mode 100644 index 0000000000..1304bf2845 Binary files /dev/null and b/DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz differ diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index c7077067d9..e37ddcf99b 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -68,6 +68,7 @@ def pytest_configure(config: pytest.Config) -> None: "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 
platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") + config.addinivalue_line("markers", "xdna2: mark test as an XDNA2 (AIE2p) platform test") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") config.addinivalue_line("markers", "models: mark test as a model test (full networks)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py new file mode 100644 index 0000000000..a8f6b78694 --- /dev/null +++ b/DeeployTest/deeployRunner_xdna2.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. + +Usage (from DeeployTest/): + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular [--skipsim] [-v] + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular --trace [--trace-buffer-size 16384] +""" + +import json +import os +import sys +from glob import glob + +from testUtils.deeployRunner import main + + +def _add_xdna2_args(parser): + """Register XDNA2-specific CLI arguments.""" + parser.add_argument('--trace', + action = 'store_true', + default = False, + help = 'Enable execution tracing in the generated MLIR') + parser.add_argument('--trace-buffer-size', + type = int, + default = 8192, + help = 'Trace buffer size in bytes (default: 8192)') + + +def _add_xdna2_gen_args(args, gen_args_list): + """Forward XDNA2-specific arguments to the generation script.""" + if getattr(args, 'trace', False): + gen_args_list.append('--trace') + trace_buffer_size = getattr(args, 'trace_buffer_size', 8192) + if trace_buffer_size != 8192: + gen_args_list.append(f'--trace-buffer-size={trace_buffer_size}') + + +def _xdna2_post_sim(config, result, args): + """Parse trace.txt 
into a Perfetto-compatible trace.json after simulation.""" + if not getattr(args, 'trace', False): + return + + build_dir = config.build_dir + trace_txt = os.path.join(build_dir, "bin", "trace.txt") + if not os.path.isfile(trace_txt): + print(f"Warning: --trace enabled but {trace_txt} not found; skipping trace parsing.") + return + + # Find the MLIR with lowered NpuWrite32 ops (trace event register config). + # aiecc.py produces this when invoked with --dump-intermediates. + prj_pattern = os.path.join(build_dir, "DeeployTest", "Platforms", "XDNA2", "network.mlir.prj", + "main_physical_with_elfs.mlir") + candidates = glob(prj_pattern) + if not candidates: + print(f"Warning: lowered MLIR not found at {prj_pattern}; skipping trace parsing.") + return + lowered_mlir = candidates[0] + + trace_json = os.path.join(build_dir, "bin", "trace.json") + + try: + from aie.utils.trace.parse import align_column_start_index, check_for_valid_trace, convert_commands_to_json, \ + convert_to_byte_stream, convert_to_commands, parse_mlir_trace_events, setup_trace_metadata, \ + trace_pkts_de_interleave, trim_trace_pkts + + with open(trace_txt, "r") as f: + trace_pkts = f.read().split("\n") + + with open(lowered_mlir, "r") as f: + mlir_str = f.read() + + pid_events, events_module = parse_mlir_trace_events(mlir_str) + + if not check_for_valid_trace(trace_txt, trace_pkts): + print(f"Warning: trace data in {trace_txt} appears invalid; skipping trace parsing.") + return + + trimmed = trim_trace_pkts(trace_pkts) + sorted_pkts = trace_pkts_de_interleave(trimmed) + byte_streams = convert_to_byte_stream(sorted_pkts) + commands = convert_to_commands(byte_streams, False) + + pid_events = align_column_start_index(pid_events, commands) + + trace_events = [] + setup_trace_metadata(trace_events, pid_events, events_module) + convert_commands_to_json(trace_events, commands, pid_events, events_module) + + with open(trace_json, "w") as f: + json.dump(trace_events, f) + + print(f"Trace parsed: {trace_json} 
({len(trace_events)} events)") + + except SystemExit: + print(f"Warning: trace parsing failed (mlir-aie parser error). " + f"Ensure the build was done with --trace enabled.") + except Exception as e: + print(f"Warning: trace parsing failed: {e}") + + +if __name__ == '__main__': + sys.exit( + main(default_platform = "XDNA2", + default_simulator = "host", + tiling_enabled = True, + parser_setup_callback = _add_xdna2_args, + gen_args_callback = _add_xdna2_gen_args, + post_sim_callback = _xdna2_post_sim)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py new file mode 100644 index 0000000000..789d37716c --- /dev/null +++ b/DeeployTest/generateNetwork_xdna2.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""XDNA2 network generation script. + +Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. +Instead of emitting C code it: + +1. Loads the ONNX model and npz test-data. +2. Prepares the XDNA2Deployer (type checking + graph binding). +3. Emits ``testinputs.h`` and ``testoutputs.h`` with raw BF16 uint16_t arrays. +4. Calls ``deployer.generateMLIR()`` and writes ``network.mlir``. 
+""" + +import os + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.platformMapping import mapDeployer +from testUtils.testRunner import TestGeneratorArgumentParser + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, XDNA2AIECoreEngine, XDNA2TilingMapping +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def _tilingScheduler(graph: gs.Graph): + return [[node] for node in graph.nodes] + + +def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: + """Convert a float32 numpy array to an array of BF16 bit patterns (uint16_t). + + Uses round-to-nearest-even (the standard IEEE 754 rounding mode). + """ + f32 = arr.astype(np.float32) + raw = f32.view(np.uint32) + # Standard round-to-nearest-even: add 0x7FFF + BF16_LSB to the full word, + # then truncate. The 0x7FFF biases values just below the midpoint to + # round down, while adding the BF16 LSB provides tie-breaking to even. 
+ bf16_lsb = (raw >> 16) & 1 + raw = raw + np.uint32(0x7FFF) + bf16_lsb + bf16 = (raw >> 16).astype(np.uint16) + return bf16 + + +def _bf16_to_float32(bf16: np.ndarray) -> np.ndarray: + """Convert an array of BF16 uint16 bit patterns back to float32.""" + f32_bits = bf16.astype(np.uint32) << 16 + return f32_bits.view(np.float32) + + +def _generate_xdna2_inputs_header(input_arrays: list) -> str: + """Generate testinputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include <stdint.h>") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(input_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testInputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_INPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testInputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def _generate_xdna2_outputs_header(output_arrays: list) -> str: + """Generate testoutputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include <stdint.h>") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(output_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = 
f"testOutputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_OUTPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testOutputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def generateNetworkXDNA2(args): + log.debug("Arguments: %s", args) + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs_npz = np.load(f'{args.dir}/inputs.npz') + outputs_npz = np.load(f'{args.dir}/outputs.npz') + + test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] + test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] + + inputTypes = {} + inputOffsets = {} + + for index, (name, values) in enumerate(zip(inputs_npz.files, test_inputs_f32)): + if np.prod(values.shape) == 0: + continue + # Force bfloat16_t — BF16 test data stored as float32 in npz would be + # inferred as float32_t by minimalFloatType, but the XDNA2 kernel + # requires bfloat16_t inputs. + # JUNGVI: TODO: Align minimalFloatType to properly handle bf16 and don't force types. 
+ inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) + inputOffsets[f"input_{index}"] = 0 + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + # JUNGVI: TODO: Extend with the whole NPU array + # Define memory hierarchy: L1 (AIE core local) and L3 (shared) + l1_size = int(getattr(args, 'l1', None) or 64000) # 64KB default + l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default + + log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") + + l1_level = MemoryLevel("L1", neighbourNames = ["L3"], size = l1_size) + l3_level = MemoryLevel("L3", neighbourNames = ["L1"], size = l3_size) + memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) + memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 + + # Create memory-aware platform with AIE core engines + mem_platform = MemoryXDNA2Platform( + memoryHierarchy = memory_hierarchy, + defaultTargetMemoryLevel = l1_level, + engines = [XDNA2AIECoreEngine(Mapping = XDNA2TilingMapping, preferredMemoryLevel = "L1")]) + + # Create base deployer with memory platform + deployer = mapDeployer(mem_platform, + graph, + inputTypes, + scheduler = _tilingScheduler, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) + + # Wrap with MemoryDeployerWrapper (adds memory level annotation) + deployer = MemoryDeployerWrapper(deployer) + + # Wrap with TilerDeployerWrapper (adds tiling) + deployer = TilerDeployerWrapper(deployer, workDir = _DEEPLOYSTATEDIR) + + # Enable tracing if requested + enableTrace = getattr(args, 'trace', False) + if enableTrace: + traceBufferSize = int(getattr(args, 'trace_buffer_size', None) or 8192) + deployer.enableTrace = True + deployer.traceBufferSize = traceBufferSize + log.info(f"[XDNA2] Tracing enabled (buffer_size={traceBufferSize})") + + # frontEnd() parses the graph; bind() triggers tiling via wrappers + deployer.frontEnd() + deployer.bind() + deployer.prepared = True + log.info("[XDNA2] Tiling completed, 
proceeding with MLIR generation") + + # Create output directory + os.makedirs(args.dumpdir, exist_ok = True) + + # Write testinputs.h (raw BF16 bit patterns as uint16_t) + testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) + # Append trace buffer size define so the host binary knows whether to + # allocate a trace buffer and how large it should be. + if enableTrace: + testInputStr += f"#define TRACE_BUFFER_SIZE {traceBufferSize}u\n" + else: + testInputStr += "#define TRACE_BUFFER_SIZE 0u\n" + with open(f'{args.dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # JUNGVI: TODO: Move this in ONNX4Deeploy + # Recompute golden outputs from the actual BF16 inputs the hardware will + # see. The original outputs.npz may have been computed in float32 + # precision, which can differ by several BF16 ULPs. + bf16_inputs = [_float32_to_bf16_uint16(a.flatten()) for a in test_inputs_f32] + bf16_input_f32 = [_bf16_to_float32(b) for b in bf16_inputs] + golden_f32 = bf16_input_f32[0] + for inp in bf16_input_f32[1:]: + golden_f32 = golden_f32 + inp + test_outputs_bf16 = [golden_f32.reshape(arr.shape) for arr in test_outputs_f32] + + # Write testoutputs.h (raw BF16 bit patterns as uint16_t) + testOutputStr = _generate_xdna2_outputs_header(test_outputs_bf16) + with open(f'{args.dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # Write network.mlir + mlir_str = deployer.generateMLIR() + with open(f'{args.dumpdir}/network.mlir', 'w') as f: + f.write(mlir_str) + + log.info(f"[XDNA2] Generated: testinputs.h, testoutputs.h, network.mlir -> {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(tiling_arguments = True, + description = "Deeploy XDNA2 Code Generation Utility.") + parser.add_argument('--trace', + action = 'store_true', + default = False, + help = 'Enable execution tracing in the generated MLIR') + parser.add_argument('--trace-buffer-size', + type = int, + default = 8192, + help = 'Trace buffer size in bytes 
(default: 8192)') + args, _ = parser.parse_known_args() + + if args.platform != 'XDNA2': + parser.error(f"This script is for the XDNA2 platform. Got: {args.platform}") + + generateNetworkXDNA2(args) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..a259c93ad7 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -27,7 +27,9 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent - if config.tiling: + if config.platform == "XDNA2": + generation_script = script_dir / "generateNetwork_xdna2.py" + elif config.tiling: generation_script = script_dir / "testMVP.py" else: generation_script = script_dir / "generateNetwork.py" @@ -166,6 +168,9 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: # Run binary directly binary_path = Path(config.build_dir) / "bin" / config.test_name cmd = [str(binary_path)] + # Propagate verbosity to the host binary (e.g. 
XDNA2 main.cpp uses -v) + if config.verbose >= 1: + cmd.append("-v") else: # Run via CMake target cmake_cmd = os.environ.get("CMAKE", "cmake") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..87a6db3c0b 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -139,6 +139,12 @@ def __init__(self, type = int, default = 1024000, help = 'L2 size in bytes\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'L3 size in bytes\n') self.add_argument('--randomizedMemoryScheduler', action = "store_true", help = 'Enable randomized memory scheduler\n') @@ -181,7 +187,8 @@ def create_config_from_args(args: argparse.Namespace, platform: str, simulator: str, tiling: bool, - platform_specific_cmake_args: Optional[list] = None) -> DeeployTestConfig: + platform_specific_cmake_args: Optional[list] = None, + gen_args_callback = None) -> DeeployTestConfig: script_path = Path(__file__).resolve() base_dir = script_path.parent.parent @@ -221,6 +228,8 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append(f"--l1={args.l1}") if hasattr(args, 'l2') and args.l2 and args.l2 != 1024000: gen_args_list.append(f"--l2={args.l2}") + if hasattr(args, 'l3') and args.l3: + gen_args_list.append(f"--l3={args.l3}") if hasattr(args, 'randomizedMemoryScheduler') and args.randomizedMemoryScheduler: gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: @@ -235,6 +244,10 @@ def create_config_from_args(args: argparse.Namespace, if not tiling and getattr(args, 'profileUntiled', False): gen_args_list.append("--profileUntiled") + # Allow platform-specific runners to append their own generation args + if gen_args_callback: + gen_args_callback(args, gen_args_list) + config = DeeployTestConfig( test_name = test_name, test_dir = test_dir_abs, @@ -313,7 +326,9 @@ def 
main(default_platform: Optional[str] = None, tiling_enabled: bool = False, platform_specific_cmake_args: Optional[list] = None, parsed_args: Optional[argparse.Namespace] = None, - parser_setup_callback = None): + parser_setup_callback = None, + gen_args_callback = None, + post_sim_callback = None): """ Main entry point for Deeploy test runners. @@ -324,6 +339,10 @@ def main(default_platform: Optional[str] = None, platform_specific_cmake_args: Additional CMake arguments for platform-specific configurations parsed_args: Pre-parsed arguments (if None, will parse from sys.argv) parser_setup_callback: Optional callback to configure parser before parsing (receives parser as arg) + gen_args_callback: Optional callback ``(args, gen_args_list) -> None`` to append + platform-specific generation arguments after the base args are collected. + post_sim_callback: Optional callback ``(config, result, args) -> None`` invoked + after simulation completes (e.g. for trace post-processing). """ if parsed_args is None: @@ -348,6 +367,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "xdna2": "XDNA2", } if args.platform: @@ -405,13 +425,17 @@ def main(default_platform: Optional[str] = None, if hasattr(args, 'num_clusters'): platform_specific_cmake_args.append(f"-DNUM_CLUSTERS={args.num_clusters}") - config = create_config_from_args(args, platform, simulator, tiling_enabled, platform_specific_cmake_args) + config = create_config_from_args(args, platform, simulator, tiling_enabled, platform_specific_cmake_args, + gen_args_callback) print_configuration(config) try: result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + if post_sim_callback and not args.skipsim: + post_sim_callback(config, result, args) + print_colored_result(result, config.test_name) return 0 if result.success else 1 diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 
9d526906f9..9155ed77ae 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -31,7 +31,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +76,10 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "XDNA2": + from Deeploy.Targets.XDNA2.Platform import XDNA2Platform + Platform = XDNA2Platform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -274,6 +278,30 @@ def mapDeployer(platform: DeploymentPlatform, deeployStateDir = deeployStateDir) else: - raise RuntimeError(f"Deployer for platform {platform} is not implemented") + # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms + try: + from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer + from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, \ + XDNA2Optimizer, XDNA2Platform + except ImportError: + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if not isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if loweringOptimizer is None: + loweringOptimizer = XDNA2Optimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = XDNA2Deployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = 
deeployStateDir) return deployer diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..e233cc9b1d 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -61,7 +61,7 @@ def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int class TestGeneratorArgumentParser(argparse.ArgumentParser): - def __init__(self, description = None): + def __init__(self, tiling_arguments: bool = False, description = None): formatter = _ArgumentDefaultMetavarTypeFormatter @@ -70,6 +70,8 @@ def __init__(self, description = None): else: super().__init__(description = description, formatter_class = formatter) + self.tiling_arguments = tiling_arguments + self.add_argument('-t', metavar = '', dest = 'dir', @@ -90,6 +92,27 @@ def __init__(self, description = None): help = 'Set the output dump folder\n') self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n') + # Tiling-related arguments (for XDNA2 and other tiled platforms) + if self.tiling_arguments: + self.add_argument('--l1', + metavar = '', + dest = 'l1', + type = int, + default = None, + help = 'Set L1 memory size in bytes (enables tiling if specified).\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'Set L3 memory size in bytes.\n') + self.add_argument('--defaultMemLevel', + metavar = '', + dest = 'defaultMemLevel', + type = str, + default = "L3", + help = 'Set default memory level (default: L3)\n') + self.args = None def parse_args(self, args = None, namespace = None) -> argparse.Namespace: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..dca5c7b7cc 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -42,6 +42,7 @@ from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS from test_softhier_config import KERNEL_TESTS as 
SOFTHIER_KERNEL_TESTS from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS +from test_xdna2_config import KERNEL_TESTS as XDNA2_KERNEL_TESTS from testUtils.pytestRunner import create_test_config, run_and_assert_test @@ -117,6 +118,11 @@ def param_id(param): "model_tests": GAP9_MODEL_TESTS, "default_num_cores": GAP9_DEFAULT_NUM_CORES, }, + "xdna2": { + "platform": "XDNA2", + "simulator": "host", + "kernel_tests": XDNA2_KERNEL_TESTS, + }, } ### Markers summary ### @@ -987,3 +993,21 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.xdna2 +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", XDNA2_KERNEL_TESTS, ids = XDNA2_KERNEL_TESTS) +def test_xdna2_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["xdna2"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_xdna2_config.py b/DeeployTest/test_xdna2_config.py new file mode 100644 index 0000000000..7988aa09b1 --- /dev/null +++ b/DeeployTest/test_xdna2_config.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# Test list for the XDNA2 platform. +# Each entry is a relative path under DeeployTest/Tests/. 
+ +KERNEL_TESTS = [ + "Kernels/BF16/Add/Regular", +] diff --git a/README_XDNA.md b/README_XDNA.md new file mode 100644 index 0000000000..56cfcb1225 --- /dev/null +++ b/README_XDNA.md @@ -0,0 +1,51 @@ +# How to use Deeploy on the XDNA2 NPU + +A dockerfile containing everything required to run on XDNA2 is available to build with the dockerfile at `Container/Dockerfile.deeploy-xdna`. + +You can build it locally on Ubuntu 24.04 with: +``` +docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . +``` + +You need to have XRT installed on your host, once installed it is present in `/opt/xilinx/xrt`. You can run the docker container previously built with: +``` +docker run -it \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v "$(pwd)":/app/Deeploy \ + -v /opt/xilinx:/opt/xilinx \ + --name deeploy_dev \ + deeploy-xdna:local +``` + +Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. + +Once the container is started you can run a simple Add node, from ONNX to execution with: +``` +pip install -e ./ && \ +cd DeeployTest && \ +python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ +``` + +## CI with a Self-Hosted Runner + +XDNA2 tests run on a self-hosted GitHub Actions runner with NPU access. +The Docker image is built locally on the runner (not distributed via GHCR). + +### One-time setup on the runner machine + +1. Build the Docker image: + ``` + docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . + ``` + +2. Register the GitHub Actions runner (Settings → Actions → Runners → New self-hosted runner). + Use the label **`xdna2-npu`** and install as a service: + ``` + ./svc.sh install && ./svc.sh start + ``` + +3. Make sure the runner user has access to `/dev/accel/accel0` (e.g. is in the `render` group). 
+ +Once the runner is registered, pushes and PRs automatically trigger the +`CI • XDNA2` workflow defined in `.github/workflows/ci-platform-xdna2.yml`. \ No newline at end of file diff --git a/TargetLibraries/XDNA2/CMakeLists.txt b/TargetLibraries/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..c2e1ffdecd --- /dev/null +++ b/TargetLibraries/XDNA2/CMakeLists.txt @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) kernel library +# +# Compiles AIE C++ kernels using the llvm-aie (Peano) cross-compiler. +# Exports a CMake target `xdna2_kernels` that other targets can depend on, +# and sets XDNA2_KERNEL_OBJECTS in the parent scope. +# --------------------------------------------------------------------------- + +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- Resolve llvm-aie (Peano) install dir --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) +endif() +if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. 
" + "Please set the environment variable LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") +endif() + +# --- Resolve mlir-aie include dir (aie_api headers) --- +if(NOT MLIR_AIE_INCLUDE_DIR) + if(DEFINED ENV{MLIR_AIE_INCLUDE_DIR}) + set(MLIR_AIE_INCLUDE_DIR $ENV{MLIR_AIE_INCLUDE_DIR}) + else() + execute_process( + COMMAND ${Python3_EXECUTABLE} + -c "import aie.utils.config; print(aie.utils.config.cxx_header_path());" + OUTPUT_VARIABLE MLIR_AIE_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _aie_cfg_result + ) + if(NOT _aie_cfg_result EQUAL 0 OR NOT MLIR_AIE_INCLUDE_DIR) + message(FATAL_ERROR "[XDNA2] Could not query aie.utils.config.cxx_header_path(). " + "Please set the environment variable MLIR_AIE_INCLUDE_DIR or install the mlir-aie wheel.") + endif() + endif() +endif() + +set(LLVM_AIE_CLANG "${LLVM_AIE_INSTALL_DIR}/bin/clang++") + +message(STATUS "[XDNA2 Kernels] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2 Kernels] MLIR_AIE_INCLUDE_DIR = ${MLIR_AIE_INCLUDE_DIR}") + +# --------------------------------------------------------------------------- +# Compile AIE kernels +# --------------------------------------------------------------------------- +file(GLOB XDNA2_KERNEL_SOURCES "${CMAKE_CURRENT_LIST_DIR}/kernels/*.cc") + +set(XDNA2_KERNEL_OBJECTS "") + +foreach(KERNEL_SRC ${XDNA2_KERNEL_SOURCES}) + get_filename_component(KERNEL_NAME ${KERNEL_SRC} NAME_WE) + set(KERNEL_OBJ "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.o") + + add_custom_command( + OUTPUT "${KERNEL_OBJ}" + COMMAND "${LLVM_AIE_CLANG}" + --target=aie2p-none-unknown-elf + "-I${MLIR_AIE_INCLUDE_DIR}" + -std=c++20 + -Wno-parentheses + -Wno-attributes + -Wno-macro-redefined + -Wno-empty-body + -O2 + -DNDEBUG + -c "${KERNEL_SRC}" + -o "${KERNEL_OBJ}" + DEPENDS "${KERNEL_SRC}" + COMMENT "[XDNA2] Compiling AIE kernel: ${KERNEL_NAME}.cc -> ${KERNEL_NAME}.o" + VERBATIM + ) + + list(APPEND XDNA2_KERNEL_OBJECTS "${KERNEL_OBJ}") +endforeach() + 
+add_custom_target(xdna2_kernels DEPENDS ${XDNA2_KERNEL_OBJECTS}) + +# Export kernel objects to parent scope so the testbench CMake can use them +set(XDNA2_KERNEL_OBJECTS "${XDNA2_KERNEL_OBJECTS}" PARENT_SCOPE) diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc new file mode 100644 index 0000000000..13b8b54637 --- /dev/null +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All +// rights reserved. SPDX-License-Identifier: Apache-2.0 + +#define NOCPP + +#include +#include +#include +#include +#include +#include + +template +void eltwise_add(T_in *a, T_in *b, T_out *c, int size) { + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } +} + +template +void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) { + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); +} + +extern "C" { + +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_add(a_in, b_in, c_out, size); +} + +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_vadd(a_in, b_in, c_out, size); +} + +} // extern "C" diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst index 0b3d97c761..c957bb22b2 100644 --- a/docs/tutorials/overview.rst +++ b/docs/tutorials/overview.rst @@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t introduction debugging + xdna2_tracing diff --git a/docs/tutorials/xdna2_tracing.md 
b/docs/tutorials/xdna2_tracing.md new file mode 100644 index 0000000000..d1b4bcb4e8 --- /dev/null +++ b/docs/tutorials/xdna2_tracing.md @@ -0,0 +1,68 @@ + + +# XDNA2 Execution Tracing + +The XDNA2 backend supports optional AIE execution tracing. +When enabled, the generated MLIR includes trace configuration for both **core +events** (instruction execution, stalls, port activity) and **memory events** +(DMA start/finish/starvation). +After execution on the NPU, the raw trace data is parsed into a JSON file +that can be visualised in [Perfetto](https://ui.perfetto.dev/). + +## Quick Start + +From the `DeeployTest/` directory: + +```bash +# Generate, build, run on the NPU, and parse the trace in one step: +python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular --trace + +# With a custom trace buffer size (default: 8192 bytes): +python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular \ + --trace --trace-buffer-size 16384 +``` + +After execution two files are produced next to the test binary +(inside `TEST_XDNA2/build_master/bin/`): + +| File | Description | +|------|-------------| +| `trace.txt` | Raw hex trace words read back from the NPU | +| `trace.json` | Parsed trace in [Chrome Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU) | + +Open `trace.json` in [Perfetto UI](https://ui.perfetto.dev/) to visualise +core and memory trace timelines. + +## How It Works + +Enabling `--trace` triggers three additional MLIR code-transformation passes +during code generation: + +1. **`MLIRCoreTracePass`** — Emits an `aie.trace` block on the compute tile + configured for 8 core events (vector instructions, stalls, port activity) + with packet-based routing. +2. **`MLIRMemTracePass`** — Emits a second `aie.trace` block for 8 memory/DMA + events (S2MM/MM2S start, finish, starvation) with event-based + start/stop synchronised to the core trace via broadcast signals. +3. 
**`MLIRTraceRuntimePass`** — Adds `trace_host_config` and + `trace_start_config` calls to the runtime sequence to activate the + configured traces at execution time. + +On the host side, the XRT testbench (`main.cpp`) allocates a trace buffer +object, passes it as kernel argument 7, and writes the data back to `trace.txt` after execution. + +The post-simulation callback in `deeployRunner_xdna2.py` then invokes the +mlir-aie trace parser (`aie.utils.trace.parse`) against the lowered MLIR +(`main_physical_with_elfs.mlir`) to produce the final `trace.json`. + +## Traced Events + +- `INSTR_EVENT_0`: Emitted by the `event0();` call, usually called at the beginning of the kernels (see `TargetLibraries/XDNA2/kernels/add.cc`). +- `INSTR_EVENT_1`: Emitted by the `event1();` call, usually called at the end of the kernels. +- `INSTR_VECTOR`: Emitted every time the vector unit is used, can be useful to see how well the kernel is using the vector unit. +- `PORT_RUNNING_0`: Emitted when a DMA transfer is running on port 0. +- `PORT_RUNNING_1`: Emitted when a DMA transfer is running on port 1. diff --git a/requirements-xdna.txt b/requirements-xdna.txt new file mode 100644 index 0000000000..3cd66d39c8 --- /dev/null +++ b/requirements-xdna.txt @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-3 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie +llvm-aie