From 05b9404760fe8e6584fcd395d10ad635b9a82efd Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 12 Mar 2026 15:51:43 +0100 Subject: [PATCH 01/16] Update gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d9e4faace3..7ffc9ca243 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,6 @@ CHANGELOG_GEN.md # Container Artifacts .pyusbip/ .cache/ + +# Claude context file +CLAUDE.md \ No newline at end of file From 5615ed49f3c17ddcd3c7b568d561c7f880179290 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 12 Mar 2026 18:02:54 +0100 Subject: [PATCH 02/16] XDNA2 Platform Beta Support --- CMakeLists.txt | 20 ++ Deeploy/Targets/XDNA2/Bindings.py | 22 ++ Deeploy/Targets/XDNA2/Deployer.py | 204 ++++++++++++++++++ Deeploy/Targets/XDNA2/Parsers.py | 6 + Deeploy/Targets/XDNA2/Platform.py | 69 ++++++ .../Targets/XDNA2/Templates/AddTemplate.py | 81 +++++++ Deeploy/Targets/XDNA2/Templates/__init__.py | 3 + Deeploy/Targets/XDNA2/TypeCheckers.py | 29 +++ DeeployTest/Platforms/XDNA2/CMakeLists.txt | 149 +++++++++++++ DeeployTest/Platforms/XDNA2/main.cpp | 194 +++++++++++++++++ .../Tests/Kernels/BF16/Add/Regular/inputs.npz | Bin 0 -> 8706 bytes .../Kernels/BF16/Add/Regular/network.onnx | Bin 0 -> 128 bytes .../Kernels/BF16/Add/Regular/outputs.npz | Bin 0 -> 4366 bytes DeeployTest/conftest.py | 1 + DeeployTest/deeployRunner_xdna2.py | 17 ++ DeeployTest/generateNetwork_xdna2.py | 189 ++++++++++++++++ DeeployTest/testUtils/core/execution.py | 5 + DeeployTest/testUtils/deeployRunner.py | 1 + DeeployTest/testUtils/platformMapping.py | 23 +- DeeployTest/test_platforms.py | 24 +++ DeeployTest/test_xdna2_config.py | 10 + TargetLibraries/XDNA2/CMakeLists.txt | 90 ++++++++ TargetLibraries/XDNA2/kernels/add.cc | 54 +++++ requirements-dev.txt | 7 + 24 files changed, 1197 insertions(+), 1 deletion(-) create mode 100644 Deeploy/Targets/XDNA2/Bindings.py create mode 100644 Deeploy/Targets/XDNA2/Deployer.py create mode 100644 
Deeploy/Targets/XDNA2/Parsers.py create mode 100644 Deeploy/Targets/XDNA2/Platform.py create mode 100644 Deeploy/Targets/XDNA2/Templates/AddTemplate.py create mode 100644 Deeploy/Targets/XDNA2/Templates/__init__.py create mode 100644 Deeploy/Targets/XDNA2/TypeCheckers.py create mode 100644 DeeployTest/Platforms/XDNA2/CMakeLists.txt create mode 100644 DeeployTest/Platforms/XDNA2/main.cpp create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/network.onnx create mode 100644 DeeployTest/Tests/Kernels/BF16/Add/Regular/outputs.npz create mode 100644 DeeployTest/deeployRunner_xdna2.py create mode 100644 DeeployTest/generateNetwork_xdna2.py create mode 100644 DeeployTest/test_xdna2_config.py create mode 100644 TargetLibraries/XDNA2/CMakeLists.txt create mode 100644 TargetLibraries/XDNA2/kernels/add.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..8c23ccca7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL XDNA2) + message(STATUS "Building for platform 'XDNA2'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,23 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL XDNA2) + + project(${TESTNAME} LANGUAGES CXX) + + message(STATUS "============================= XDNA2 Configuration ============================") + message(STATUS "[cMake ] GENERATED_SOURCE = " ${GENERATED_SOURCE}) + message(STATUS "[cMake ] TESTNAME = " ${TESTNAME}) + message(STATUS "==============================================================================") + message(STATUS "") + + # XDNA2 uses its own CMakeLists.txt in DeeployTest/Platforms/XDNA2/ + # which handles the two-step build: xclbin -> host binary. 
+ # AIE kernel compilation is in TargetLibraries/XDNA2/. + add_subdirectory(TargetLibraries/XDNA2) + add_subdirectory(DeeployTest/Platforms/XDNA2) + +endif() + print_simulation_config() diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py new file mode 100644 index 0000000000..68d7672787 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.Targets.XDNA2.Templates import AddTemplate +from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker + +# XDNA2 does not use the standard C code transformation pipeline. +# The deployer generates a holistic MLIR module, not per-node C snippets. +# An empty CodeTransformation is used as a placeholder. 
+XDNA2Transformer = CodeTransformation([]) + +XDNA2AddBindings = [ + NodeBinding( + XDNA2AddChecker([PointerClass(bfloat16_t), PointerClass(bfloat16_t)], [PointerClass(bfloat16_t)]), + AddTemplate.referenceTemplate, + XDNA2Transformer, + ) +] diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py new file mode 100644 index 0000000000..7aa77668eb --- /dev/null +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess +import tempfile +from typing import Callable, Dict, Optional, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Targets.XDNA2.Templates.AddTemplate import XDNA2NodeTemplate + +# JUNGVI: Will be removed once Deeploy generates it's own MLIR + +# Default path to the mlir-aie Python environment. +# Can be overridden via the MLIR_AIE_PYTHON env variable. +_DEFAULT_IRON_PYTHON = os.environ.get( + "MLIR_AIE_PYTHON", + "/scratch/jungvi/micromamba/envs/iron/bin/python", +) + +# Path to the IRON design scripts shipped with mlir-aie examples. +# Can be overridden via the IRON_OPERATORS_DIR env variable. +_DEFAULT_IRON_OPERATORS_DIR = os.environ.get( + "IRON_OPERATORS_DIR", + "/scratch/jungvi/IRON/iron/operators", +) + + +class XDNA2Deployer(SignPropDeployer): + """Deployer for the XDNA2 (AIE2p) platform. + + Unlike other Deeploy deployers that generate C code, this deployer + generates an mlir-aie MLIR module. The MLIR is produced by invoking the + IRON operator ``design.py`` scripts as subprocesses (using the mlir-aie + Python environment) so that the main Deeploy environment does not need to + have ``aie.iron`` installed. 
+ + It also writes ``testinputs.h`` and ``testoutputs.h`` via the XDNA2 + generation script so the XRT C++ testbench can be compiled against + known-good golden values. + """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first: bool = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Optional[Dict[str, int]] = None, + iron_python: Optional[str] = None, + iron_operators_dir: Optional[str] = None): + """ + Parameters + ---------- + iron_python : str, optional + Path to the Python interpreter in the mlir-aie (IRON) environment. + Defaults to ``MLIR_AIE_PYTHON`` env variable or + ``/scratch/jungvi/micromamba/envs/iron/bin/python``. + iron_operators_dir : str, optional + Path to the IRON operators directory containing per-operator + ``design.py`` scripts. + Defaults to ``IRON_OPERATORS_DIR`` env variable or + ``/scratch/jungvi/IRON/iron/operators``. + """ + super().__init__( + graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets if inputOffsets is not None else {}, + ) + self._iron_python = iron_python or _DEFAULT_IRON_PYTHON + self._iron_operators_dir = iron_operators_dir or _DEFAULT_IRON_OPERATORS_DIR + + # ------------------------------------------------------------------ + # MLIR generation + # ------------------------------------------------------------------ + + def generateMLIR(self) -> str: + """Generate an mlir-aie MLIR module for the prepared graph. + + Iterates over ``self.layerBinding``, extracts AIE parameters from each + bound template, and calls the corresponding IRON ``design.py`` script + as a subprocess. Currently only a single BF16 Add node is supported. 
+ + Returns + ------- + str + MLIR module string (ready to be written to ``network.mlir``). + + Raises + ------ + RuntimeError + If the graph contains unsupported operators or if the IRON + subprocess fails. + """ + assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" + + mlir_parts = [] + + for node_name, layer in self.layerBinding.items(): + mapper = layer.mapper + template = mapper.binder.template + op_repr = mapper.parser.operatorRepresentation + + if not isinstance(template, XDNA2NodeTemplate): + raise RuntimeError( + f"Node '{node_name}' has no XDNA2NodeTemplate — " + f"only BF16 Add is supported in this release.") + + aie_params = template.getAIEParams(op_repr) + log.info(f"[XDNA2] Generating MLIR for node '{node_name}' " + f"with params: {aie_params}") + + mlir_str = self._generate_add_mlir(aie_params) + mlir_parts.append(mlir_str) + + if not mlir_parts: + raise RuntimeError("No bound layers found in graph — cannot generate MLIR.") + + # For a single-node graph the MLIR is just the one module. + # Multi-node support would require merging modules. + return mlir_parts[0] + + def _generate_add_mlir(self, aie_params: dict) -> str: + """Call the IRON elementwise_add design.py to produce MLIR. + + Parameters + ---------- + aie_params : dict + Dict with keys: num_elements, n_cols, n_channels, tile_size, trace_size. + + Returns + ------- + str + MLIR module string. 
+ """ + design_script = os.path.join( + self._iron_operators_dir, "elementwise_add", "design.py" + ) + + if not os.path.isfile(design_script): + raise RuntimeError( + f"IRON design script not found: {design_script}\n" + f"Set IRON_OPERATORS_DIR to point to the IRON operators directory.") + + if not os.path.isfile(self._iron_python): + raise RuntimeError( + f"IRON Python interpreter not found: {self._iron_python}\n" + f"Set MLIR_AIE_PYTHON to the mlir-aie Python interpreter.") + + with tempfile.NamedTemporaryFile(suffix=".mlir", delete=False) as tmp: + output_path = tmp.name + + try: + cmd = [ + self._iron_python, + design_script, + "--dev", "npu2", + "--length", str(aie_params['num_elements']), + "--columns", str(aie_params['n_cols']), + "--channels", str(aie_params['n_channels']), + "--tile-size", str(aie_params['tile_size']), + "--trace-size", str(aie_params['trace_size']), + "--output-file-path", output_path, + ] + + log.debug(f"[XDNA2] Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + check=False, + capture_output=True, + text=True, + ) + + if result.returncode != 0: + raise RuntimeError( + f"IRON design.py failed (exit {result.returncode}):\n" + f" cmd: {' '.join(cmd)}\n" + f" stdout: {result.stdout}\n" + f" stderr: {result.stderr}") + + with open(output_path, 'r') as f: + mlir_str = f.read() + + finally: + if os.path.exists(output_path): + os.unlink(output_path) + + return mlir_str diff --git a/Deeploy/Targets/XDNA2/Parsers.py b/Deeploy/Targets/XDNA2/Parsers.py new file mode 100644 index 0000000000..c665312dbd --- /dev/null +++ b/Deeploy/Targets/XDNA2/Parsers.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# XDNA2 reuses the Generic AddParser (see Platform.py). +# Add any XDNA2-specific parsers here as the platform grows. 
diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py new file mode 100644 index 0000000000..82ef1ec3d2 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ + StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.Targets.Generic.Layers import AddLayer +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate +from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings + +XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) + +XDNA2Mapping = { + 'Add': AddLayer([XDNA2AddMapper]), +} + +# Buffer classes reuse Generic templates since XDNA2Deployer manages its own +# output format (MLIR + test headers) and these templates are never rendered. + + +class XDNA2VariableBuffer(VariableBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2TransientBuffer(TransientBuffer): + initTemplate = AllocateTemplate.referenceInitTemplate + allocTemplate = AllocateTemplate.referenceAllocateTemplate + deallocTemplate = FreeTemplate.referenceLocalTemplate + + +class XDNA2ConstantBuffer(ConstantBuffer): + initTemplate = AllocateTemplate.referenceGlobalInitTemplate + allocTemplate = AllocateTemplate.referenceGlobalAllocateTemplate + deallocTemplate = FreeTemplate.referenceGlobalTemplate + + +class XDNA2StructBuffer(StructBuffer): + initTemplate = AllocateTemplate.referenceStructInitTemplate + allocTemplate = AllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +# No topology optimization passes needed for the initial Add-only platform. 
+XDNA2Optimizer = TopologyOptimizer([], name = "XDNA2Optimizer") + + +class XDNA2Engine(DeploymentEngine): + + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", + includeList = None) -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + + +class XDNA2Platform(DeploymentPlatform): + + def __init__(self, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer): + if engines is None: + engines = [XDNA2Engine()] + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py new file mode 100644 index 0000000000..050413eedc --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + + +class XDNA2NodeTemplate(NodeTemplate): + """Base class for XDNA2 templates. + + Temporary Feature: + Unlike Mako-based templates for C code, XDNA2 templates do not produce + code snippets. Instead they store AIE kernel metadata that the + XDNA2Deployer reads when generating the holistic MLIR module. + """ + + def __init__(self, kernel_fn_name: str, kernel_obj: str, kernel_src: str, tile_size: int = 1024): + """Initialize an XDNA2NodeTemplate. + + Parameters + ---------- + kernel_fn_name : str + Name of the AIE C++ kernel function (e.g. "eltwise_add_bf16_vector"). + kernel_obj : str + Compiled kernel object file name (e.g. "add.o"). + kernel_src : str + Kernel source file name relative to TargetLibraries/XDNA2/kernels/ + (e.g. "add.cc"). + tile_size : int + Number of elements per tile (default 1024, max 4096). 
+ """ + # Empty Mako template — no C code is generated per node. + super().__init__("") + self.kernel_fn_name = kernel_fn_name + self.kernel_obj = kernel_obj + self.kernel_src = kernel_src + self.tile_size = tile_size + + def getAIEParams(self, operatorRepresentation: dict) -> dict: + """Return the aie.iron parameters for this node. + + Parameters + ---------- + operatorRepresentation : dict + The operator representation dict produced by the parser. + + Returns + ------- + dict + Parameters to pass to the corresponding aie.iron design function. + """ + raise NotImplementedError + + +class XDNA2AddTemplate(XDNA2NodeTemplate): + """XDNA2 template for BF16 elementwise Add.""" + + def __init__(self): + super().__init__( + kernel_fn_name = "eltwise_add_bf16_vector", + kernel_obj = "add.o", + kernel_src = "add.cc", + tile_size = 1024, + ) + + def getAIEParams(self, operatorRepresentation: dict) -> dict: + num_elements = int(operatorRepresentation['size']) + tile_size = min(num_elements, self.tile_size) + # Ensure num_elements is divisible by tile_size + if num_elements % tile_size != 0: + tile_size = 1 + return { + 'num_elements': num_elements, + 'n_cols': 1, + 'n_channels': 1, + 'tile_size': tile_size, + 'trace_size': 0, + } + + +referenceTemplate = XDNA2AddTemplate() diff --git a/Deeploy/Targets/XDNA2/Templates/__init__.py b/Deeploy/Targets/XDNA2/Templates/__init__.py new file mode 100644 index 0000000000..4694b67df5 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Templates/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/Deeploy/Targets/XDNA2/TypeCheckers.py b/Deeploy/Targets/XDNA2/TypeCheckers.py new file mode 100644 index 0000000000..cb9c98fd39 --- /dev/null +++ b/Deeploy/Targets/XDNA2/TypeCheckers.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, 
Sequence, Type + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer + + +class XDNA2AddChecker(SignPropTypeChecker): + """Type checker for BF16 elementwise Add on XDNA2. + + Both inputs and the output are bfloat16_t pointers. + """ + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: + # Float types do not have a meaningful nLevels — return 1 as a neutral value. + return [1] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: + # BF16 is a signed floating-point type. + return [True] diff --git a/DeeployTest/Platforms/XDNA2/CMakeLists.txt b/DeeployTest/Platforms/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..d017d7f22f --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/CMakeLists.txt @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) testbench CMake configuration +# +# Included via add_subdirectory() by the top-level CMakeLists.txt when +# -Dplatform=XDNA2 +# is passed. It orchestrates two build steps: +# +# 1. Compile network.mlir to network.xclbin + npu_insts.bin with aiecc.py. +# 2. Compile the XRT host binary (main.cpp) with the system compiler. +# +# AIE kernel compilation is handled by TargetLibraries/XDNA2/CMakeLists.txt. 
+# +# Required variables (set via environment or CMake cache): +# MLIR_AIE_INSTALL_DIR – path to the mlir-aie installation +# (auto-resolved from aie.utils.config or env) +# LLVM_AIE_INSTALL_DIR – path to the llvm-aie installation +# (auto-resolved from aie.utils.config or env) +# XRT_INSTALL_DIR – path to the XRT installation +# (default: $ENV{XILINX_XRT} or /opt/xilinx/xrt) +# GENERATED_SOURCE – directory containing network.mlir, testinputs.h, testoutputs.h +# (set by the Deeploy test runner) +# TESTNAME – name of the test target (set by the Deeploy test runner) +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Resolve toolchain and runtime paths +# --------------------------------------------------------------------------- +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- llvm-aie (Peano) install dir (needed for --peano flag) --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Set LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") + endif() +endif() + +# --- mlir-aie install dir (needed for aiecc.py) --- +set(MLIR_AIE_INSTALL_DIR "$ENV{MLIR_AIE_INSTALL_DIR}" CACHE PATH "mlir-aie install dir") +if(NOT MLIR_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.root_path());" + OUTPUT_VARIABLE MLIR_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT MLIR_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find mlir-aie install dir. 
" + "Set MLIR_AIE_INSTALL_DIR or install the mlir-aie wheel.") + endif() +endif() + +# --- XRT install dir --- +if(NOT XRT_INSTALL_DIR) + if(DEFINED ENV{XILINX_XRT}) + set(XRT_INSTALL_DIR $ENV{XILINX_XRT}) + else() + set(XRT_INSTALL_DIR "/opt/xilinx/xrt") + endif() +endif() + +set(AIECC_PY "${MLIR_AIE_INSTALL_DIR}/bin/aiecc.py") + +# Deeploy-generated sources +set(NETWORK_MLIR "${GENERATED_SOURCE}/network.mlir") + +message(STATUS "[XDNA2] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] MLIR_AIE_INSTALL_DIR = ${MLIR_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2] XRT_INSTALL_DIR = ${XRT_INSTALL_DIR}") +message(STATUS "[XDNA2] GENERATED_SOURCE = ${GENERATED_SOURCE}") +message(STATUS "[XDNA2] TESTNAME = ${TESTNAME}") + +# --------------------------------------------------------------------------- +# Step 1: Compile MLIR -> xclbin + npu_insts.bin +# --------------------------------------------------------------------------- +set(XCLBIN "${CMAKE_CURRENT_BINARY_DIR}/network.xclbin") +set(NPU_INSTS "${CMAKE_CURRENT_BINARY_DIR}/npu_insts.bin") + +add_custom_command( + OUTPUT "${XCLBIN}" "${NPU_INSTS}" + # Copy kernel objects into aiecc.py working dir so the linker scripts + # generated by aiecc.py can find them via INPUT(kernel.o). 
+ COMMAND ${CMAKE_COMMAND} -E copy ${XDNA2_KERNEL_OBJECTS} "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${CMAKE_COMMAND} -E env + "PATH=${MLIR_AIE_INSTALL_DIR}/bin:$ENV{PATH}" + "python" "${AIECC_PY}" + --no-aiesim + --no-xchesscc + --no-xbridge + --peano "${LLVM_AIE_INSTALL_DIR}" + --aie-generate-cdo + --aie-generate-npu-insts + --npu-insts-name npu_insts.bin + --aie-generate-xclbin + --xclbin-kernel-name=MLIR_AIE + --xclbin-name network.xclbin + "${NETWORK_MLIR}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS "${NETWORK_MLIR}" ${XDNA2_KERNEL_OBJECTS} xdna2_kernels + COMMENT "[XDNA2] Compiling MLIR -> network.xclbin + npu_insts.bin" + VERBATIM +) +add_custom_target(xdna2_xclbin DEPENDS "${XCLBIN}" "${NPU_INSTS}") + +# --------------------------------------------------------------------------- +# Step 2: Compile XRT host binary +# --------------------------------------------------------------------------- +add_executable("${TESTNAME}" + "${CMAKE_CURRENT_LIST_DIR}/main.cpp" +) + +target_include_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/include" + "${GENERATED_SOURCE}" +) + +target_link_directories("${TESTNAME}" PRIVATE + "${XRT_INSTALL_DIR}/lib" +) + +target_link_libraries("${TESTNAME}" PRIVATE + xrt_coreutil + uuid + dl + pthread +) + +target_compile_features("${TESTNAME}" PRIVATE cxx_std_17) + +# The xclbin and npu_insts must be available at runtime in the same directory +# as the binary. Add a post-build step to copy them. 
+add_custom_command(TARGET "${TESTNAME}" POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${XCLBIN}" "$/network.xclbin" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${NPU_INSTS}" "$/npu_insts.bin" + COMMENT "[XDNA2] Copying xclbin and npu_insts to binary directory" +) + +add_dependencies("${TESTNAME}" xdna2_xclbin) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp new file mode 100644 index 0000000000..07ffb7a0ca --- /dev/null +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +// XRT C++ testbench for the XDNA2 (AIE2p) platform. +// Loads network.xclbin produced by aiecc.py, runs the MLIR_AIE kernel, +// reads back outputs and compares against golden reference values. +// Output format: "Errors: X out of Y" (required by output_parser.py). + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_hw_context.h" +#include "xrt/xrt_kernel.h" + +// Generated by Deeploy's generateNetwork_xdna2.py: +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} defines +// testoutputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_OUTPUT{i} defines +#include "testinputs.h" +#include "testoutputs.h" + +// --------------------------------------------------------------------------- +// BF16 helpers +// --------------------------------------------------------------------------- +static float bf16_to_float(uint16_t bf16) +{ + uint32_t f32_bits = static_cast(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; +} + +static bool bf16_nearly_equal(uint16_t a, uint16_t b, + float rtol = 0.0f, float atol = 0.0f) +{ + // Default: allow 1 BF16 ULP difference to account for hardware rounding. 
+ // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, static_cast(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; +} + +// --------------------------------------------------------------------------- +// Read the NPU instruction binary produced by aiecc.py +// --------------------------------------------------------------------------- +static std::vector read_instr_binary(const std::string &path) +{ + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast(instr.data()), byte_size); + return instr; +} + +int main(int argc, char **argv) +{ + // Paths to the compiled artefacts (relative to the binary's working dir) + std::string xclbin_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/network.xclbin"; + std::string instr_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; + } + } + if (argc >= 2 && argv[1][0] != '-') xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. 
Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. + // ----------------------------------------------------------------------- + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. 
Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast(n_instr), + bo_in0, bo_in1, bo_out); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) + << " (0x" << std::hex << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) + << " (0x" << std::hex << golden_out[i] << std::dec << ")" + << " diff=" << std::fabs(bf16_to_float(hw_out[i]) - bf16_to_float(golden_out[i])) + << "\n"; + } + } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f + << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") + << "\n"; + } + } + + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + + return (errors == 0) ? 
0 : 1; +} diff --git a/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz b/DeeployTest/Tests/Kernels/BF16/Add/Regular/inputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..3cfdd76a1f7a397616b6f353c278b0bb20ce5efb GIT binary patch literal 8706 zcmd6NX;{wR*KdO+LxV=Cq-Y{Ti0c0Cb&DcN$xj(eD2dV_M5!d2=ZT_HiDpF_?(beV zqDUGH4MG$lrGzp+e$W5SdCs}coAc_deeJ!kwbov*_O;iw_UE%~EQKe?3H(Q93WPqt zF&rcI>nvPguz0TST=f9; z|5us#QaFPwQVYcdRW5WkctU9HH*77wh-q(yNc8(`ntUOgr{J^&jn$pNQFRq4o1A96 zEaG72_Kzg>`&R6~&m$ZCd{OPy4p0qT%f_x{sLT3!$nnpB?;JIH*Xk+lTyvGWnTdn{ z^jcE3{vk<7>Lxd&CAiu0E!18^824_l!AE7Ppm!n;EVjR61Lf>-k<13dMVT!KD*4+{@#SGy<=T!@4i!@Nk`^hPssNv+8o~o5OHhg`r#q@YQ0v^Su=@21 zT4s@hF;ekp?6eQNB7c)99WvzNtp(7{@1iD~G{_Y#btv3$qQR?bs^xQw zn#-I)-+#W5;Elsnd~om)S>3?DXJf`CT@qrh{6o$YB$|x8o1vYBXAR8RIPj zaY^ZYaOsteD>#Z`O^`dymyQXBpm@?!(qisJy1x}e*WHcq;CdxF zVO0yebkp$W9Zi&9?2g*W^_07`440qLfXy+xsM87_2L2Al-jiA6P^=xiO!@=W{>EHC z{YS*nHUPbS#Q8^M!pU0|8@zV;5#3R)g8p-)=|t}^uy0i-bGNNeU7CR?P8xwc+k+jjb^kPp2`X=UJZ_NTyFykJyKhh-mj_%kG9(50}!J3+rgQbS)HBMB}5$&M0$b z7L-cdqZ=RR*CuY)rU_4qh=a~mNIF@~%M7`1pjrs7B%>F z8aIWD;i%sZteJd+CBJ%5v!NDK%*44X?&acQuI20zP|iB&Db@JSbl0jXz?d==rA@45>m(YHyJ{R#})6@!k4(?I6*DYEtP z7m~K_1!nAbhla~9*apz*NTqWv!Wf4L19aRN0QqexsJePJh#RkWOWGdxmL zy$QDKPQ_MR8z{aN28o9~$URX{;^7uS3?^*Hn=*>@`8*Z6Xl)%i_Bjw#j-N&cApvTa z#lroq1iv4*#jU#5BwKeAY*}o=ahhHLU3Oa`G}s=Jx~72HxE%Mbye+LX65ub{)C5WP z@9F;GZf0v{KC~+|;rAt3Sp4-qo?L5*Q=R8w9$!UXhb;eDwgZzLHA2rV z9w3~GMW7oyg~+Fw!mdg;Vp}c-N1PX<0TF}0W*2I6xP@ZI%h z6AP{ucfx_)B+=(O40+Tt&BbyMnC!}K2%JZCqgLS|n{HzNdowof(u30X5g=GlOAKNY zsJ_Qr;ym#@oe*>YPVRe;wT}wvVB2rhc0in%SOOh-&hT{pGL*38V|xw5Kcz7V<=!fD zwJ$#>Gr3auTWyH=H`U{9<22e))5t_@EMv{Q*MPlm8F5pb1A0aa(4^fNgo0b3YU>dc zAc3a_5+KfG8h^(&3taHL9q)u}z?m5VWd7-;^lD8o-6p6>h0JV7rD8Bn`@9&A z=IX)~&D-Sr5lK+4OC`BPi0k1{&I%i!qx1W@IDh6z>{^qFMq#_Khabn*>J$>YqwAR& z-&bIuw+m>kVW9JEVIl1Vt0y)p#1{6Aomo`M3q?a*nO7m+l20Jjch;0ED9 zVypfRI`%(A+3z_x_Ns%fHWR@dQ5(qfJxsnV?8V^`VMy(iMTMC;kaSrcmwnT~v!^G5 zmh}o)?f#nN-)P4yo&+q}eG-j6c#ep`(5-4)6@&ve#jftgnhD{Aw^b 
z_mbL$2s5Ik;l%xd8rClxAsgew(e!2;mEBhY9~OqiKj?3JNMf>q(-sb?r* zr$hSQ*VC}j4`8>&fZ7KM_Yh_czNaI{I%(~L zEYg4J7`<2R18S<${E-VA&~xo-ydEUS7rJr?i&nXz(~uP{^o@aeBS-1gkr33g*vr~l zo&wJfKlGfw7iE4)!uGD6H1@`9XiRbBnPe>juYi+S@!O22j`q~%LOM=XZ}?l6&Ij?b z3M$`q0uOzwV?XSSgN!j5{?m9w3)B7hPOKJ4X&mYISP6~8@%VXdEVi$DO=PZkFj_|^ z!^62;`qVXsjK7;nZcSeWN@?q0%)S}np%j0iQ4s79OM^Q(Cs09Z2yeIDW85q~AW88J z$jzMq`RgfNlDiuhoj2ha8J(dx7>q4y!d#c2KA7Yd4_&-{81-rk#86GJPku$jzhpvb zz6KSY{@H|il@TcESw?;@ zxJg#l=Yq!lC>V7Sqnf6fIO3E;pL#ivcSGyw^I=(3I~q)O2n*tHgF51&O!BR@i3WZ3 zBQ7r^NpnX$GR|MAtV9)vU=pr8EXE(P8N#23hd3tYzc^o?4YP@E=};v#hu?Ai9$E6N z9%gKfLxDp#=_!_r&qjA+%cOkzq5AX*CV!f2Y&Vy29f8-<;s=7-FvB%*pzvh|EmxBw<5icp_I1 zvYd~h>A5P*<{n^8eit&C;R`XC{sYeU3vp;m3$>|{N6R+CTGhvp9HC%Lbu)rpV!@2% zoKL7;@g2LLSny2EcGAJ7hwNdK*F4>CE=2E)HZ=ax)3)zMA6 zrmOQkRl4x?@{MR)FNrovt&|Z{W0s!Fq_=EN!4WBae8Jy_`t!e#f!I1M3aux5#M&6W z9}4us+D7=bdIk3&FrJKSl!R63KW zecPlq;x)OilMD}@wZW-V=Sk!T1vqV;h}Gj2Y;&b2%qZIqMrE7G_4hmBj@b{AlXC`M zX(bUwwMx2cw2*YinDEa?)R8lRd-*sG?-x#P^_ei7pT(iXP0 zd9zXF#@Mpt5glEn0ZMaLng}i^AvO_nVA%lDV`rAbrLh}itK@m~K34_J3j|QF>mmNW z6^a>GPT-Y_jf{rj-#@T#5xtkI2D2xW5V6*5BJsGD2-K>A&Hn%|YNE}Q|4;B@;s3AT z#kIGh{QT4!8W%Q+pI9NwENDrs6?ha1dnTm8id<__peTS7_l%Q_*Ic-GjJxIexGD)$y~z4R3D{q^g> zJ?lpbdX%Y1z%w!{BY@FXe8o(XX{DxL$}uGQHcFXj!-c|Vy84U~&fhwTQxaqe8?G!P zOGmYF&R7QUHSR%};V&X=>IYHd=Rqky4$_yXg1P=QI4UjzalTQQwsR5KL>A)0&RKGZ-u=?TJgLiOyj0@BbG(W^W3G}yg#i2+`-ZeuS(?<1 z0xl2mqVZ`QdS`tEwXS^yQs?io{pd$?IHLSJ?{{PDl~Z(L(tK=uS_VF^btoOk1p(^M zEcI7n-gRAL!|xq2IV#uR{H$&^XBMbv{+TCf28mwh5HYwM_}=Lzt776O<2 zIHbn&5a^~wk}a$hs^=PjM))h{{<%_P8qNd7rz-s7L)Y-bNGe+Ws-}Y5PSDB2d|bcC zA7|{?hMwlC++&$v!tDk5IotLrlG<%g@+ZhFfm-qwh~C{wcvTsFijZDufJK zgN1ouy}ymz*|-|!XS?9VnW8Wu+8A^)WpN^rf|3zwI956tdn#k`WPB>N>Rw@MR$*F%YQw@Z2?|{m|99XXxgf{})L2gJ4chgWv;6yS+Fg_$|5?NVj$7Ci_3{;%}Jqd+JPs%KQHETMCx zk1qMWhAocxz_yFoq5g4Qs5+zti$)b;O>G^_JXsC=airG;kXYu6D@GcSA^9(HJk zJi{6yw!)DfZ%Bjf0~7gD{Q`V>o+aEWKFkCf2a(Nxftaj+C@;uJfQ!FvA-Z0l6gT$J zgCh56{Yr7Z_q!01?6nDA7vIJ6wz}A)orI<{4`Z~{R;I&6g#U=E3VA=d5E>Umhi<9y 
zE#(8Tw)q@fzPFH{I&_7uoS2HX<5pnyXf8@Wd_X4Llm;J(0T6DnroCZ&vQ~kjpJN(a zD>KEEXekt0Udc%0R-Eh5P9 zU>g|}9s;d5AIau@I`GtApPT$~3!P-T7H5h-B?tZj8f{NRF=h5iJpM?QE4C@0NFFx9 zy+_^P_~{pHv0pG%9gzZ;ZFv|Id6is!`Wf0whRBQae45`Xi1JEp)LHBV?LKgW?n-on z`<;Ia{i1DP?sAh_{o#{Y%i}Ovx&X#JGodEm7yNr~;Hu^Sz#NNaa@wy6ttZvItx=@Peir+4@CClN8G<-2s~CP1M||DPOz7Sv4dA}&h}K;_%e^4`I~z!YJSj! zJURUMcp(^xhLEuiEjX(v&69Q#hD|&Lwi}_PFKzPeOs7uv`37rI#u18Q#n>!9J zcffADR=jkZP=gm9z$Xr=2(_YQkg!Vzgr%IA? z@(`#58R1y$7EE#&AP;2%K%j3H6MiWdKdF?UYf&Y7Eq{gGMm>zDU>x){ZRO<@Nb@W0 zR?vp4TC`)9C^s;q249OdP^ZmZ5Vz|Mu~;GvQ23tNHgt*XlUW6U#@6WFumj>~DR$1d zL|%w3B|C4c;*lrSyheu^ctSrMe^0(fgRG;m(kzb%7$&0nN<+A2oQhY+rBUe340Jtz z8S18ACfKrx>TLQ$m~K&iXVC&Svg8kRgo+U-JD#<(Ct5;F z(R(-_A{?^m)$?Pxdff%Q@_jx(ogSdK`?*wqLnpna6amxro`H2o=h3$F;(R?(HBg)@ z2??&}iK*!^P?(mCKCuE^sdK(4+XNu=qa7d3_CN=vn{YWPl+F>chXc9Z#Om`C_F_T_ zT(R?}#vglecD5DvD^GzrLHEe^r3oYS(12`?&~L9{S8G9c{*H2?hSN z&0dhLOrRZa} zkvRN3Va#27uouVs@K1P-Ubq={xku|E#Ji!1O)5*uN*IG^19 z8Vt2xzP9ug?reYypFB8OvgBRfD8VRuUx=n>=cvI6e4XF6$4#{2q z5_!H{aJnW;T5l9r;( zHcQ~x>j3iet~AXFas=mh;+VerE?(;`Lv>woGa{Q7p(wYkmb?t2j8@SI?y0@xDe!3pZC-pHy-|x_W^X`FC zL74L{AJ06V11Z+#C{Zbky+v7M{hc3lQNnJpG&jf9K5FcOVtg=BkpJ>rJnn-_7PP4A~a z0}>gwDG!%Ab-{hhSlB9(M#Od(p;Wa!nGzX`zVrSPQrgxK>3fSw8l2A8x^WCuWfs64 zNQYJ5qCvjC5v85al083z!G7f<90?ZWO;n9_+i}0jKYe5$niMy0cG|U+`fiq&sCmgWzm{1x|P`U@jC}Zi6+a z>w#Up11o#?P$MgVhg$oX)CsYevicF3qbUb{v#&BQx^@s{-;)%Uv2ZKV1@pHkVDisJ z91US77`%-5;CepJb`#=z42Up!vtsc^ZylAl(S_NZS!nQhKWQ~RLrbOR!p{|>Bwk}7 zPOg|scb*A_E3S7*esejdJkf^GM-gb;bQOx^L*V(oByv%3CW)SGN3lX5a-Oi%*vN}0 zE_H;MHYseTsyN=0hI!mD>=u8IQcsK_B%&YYc~wHVW(&&389}mGCH#5yjuzc&W^986 z_<8YRq(&nQXSlB=fsgC)=J-S~(F}p3$5&&?sd!xdQ-Z&BPZ+iDmBR<8@1X3beiFL# z4ZVIP9DT>9;0g6`klmsT(X&NpVSpOkN=w3!lnG41Qw6d&W|%sEeog}Jt-_2DAvkY7 z1+d(hKfE;!_D){G-+fIL(@6-p`dpwt3jY#heXL-AGe z=(_j>e&lb5-9aYcH>Df1avo6Lw{6Ia&x6S=Md)PT1Ts6s*_Aa(*zTl{eTG(8u9br+ zi*GV^bI;+7$VvRqyKmCpx8C5nXfBS|xZ(tHM{JN1;I>NbVIoYA5fP_eoav^GmPfXc zTbd#G;E*eP9XW%^=M(5LF$FAMp+j`WG%=w(f-2niqE7@kf63f&BCm3g7$3XFN;r4_ 
z1!pZG?6?&3=ypP5Tp`-ao56t%pYW-_1F^}t3%3lu6Z@)p~;?CxP=acj6|07)D1cm`m3hNK#z- zU$VZI9qtvw;;U1*YA*fs>mfZ1H+)W{M}vrAO9g(Yhy{greD;??49e-Hf%#l@27ou~Ows18iE4x|YMSJzUuc>Oa%0gZc0*|0n(OPzTm5Yov{z-!it@98iArff}g=!q1(@ zvFA|=Q8U;N%f@v;X5AOK>A3>Go-RX-I*xTyb>YUx-^9jJP)JVr|JHl)p9YJ6*C4@v v*Z-vx<9`qQpSZ`rhyNo%g6!WujsFDxHwt27DI)r>OX%tE|T`A)u*@1ARBuDNH<=9)RU?QF)0De(RW>bwU`p)-Qd z@IrWdG~GP{w*_b}Rns){ zT%@V1rs?Sy5ES6*>*5#S?(vJacHJK6@skhqcJ=r8S!*rSH&~>rt*f?A?f(nufQBxd zdAkIE6TOa;#lpa0{U};`9i(Bd`B3EAEkt)Q)Nrb#W6Gi}b*b<7bXIEyl7{Ri-b~@uj0cyRS&VuY+@cI65T)(7*dMGBM z*N_|*mLCL8{TlGoXThEF6pVJ0XJvA|m^W)0CTXh>-Ot5Tb~Ki>Iab34TPZwZUy4&> zV=5;kNnk_WR|u1pXJ$q9^!)l2*fS>*uf%VGPdgSuZE`3K+V_F`(JOGh^$F<5cu}XJ zo3wm4Mdgw#QY3p8`5!f4P$>s$$HG`j#dPL*#s#z02TA+)SKv9m6q064MtISS1F3nW zZ>JcSUzLe@-pkQ8pdF*t$FU_N0q~CehNkwRwD*b@Gx#)}MZ7Dd{$H|5PTK_T)5bJp z`X9i2LmoyrOvLH;L$N7Bg2jyd1Rr zd>Y5w-i*1G<5~Bo^`x%S4dXT)Bw}Lc>HXi+Nq?g$slTTQp|88C+TD1(y~LY{@0!o; z&+mb_p%@5vXhEyK8Waxu!J?!cICb+~T{rYh(ggPSb#mQJ(}& zdUDY1+W@vdG+5TI@k~jefLlk5s9NfQ$_%+UOq`*B`d%uWjM@h<-knEe>aGf{9~M!M zWl}6Ns*hf}nFO`!O?YP6bx8WVk^Heq0Wr@A)6!P4I;ForN$mg>#`=&fqiD9<>X>l; zXfq78>}GQ=t;K{BK3M1J1iYMGnAdcQyo|^prxSj_3zxGXVJO4Ay;{+CxCq`g_EEQL zU$nmV7Mu8|ajx4aSpCDt8h#SY@EyX!lLBh1lTAy~CNl5Ko<#C)1^8?7xDWpK;mdS+ z&i&LxF5NQ@vclZBk@KS1wZs%NWRB3IEoHzbFHuDzg1*X;kAr zq*CAGk@W9mOYuzhJV%>8nd8L$VSNm&PRHW9f-1CJbBii1t;BcBqA=Se3-}-9vB$g; ztUv0oGd_BNQqedd{Tf2F7IO}k$Iz`tf(2J!1NZC{l-eOiW3-A%zjy{jqz6_$F4+et zT?EAUB}D&S0a5yMS?{b=6l=<+Irj$f%c0NYcDEeowsJgE*|m{aKH$TQ1=leAx+a7e z711gW8ijvAp{)T#=)YbX^w}8yI`#zY#3Ug+G6h6ahDdT}4dq?GFRWEr%%0_%vJcy` zap9F|?5##TY8XspdaAeaU2Q(_-CvM>%FiLZr2@N34Y;`}_QG6M8EndG112Xg==qR_ z7KZ{c|JG+>SuM-0*+!A)s&aFY|4VZBL^;7)Y3`!%6~4JRpVQ6RN9!-OkkU(?ctr9R z9x~Ym2eMbe+m1@2U9cV1^fiP}<2o?@&oylG=@i&K*9M#h#;bIonr{JAZ8PQU-kF$X ztsI6Lo(K~;Hy&R9J_5a+(}iBz#^@3j1}tKcVCtR%OlqEr=L_WF&Fl#LKA{74m#U(> za6fiATeB6xR&1+>EK}RljNyCzLA_`UiY$l7`k%MisC$m7%}f}WmYl%~c^0ukMNqT+HHp2Rif3>B{Bc$HF)_n}zQ3G{4b_ub_Z}m{zj8^q$?*hP 
z)pZBgNrt$eh?=vux6!~`e>0*503mu!6hEi;QZDT9TO(P#t6+8KVd~YR2%EEKWAU2}L|TQ<8}cp2d%x)AzfdNSy>*yxYV`S=;p^|*%d-#h9YRXJwp=} zJCLeQ0N&rqH{WV0G1qiHceO_wxK(qx@{8|;g0+Fr{;dm7;0sWKdD!T5o8~wi$5H7XnE70e`)61W602H~_xdq?X`T*8BZ&pU$qb4AQuwh!lI93VZRCt=GiEjDv) zC;Fx+V?lfZri@?1KHipN&#LudaP18At?w34Lm#N0z+*k%x4?_kEfDeXPrN;>4NjBG z>HXZv%=t_{u{?H}IR99~naBR?zRdi@Tpe>?@a&|oV4N*QExBDplgZPC!Sy%%x~mDxz9B~)wz_F{JmVrD$5 z7@r`TtNH9~%Vo&VJxfO%(y(b;E1npZVcZWJGuxp1DE{Yq{Gu!dU)raz?FJUCL3I+A zUUr0w>>n~#uTFy+KL>3N77f^`{pG3@}Z0n z>d9)<4&pPt7_gI4$2Z%b(KXQ>SQ@KECgqo7+@c0*zvdzt{Q2jeh?`=N{$!|q9ZDoB z4Y=RCM^I0t8TZQDV1C#Za_^7?yR5+Da!SP^c=J2lTX&i$EG{E<33G_?Lsb~5mL~k6 zNcJGf07YNxFi9y5tUMsZi?xSv!{OcBT-k16Kh=V9$9Zgllmf<_aU)s>dSS zw}ii+n(_D^rn8RkMF?4nQJd$%G=VzQHi~0U+-2y#uoV8bIS#ga`82w$o$PI>7u;L0 zhQ*xwN_IZ`irG^vnaP$+V20Q5`p{0a?RAEv(LTrt$)QG0%@CH^Zx(i4mE5>y4+}16 z&;t#t(Qp4&w2Jx-wLJCMR&{AImb#L9kDLVI$NgZIv7G8{Oe76m60l{r1RUwE#!ST& zFjUrr0nsDamA(gwk^zW`?8GHK4YVvqnq^t8mLJpRm$# None: "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") + config.addinivalue_line("markers", "xdna2: mark test as an XDNA2 (AIE2p) platform test") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") config.addinivalue_line("markers", "models: mark test as a model test (full networks)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py new file mode 100644 index 0000000000..9d4f27a477 --- /dev/null +++ b/DeeployTest/deeployRunner_xdna2.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. 
+ +Usage (from DeeployTest/): + python deeployRunner_xdna2.py -t Tests/Kernels/BF16/Add/Regular [--skipsim] [-v] +""" + +import sys + +from testUtils.deeployRunner import main + +if __name__ == '__main__': + sys.exit(main(default_platform="XDNA2", default_simulator="host")) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py new file mode 100644 index 0000000000..69af71f429 --- /dev/null +++ b/DeeployTest/generateNetwork_xdna2.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""XDNA2 network generation script. + +JUNGVI: TODO: Move this script to ONNX4Deeploy + +Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. +Instead of emitting C code it: + +1. Loads the ONNX model and npz test-data. +2. Prepares the XDNA2Deployer (type checking + graph binding). +3. Emits ``testinputs.h`` and ``testoutputs.h`` with raw BF16 uint16_t arrays. +4. Calls ``deployer.generateMLIR()`` and writes ``network.mlir``. +""" + +import os +import struct + +import numpy as np +import onnx +import onnx_graphsurgeon as gs + +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import bfloat16_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log + + +def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: + """Convert a float32 numpy array to an array of BF16 bit patterns (uint16_t). + + Uses round-to-nearest-even (the standard IEEE 754 rounding mode). + """ + f32 = arr.astype(np.float32) + raw = f32.view(np.uint32) + # Standard round-to-nearest-even: add 0x7FFF + BF16_LSB to the full word, + # then truncate. 
The 0x7FFF biases values just below the midpoint to + # round down, while adding the BF16 LSB provides tie-breaking to even. + bf16_lsb = (raw >> 16) & 1 + raw = raw + np.uint32(0x7FFF) + bf16_lsb + bf16 = (raw >> 16).astype(np.uint16) + return bf16 + + +def _bf16_to_float32(bf16: np.ndarray) -> np.ndarray: + """Convert an array of BF16 uint16 bit patterns back to float32.""" + f32_bits = bf16.astype(np.uint32) << 16 + return f32_bits.view(np.float32) + + +def _generate_xdna2_inputs_header(input_arrays: list) -> str: + """Generate testinputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include ") + lines.append("") + + vec_names = [] + for idx, arr in enumerate(input_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testInputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_INPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testInputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def _generate_xdna2_outputs_header(output_arrays: list) -> str: + """Generate testoutputs.h with raw uint16_t BF16 bit-pattern arrays.""" + lines = [] + lines.append("// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna") + lines.append("// SPDX-License-Identifier: Apache-2.0") + lines.append("// Auto-generated by generateNetwork_xdna2.py — do not edit.") + lines.append("#pragma once") + lines.append("#include ") + lines.append("") + + vec_names = [] + for idx, arr 
in enumerate(output_arrays): + bf16 = _float32_to_bf16_uint16(arr.flatten()) + n = len(bf16) + name = f"testOutputVector{idx}" + vec_names.append(name) + hex_vals = ", ".join(f"0x{v:04x}u" for v in bf16) + lines.append(f"static const uint16_t {name}[{n}] = {{{hex_vals}}};") + lines.append(f"#define N_ELEMENTS_OUTPUT{idx} {n}u") + lines.append("") + + lines.append(f"static const void *testOutputVector[{len(vec_names)}] = {{") + lines.append(" " + ", ".join(f"(const void *){n}" for n in vec_names)) + lines.append("};") + lines.append("") + return "\n".join(lines) + + +def generateNetworkXDNA2(args): + log.debug("Arguments: %s", args) + + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + inputs_npz = np.load(f'{args.dir}/inputs.npz') + outputs_npz = np.load(f'{args.dir}/outputs.npz') + + test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] + test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] + + # XDNA2 is a non-signprop platform: signProp = False + platform, signProp = mapPlatform(args.platform) + + inputTypes = {} + inputOffsets = {} + + for index, (name, values) in enumerate(zip(inputs_npz.files, test_inputs_f32)): + if np.prod(values.shape) == 0: + continue + # Force bfloat16_t — BF16 test data stored as float32 in npz would be + # inferred as float32_t by minimalFloatType, but the XDNA2 kernel + # requires bfloat16_t inputs. 
+ inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) + inputOffsets[f"input_{index}"] = 0 + + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + deeployStateDir=_DEEPLOYSTATEDIR, + inputOffsets=inputOffsets) + + # Prepare the deployer (type checking + binding) + deployer.prepare(_NoVerbosity) + + # Create output directory + os.makedirs(args.dumpdir, exist_ok=True) + + # Write testinputs.h (raw BF16 bit patterns as uint16_t) + testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) + with open(f'{args.dumpdir}/testinputs.h', 'w') as f: + f.write(testInputStr) + + # Recompute golden outputs from the actual BF16 inputs the hardware will + # see. The original outputs.npz may have been computed in float32 + # precision, which can differ by several BF16 ULPs. + bf16_inputs = [_float32_to_bf16_uint16(a.flatten()) for a in test_inputs_f32] + bf16_input_f32 = [_bf16_to_float32(b) for b in bf16_inputs] + golden_f32 = bf16_input_f32[0] + for inp in bf16_input_f32[1:]: + golden_f32 = golden_f32 + inp + test_outputs_bf16 = [golden_f32.reshape(arr.shape) for arr in test_outputs_f32] + + # Write testoutputs.h (raw BF16 bit patterns as uint16_t) + testOutputStr = _generate_xdna2_outputs_header(test_outputs_bf16) + with open(f'{args.dumpdir}/testoutputs.h', 'w') as f: + f.write(testOutputStr) + + # Write network.mlir + mlir_str = deployer.generateMLIR() + with open(f'{args.dumpdir}/network.mlir', 'w') as f: + f.write(mlir_str) + + log.info(f"[XDNA2] Generated: testinputs.h, testoutputs.h, network.mlir -> {args.dumpdir}") + + +if __name__ == '__main__': + parser = TestGeneratorArgumentParser(description="Deeploy XDNA2 Code Generation Utility.") + args = parser.parse_args() + + if args.platform != 'XDNA2': + parser.error(f"This script is for the XDNA2 platform. 
Got: {args.platform}") + + generateNetworkXDNA2(args) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..572df44be1 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -29,6 +29,8 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: if config.tiling: generation_script = script_dir / "testMVP.py" + elif config.platform == "XDNA2": + generation_script = script_dir / "generateNetwork_xdna2.py" else: generation_script = script_dir / "generateNetwork.py" @@ -166,6 +168,9 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: # Run binary directly binary_path = Path(config.build_dir) / "bin" / config.test_name cmd = [str(binary_path)] + # Propagate verbosity to the host binary (e.g. XDNA2 main.cpp uses -v) + if config.verbose >= 1: + cmd.append("-v") else: # Run via CMake target cmake_cmd = os.environ.get("CMAKE", "cmake") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..78d5ff9cd6 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "xdna2": "XDNA2", } if args.platform: diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906f9..28425393cb 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -29,9 +29,11 @@ from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform +from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer +from Deeploy.Targets.XDNA2.Platform import XDNA2Optimizer, XDNA2Platform 
_SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "XDNA2": + Platform = XDNA2Platform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -273,6 +278,22 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, XDNA2Platform): + if loweringOptimizer is None: + loweringOptimizer = XDNA2Optimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = XDNA2Deployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..dca5c7b7cc 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -42,6 +42,7 @@ from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS from test_softhier_config import KERNEL_TESTS as SOFTHIER_KERNEL_TESTS from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS +from test_xdna2_config import KERNEL_TESTS as XDNA2_KERNEL_TESTS from testUtils.pytestRunner import create_test_config, run_and_assert_test @@ -117,6 +118,11 @@ def param_id(param): "model_tests": GAP9_MODEL_TESTS, "default_num_cores": GAP9_DEFAULT_NUM_CORES, }, + "xdna2": { + "platform": 
"XDNA2", + "simulator": "host", + "kernel_tests": XDNA2_KERNEL_TESTS, + }, } ### Markers summary ### @@ -987,3 +993,21 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.xdna2 +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", XDNA2_KERNEL_TESTS, ids = XDNA2_KERNEL_TESTS) +def test_xdna2_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["xdna2"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_xdna2_config.py b/DeeployTest/test_xdna2_config.py new file mode 100644 index 0000000000..7988aa09b1 --- /dev/null +++ b/DeeployTest/test_xdna2_config.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# Test list for the XDNA2 platform. +# Each entry is a relative path under DeeployTest/Tests/. + +KERNEL_TESTS = [ + "Kernels/BF16/Add/Regular", +] diff --git a/TargetLibraries/XDNA2/CMakeLists.txt b/TargetLibraries/XDNA2/CMakeLists.txt new file mode 100644 index 0000000000..c2e1ffdecd --- /dev/null +++ b/TargetLibraries/XDNA2/CMakeLists.txt @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# --------------------------------------------------------------------------- +# XDNA2 (AIE2p) kernel library +# +# Compiles AIE C++ kernels using the llvm-aie (Peano) cross-compiler. 
+# Exports a CMake target `xdna2_kernels` that other targets can depend on, +# and sets XDNA2_KERNEL_OBJECTS in the parent scope. +# --------------------------------------------------------------------------- + +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +# --- Resolve llvm-aie (Peano) install dir --- +set(LLVM_AIE_INSTALL_DIR "$ENV{LLVM_AIE_INSTALL_DIR}" CACHE PATH "llvm-aie (Peano) install dir") +if(NOT LLVM_AIE_INSTALL_DIR) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import aie.utils.config; print(aie.utils.config.peano_install_dir());" + OUTPUT_VARIABLE LLVM_AIE_INSTALL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) +endif() +if(NOT LLVM_AIE_INSTALL_DIR) + message(FATAL_ERROR "[XDNA2] Could not find llvm-aie install dir. " + "Please set the environment variable LLVM_AIE_INSTALL_DIR or install the llvm-aie wheel.") +endif() + +# --- Resolve mlir-aie include dir (aie_api headers) --- +if(NOT MLIR_AIE_INCLUDE_DIR) + if(DEFINED ENV{MLIR_AIE_INCLUDE_DIR}) + set(MLIR_AIE_INCLUDE_DIR $ENV{MLIR_AIE_INCLUDE_DIR}) + else() + execute_process( + COMMAND ${Python3_EXECUTABLE} + -c "import aie.utils.config; print(aie.utils.config.cxx_header_path());" + OUTPUT_VARIABLE MLIR_AIE_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _aie_cfg_result + ) + if(NOT _aie_cfg_result EQUAL 0 OR NOT MLIR_AIE_INCLUDE_DIR) + message(FATAL_ERROR "[XDNA2] Could not query aie.utils.config.cxx_header_path(). 
" + "Please set the environment variable MLIR_AIE_INCLUDE_DIR or install the mlir-aie wheel.") + endif() + endif() +endif() + +set(LLVM_AIE_CLANG "${LLVM_AIE_INSTALL_DIR}/bin/clang++") + +message(STATUS "[XDNA2 Kernels] LLVM_AIE_INSTALL_DIR = ${LLVM_AIE_INSTALL_DIR}") +message(STATUS "[XDNA2 Kernels] MLIR_AIE_INCLUDE_DIR = ${MLIR_AIE_INCLUDE_DIR}") + +# --------------------------------------------------------------------------- +# Compile AIE kernels +# --------------------------------------------------------------------------- +file(GLOB XDNA2_KERNEL_SOURCES "${CMAKE_CURRENT_LIST_DIR}/kernels/*.cc") + +set(XDNA2_KERNEL_OBJECTS "") + +foreach(KERNEL_SRC ${XDNA2_KERNEL_SOURCES}) + get_filename_component(KERNEL_NAME ${KERNEL_SRC} NAME_WE) + set(KERNEL_OBJ "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_NAME}.o") + + add_custom_command( + OUTPUT "${KERNEL_OBJ}" + COMMAND "${LLVM_AIE_CLANG}" + --target=aie2p-none-unknown-elf + "-I${MLIR_AIE_INCLUDE_DIR}" + -std=c++20 + -Wno-parentheses + -Wno-attributes + -Wno-macro-redefined + -Wno-empty-body + -O2 + -DNDEBUG + -c "${KERNEL_SRC}" + -o "${KERNEL_OBJ}" + DEPENDS "${KERNEL_SRC}" + COMMENT "[XDNA2] Compiling AIE kernel: ${KERNEL_NAME}.cc -> ${KERNEL_NAME}.o" + VERBATIM + ) + + list(APPEND XDNA2_KERNEL_OBJECTS "${KERNEL_OBJ}") +endforeach() + +add_custom_target(xdna2_kernels DEPENDS ${XDNA2_KERNEL_OBJECTS}) + +# Export kernel objects to parent scope so the testbench CMake can use them +set(XDNA2_KERNEL_OBJECTS "${XDNA2_KERNEL_OBJECTS}" PARENT_SCOPE) diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc new file mode 100644 index 0000000000..1a53e47398 --- /dev/null +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#define NOCPP + +#include +#include +#include +#include +#include +#include + +template void eltwise_add(T_in *a, T_in *b, T_out *c, int size) +{ + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } +} + +template void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) +{ + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); +} + +extern "C" { + +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) +{ + eltwise_add(a_in, b_in, c_out, size); +} + +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) +{ + eltwise_vadd(a_in, b_in, c_out, size); +} + +} // extern "C" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6d047b4957..5cbdc0ef64 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie==v1.2.1 +llvm-aie + # Quality of life netron debugpy From d039415104bea6c78bb23c7517efd59fe414b5f7 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 17 Mar 2026 14:14:38 +0100 Subject: [PATCH 03/16] Add XDNA container --- .gitignore | 3 +- Container/Dockerfile.deeploy-xdna | 58 ++++++++++++++++++++++++++++ DeeployTest/Platforms/XDNA2/main.cpp | 14 +++++-- README_XDNA.md | 30 ++++++++++++++ 4 files changed, 101 insertions(+), 4 deletions(-) 
create mode 100644 Container/Dockerfile.deeploy-xdna create mode 100644 README_XDNA.md diff --git a/.gitignore b/.gitignore index 7ffc9ca243..a9993aac54 100644 --- a/.gitignore +++ b/.gitignore @@ -59,4 +59,5 @@ CHANGELOG_GEN.md .cache/ # Claude context file -CLAUDE.md \ No newline at end of file +CLAUDE.md +Container/xrt-debs/ diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna new file mode 100644 index 0000000000..f39d1df3ed --- /dev/null +++ b/Container/Dockerfile.deeploy-xdna @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +FROM ubuntu:24.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +ENV LLVM_INSTALL_DIR="nope" + +RUN apt-get update && apt-get install -y \ + software-properties-common \ + && add-apt-repository -y ppa:amd-team/xrt \ + && apt-get update && apt-get install -y \ + cmake \ + ninja-build \ + g++ \ + git \ + git-lfs \ + python3 \ + python3-pip \ + python-is-python3 \ + uuid-dev \ + wget \ + curl \ + ccache \ + libxrt2 \ + libxrt-npu2 \ + libxrt-dev \ + libxrt-utils \ + libxrt-utils-npu \ + && rm -rf /var/lib/apt/lists/* + +ENV XILINX_XRT=/opt/xilinx/xrt +ENV PATH=${XILINX_XRT}/bin:${PATH} +ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib + + +WORKDIR /app +COPY pyproject.toml ./ +RUN pip install toml-to-requirements && \ + toml-to-req --toml-file pyproject.toml && \ + pip install -r requirements.txt && \ + rm -f requirements.txt pyproject.toml + +RUN pip install \ + --extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 \ + --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ + "mlir_aie==v1.2.1" \ + llvm-aie + +ENV MLIR_AIE_PYTHON=/usr/bin/python3 +ENV IRON_OPERATORS_DIR=/usr/lib/python3/dist-packages/aie/iron/operators + +WORKDIR /app/Deeploy diff --git a/DeeployTest/Platforms/XDNA2/main.cpp 
b/DeeployTest/Platforms/XDNA2/main.cpp index 07ffb7a0ca..046384e4db 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -79,9 +79,17 @@ static std::vector read_instr_binary(const std::string &path) int main(int argc, char **argv) { - // Paths to the compiled artefacts (relative to the binary's working dir) - std::string xclbin_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/network.xclbin"; - std::string instr_path = "/scratch/jungvi/Deeploy/DeeployTest/TEST_XDNA2/build_master/bin/npu_insts.bin"; + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; bool verbose = false; for (int i = 1; i < argc; ++i) { diff --git a/README_XDNA.md b/README_XDNA.md new file mode 100644 index 0000000000..a96a3550c8 --- /dev/null +++ b/README_XDNA.md @@ -0,0 +1,30 @@ +# How to use Deeploy on the XDNA2 NPU + +A dockerfile containing everything required to run on XDNA2 is available to build with the dockerfile at `Container/Dockerfile.deeploy-xdna`. + +You can build it locally on Ubuntu 24.04 with: +``` +docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . +``` + +You need to have XRT installed on your host, once installed it is present in `/opt/xilinx/xrt`. 
You can run the docker container previously built with: +``` +docker run -it \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /scratch/jungvi/IRON:/opt/IRON \ + -e IRON_OPERATORS_DIR=/opt/IRON/iron/operators \ + -v "$(pwd)":/app/Deeploy \ + -v /opt/xilinx:/opt/xilinx \ + --name deeploy_dev \ + deeploy-xdna:local +``` + +Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. + +Once the container is started you can a simple Add node, from ONNX to execution with: +``` +pip install -e ./ && \ +cd DeeployTest && \ +python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ +``` \ No newline at end of file From e66864a75a9b792fbd5198bb96c1054010267c6b Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 11:25:30 +0100 Subject: [PATCH 04/16] First attempt at generating MLIR code with Deeploy --- Container/Dockerfile.deeploy-xdna | 1 - Deeploy/MLIRDataTypes.py | 83 ++++++++ Deeploy/Targets/XDNA2/Deployer.py | 190 +++++------------ .../Targets/XDNA2/Templates/AddTemplate.py | 201 +++++++++++++----- 4 files changed, 287 insertions(+), 188 deletions(-) create mode 100644 Deeploy/MLIRDataTypes.py diff --git a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna index f39d1df3ed..fd62657740 100644 --- a/Container/Dockerfile.deeploy-xdna +++ b/Container/Dockerfile.deeploy-xdna @@ -53,6 +53,5 @@ RUN pip install \ llvm-aie ENV MLIR_AIE_PYTHON=/usr/bin/python3 -ENV IRON_OPERATORS_DIR=/usr/lib/python3/dist-packages/aie/iron/operators WORKDIR /app/Deeploy diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py new file mode 100644 index 0000000000..2091307858 --- /dev/null +++ b/Deeploy/MLIRDataTypes.py @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: 
Apache-2.0 +"""Base class for MLIR-emitting node templates. + +This module provides :class:`MLIRNodeTemplate`, a :class:`NodeTemplate` +subclass whose ``generate()`` method produces an MLIR string instead of C +code. Concrete subclasses override :meth:`emit` to populate an +``mlir.ir.Module`` using dialect-specific Python bindings (e.g. +``aie.dialects`` for the XDNA2 backend). + +The class is intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse the same base. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING + +from Deeploy.DeeployTypes import NodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation + + +class MLIRNodeTemplate(NodeTemplate): + """NodeTemplate subclass that emits MLIR instead of C code. + + Subclasses must override :meth:`emit` to add dialect operations to an + ``mlir.ir.Module`` (or region / insertion point provided via *kwargs*). + + ``generate()`` is overridden as a convenience that constructs a + standalone module, calls :meth:`emit`, and returns the MLIR text. + The base-class ``alignToContext`` / ``hoistTransientBuffers`` hooks are + retained and work unchanged. + """ + + def __init__(self): + # Empty Mako template — no C code is generated. + super().__init__("") + + # ------------------------------------------------------------------ + # Subclass API + # ------------------------------------------------------------------ + + @abstractmethod + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Populate an MLIR module with the operations for this node. + + The caller (typically the deployer) sets up an ``mlir.ir.Module`` + with the appropriate device wrapper and passes dialect-specific + context through *kwargs* (e.g. insertion point, tile references, + ObjectFifo handles). 
+ + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + The parser's node representation (buffer names, sizes, types …). + **kwargs + Dialect-specific context provided by the deployer. + """ + ... + + # ------------------------------------------------------------------ + # NodeTemplate overrides + # ------------------------------------------------------------------ + + def generate(self, operatorRepresentation={}, **kwargs) -> str: + """Generate an MLIR string for this node. + + This default implementation is a thin wrapper: it delegates to + :meth:`emit`. Deployers that need to build a single module from + multiple nodes should call :meth:`emit` directly with the shared + module context and then stringify the complete module themselves. + + Returns + ------- + str + MLIR text (printable module or fragment). + """ + self.emit(operatorRepresentation, **kwargs) + return "" diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 7aa77668eb..7df9a1976d 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -1,49 +1,38 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 +"""XDNA2 deployer — generates mlir-aie MLIR using ``aie.dialects``. + +Unlike other Deeploy deployers that generate C code via Mako templates, +this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations +and returns the verified MLIR text. 
+""" + +from __future__ import annotations -import os -import subprocess -import tempfile from typing import Callable, Dict, Optional, Type import onnx_graphsurgeon as gs +from aie.extras.context import mlir_mod_ctx +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +import aie.ir as ir + from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log -from Deeploy.Targets.XDNA2.Templates.AddTemplate import XDNA2NodeTemplate - -# JUNGVI: Will be removed once Deeploy generates it's own MLIR - -# Default path to the mlir-aie Python environment. -# Can be overridden via the MLIR_AIE_PYTHON env variable. -_DEFAULT_IRON_PYTHON = os.environ.get( - "MLIR_AIE_PYTHON", - "/scratch/jungvi/micromamba/envs/iron/bin/python", -) - -# Path to the IRON design scripts shipped with mlir-aie examples. -# Can be overridden via the IRON_OPERATORS_DIR env variable. -_DEFAULT_IRON_OPERATORS_DIR = os.environ.get( - "IRON_OPERATORS_DIR", - "/scratch/jungvi/IRON/iron/operators", -) +from Deeploy.MLIRDataTypes import MLIRNodeTemplate class XDNA2Deployer(SignPropDeployer): """Deployer for the XDNA2 (AIE2p) platform. - Unlike other Deeploy deployers that generate C code, this deployer - generates an mlir-aie MLIR module. The MLIR is produced by invoking the - IRON operator ``design.py`` scripts as subprocesses (using the mlir-aie - Python environment) so that the main Deeploy environment does not need to - have ``aie.iron`` installed. - - It also writes ``testinputs.h`` and ``testoutputs.h`` via the XDNA2 - generation script so the XRT C++ testbench can be compiled against - known-good golden values. + Generates an mlir-aie MLIR module by calling :meth:`emit` / + :meth:`emitRuntimeSequence` on each bound :class:`MLIRNodeTemplate`. 
+ The module is verified via MLIR's built-in verifier before being + returned as a string. """ def __init__(self, @@ -55,22 +44,7 @@ def __init__(self, name: str = 'DeeployNetwork', default_channels_first: bool = False, deeployStateDir: str = "DeeployStateDir", - inputOffsets: Optional[Dict[str, int]] = None, - iron_python: Optional[str] = None, - iron_operators_dir: Optional[str] = None): - """ - Parameters - ---------- - iron_python : str, optional - Path to the Python interpreter in the mlir-aie (IRON) environment. - Defaults to ``MLIR_AIE_PYTHON`` env variable or - ``/scratch/jungvi/micromamba/envs/iron/bin/python``. - iron_operators_dir : str, optional - Path to the IRON operators directory containing per-operator - ``design.py`` scripts. - Defaults to ``IRON_OPERATORS_DIR`` env variable or - ``/scratch/jungvi/IRON/iron/operators``. - """ + inputOffsets: Optional[Dict[str, int]] = None): super().__init__( graph, deploymentPlatform, @@ -82,8 +56,6 @@ def __init__(self, deeployStateDir = deeployStateDir, inputOffsets = inputOffsets if inputOffsets is not None else {}, ) - self._iron_python = iron_python or _DEFAULT_IRON_PYTHON - self._iron_operators_dir = iron_operators_dir or _DEFAULT_IRON_OPERATORS_DIR # ------------------------------------------------------------------ # MLIR generation @@ -92,113 +64,65 @@ def __init__(self, def generateMLIR(self) -> str: """Generate an mlir-aie MLIR module for the prepared graph. - Iterates over ``self.layerBinding``, extracts AIE parameters from each - bound template, and calls the corresponding IRON ``design.py`` script - as a subprocess. Currently only a single BF16 Add node is supported. + Iterates over bound layers, calls each template's ``emit()`` + to construct AIE operations, adds a ``runtime_sequence`` for + host-side DMA, verifies the module, and returns the MLIR text. Returns ------- str - MLIR module string (ready to be written to ``network.mlir``). 
- - Raises - ------ - RuntimeError - If the graph contains unsupported operators or if the IRON - subprocess fails. + Verified MLIR module string. """ assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" - mlir_parts = [] - + # Collect templates and their operator representations + nodes = [] for node_name, layer in self.layerBinding.items(): mapper = layer.mapper template = mapper.binder.template op_repr = mapper.parser.operatorRepresentation - if not isinstance(template, XDNA2NodeTemplate): + if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( - f"Node '{node_name}' has no XDNA2NodeTemplate — " + f"Node '{node_name}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") - aie_params = template.getAIEParams(op_repr) - log.info(f"[XDNA2] Generating MLIR for node '{node_name}' " - f"with params: {aie_params}") - - mlir_str = self._generate_add_mlir(aie_params) - mlir_parts.append(mlir_str) + nodes.append((node_name, template, op_repr)) - if not mlir_parts: - raise RuntimeError("No bound layers found in graph — cannot generate MLIR.") + if not nodes: + raise RuntimeError("No bound layers found — cannot generate MLIR.") - # For a single-node graph the MLIR is just the one module. - # Multi-node support would require merging modules. - return mlir_parts[0] + # Build the MLIR module + with mlir_mod_ctx() as ctx: - def _generate_add_mlir(self, aie_params: dict) -> str: - """Call the IRON elementwise_add design.py to produce MLIR. + @aie_d.device(aie_d.AIEDevice.npu2) + def _device(): + compute_tile = aie_d.tile(0, 2) # JUNGVI: This will have to change when we deploy on the whole array + shim_tile = aie_d.tile(0, 0) - Parameters - ---------- - aie_params : dict - Dict with keys: num_elements, n_cols, n_channels, tile_size, trace_size. 
+ # Emit each node's operations (ObjectFifos, core, kernel decls) + for node_name, template, op_repr in nodes: + log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'") + template.emit(op_repr, + compute_tile=compute_tile, + shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly? - Returns - ------- - str - MLIR module string. - """ - design_script = os.path.join( - self._iron_operators_dir, "elementwise_add", "design.py" - ) - - if not os.path.isfile(design_script): - raise RuntimeError( - f"IRON design script not found: {design_script}\n" - f"Set IRON_OPERATORS_DIR to point to the IRON operators directory.") - - if not os.path.isfile(self._iron_python): - raise RuntimeError( - f"IRON Python interpreter not found: {self._iron_python}\n" - f"Set MLIR_AIE_PYTHON to the mlir-aie Python interpreter.") - - with tempfile.NamedTemporaryFile(suffix=".mlir", delete=False) as tmp: - output_path = tmp.name - - try: - cmd = [ - self._iron_python, - design_script, - "--dev", "npu2", - "--length", str(aie_params['num_elements']), - "--columns", str(aie_params['n_cols']), - "--channels", str(aie_params['n_channels']), - "--tile-size", str(aie_params['tile_size']), - "--trace-size", str(aie_params['trace_size']), - "--output-file-path", output_path, - ] - - log.debug(f"[XDNA2] Running: {' '.join(cmd)}") - - result = subprocess.run( - cmd, - check=False, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - raise RuntimeError( - f"IRON design.py failed (exit {result.returncode}):\n" - f" cmd: {' '.join(cmd)}\n" - f" stdout: {result.stdout}\n" - f" stderr: {result.stderr}") + # Runtime sequence: collect tensor types from all nodes' I/O + # For now (single-node), derive from the first node. 
+ _, first_template, first_op_repr = nodes[0] + params = first_template.getAIEParams(first_op_repr) + num_elements = params['num_elements'] + tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) - with open(output_path, 'r') as f: - mlir_str = f.read() + @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def _seq(*args): + for _, template, op_repr in nodes: + template.emitRuntimeSequence(op_repr, list(args)) - finally: - if os.path.exists(output_path): - os.unlink(output_path) + module = ctx.module + assert module.operation.verify(), \ + "[XDNA2] Generated MLIR module failed verification" + mlir_str = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlir_str)} bytes)") return mlir_str diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 050413eedc..47dcb41d10 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,81 +1,174 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 +"""XDNA2 MLIR template for BF16 elementwise Add. -from Deeploy.DeeployTypes import NodeTemplate +Uses ``aie.dialects`` (from the pip-installed ``mlir-aie`` package) to emit +verified MLIR operations into an existing module context provided by the +:class:`XDNA2Deployer`. +""" +from __future__ import annotations -class XDNA2NodeTemplate(NodeTemplate): - """Base class for XDNA2 templates. +from typing import TYPE_CHECKING - Temporary Feature: - Unlike Mako-based templates for C code, XDNA2 templates do not produce - code snippets. Instead they store AIE kernel metadata that the - XDNA2Deployer reads when generating the holistic MLIR module. 
+from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +from aie.dialects import arith as arith_d +from aie.dialects import func as func_d +from aie.dialects import scf as scf_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRNodeTemplate + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import OperatorRepresentation + + +class XDNA2AddTemplate(MLIRNodeTemplate): + """MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + + The :meth:`emit` method constructs a single-core AIE program with: + + * Two input ObjectFifos and one output ObjectFifo (depth 2 for + double-buffering). + * A compute core that loops, acquiring / releasing FIFO elements and + calling the vectorised ``eltwise_add_bf16_vector`` kernel. + * A runtime sequence that configures shim DMA for L3 ↔ L1 transfers. + + Parameters are extracted from the *operatorRepresentation* populated + by the parser (``size`` = total number of BF16 elements). """ - def __init__(self, kernel_fn_name: str, kernel_obj: str, kernel_src: str, tile_size: int = 1024): - """Initialize an XDNA2NodeTemplate. + KERNEL_FN = "eltwise_add_bf16_vector" + KERNEL_OBJ = "add.o" + MAX_TILE_SIZE = 1024 - Parameters - ---------- - kernel_fn_name : str - Name of the AIE C++ kernel function (e.g. "eltwise_add_bf16_vector"). - kernel_obj : str - Compiled kernel object file name (e.g. "add.o"). - kernel_src : str - Kernel source file name relative to TargetLibraries/XDNA2/kernels/ - (e.g. "add.cc"). - tile_size : int - Number of elements per tile (default 1024, max 4096). - """ - # Empty Mako template — no C code is generated per node. - super().__init__("") - self.kernel_fn_name = kernel_fn_name - self.kernel_obj = kernel_obj - self.kernel_src = kernel_src - self.tile_size = tile_size + def __init__(self): + super().__init__() - def getAIEParams(self, operatorRepresentation: dict) -> dict: - """Return the aie.iron parameters for this node. 
+ # ------------------------------------------------------------------ + # Parameter helpers + # ------------------------------------------------------------------ - Parameters - ---------- - operatorRepresentation : dict - The operator representation dict produced by the parser. + def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict: + """Extract AIE parameters from the operator representation. Returns ------- dict - Parameters to pass to the corresponding aie.iron design function. + ``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and + ensuring divisibility). """ - raise NotImplementedError - - -class XDNA2AddTemplate(XDNA2NodeTemplate): - """XDNA2 template for BF16 elementwise Add.""" - - def __init__(self): - super().__init__( - kernel_fn_name = "eltwise_add_bf16_vector", - kernel_obj = "add.o", - kernel_src = "add.cc", - tile_size = 1024, - ) - - def getAIEParams(self, operatorRepresentation: dict) -> dict: num_elements = int(operatorRepresentation['size']) - tile_size = min(num_elements, self.tile_size) - # Ensure num_elements is divisible by tile_size + tile_size = min(num_elements, self.MAX_TILE_SIZE) if num_elements % tile_size != 0: tile_size = 1 return { 'num_elements': num_elements, - 'n_cols': 1, - 'n_channels': 1, 'tile_size': tile_size, - 'trace_size': 0, } + # ------------------------------------------------------------------ + # MLIR emission + # ------------------------------------------------------------------ + + def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: + """Add AIE operations for a BF16 Add node into the current device context. + + Must be called inside an ``@aie_d.device(...)`` region (the deployer + sets this up). 
The following keyword arguments are expected: + + * ``compute_tile`` — result of ``aie_d.tile(col, row)`` + * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` + """ + params = self.getAIEParams(operatorRepresentation) + num_elements = params['num_elements'] + tile_size = params['tile_size'] + num_tiles = num_elements // tile_size + + compute_tile = kwargs['compute_tile'] + shim_tile = kwargs['shim_tile'] + + # MemRef types + tile_ty = ir.MemRefType.get((tile_size,), ir.BF16Type.get()) + i32 = ir.IntegerType.get_signless(32) + + # ObjectFifos (depth 2 for double-buffering) + aie_d.object_fifo("in1_0", shim_tile, [compute_tile], 2, tile_ty) + aie_d.object_fifo("in2_0", shim_tile, [compute_tile], 2, tile_ty) + aie_d.object_fifo("out_0", compute_tile, [shim_tile], 2, tile_ty) + + # External kernel declaration + aie_d.external_func(self.KERNEL_FN, [tile_ty, tile_ty, tile_ty, i32]) + + # Compute core + @aie_d.core(compute_tile, link_with=self.KERNEL_OBJ) + def _core(): + subview_ty = aie_d.ObjectFifoSubviewType.get(tile_ty) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, num_tiles, 1): + acq_in1 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in1_0", 1) + elem_in1 = aie_d.objectfifo_subview_access(tile_ty, acq_in1, 0) + acq_in2 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in2_0", 1) + elem_in2 = aie_d.objectfifo_subview_access(tile_ty, acq_in2, 0) + acq_out = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Produce, "out_0", 1) + elem_out = aie_d.objectfifo_subview_access(tile_ty, acq_out, 0) + size_val = arith_d.constant(i32, tile_size) + func_d.call([], self.KERNEL_FN, [elem_in1, elem_in2, elem_out, size_val]) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in1_0", 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in2_0", 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, "out_0", 1) + scf_d.yield_([]) + scf_d.yield_([]) + + def emitRuntimeSequence(self, 
operatorRepresentation: OperatorRepresentation, + seq_args: list) -> None: + """Emit DMA configuration inside a runtime_sequence block. + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Node representation (used to extract ``num_elements``). + seq_args : list + Block arguments of the runtime_sequence (memref values for + in1, in2, out — in the order matching the ONNX graph I/O). + """ + params = self.getAIEParams(operatorRepresentation) + num_elements = params['num_elements'] + + dims = [ + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=num_elements, stride=1), + ] + + in1, in2, out = seq_args[0], seq_args[1], seq_args[2] + + task_in1 = aiex_d.dma_configure_task_for("in1_0") + block_in1 = task_in1.body.blocks.append() + with ir.InsertionPoint(block_in1): + aie_d.dma_bd(in1, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_in1) + + task_in2 = aiex_d.dma_configure_task_for("in2_0") + block_in2 = task_in2.body.blocks.append() + with ir.InsertionPoint(block_in2): + aie_d.dma_bd(in2, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_in2) + + task_out = aiex_d.dma_configure_task_for("out_0", issue_token=True) + block_out = task_out.body.blocks.append() + with ir.InsertionPoint(block_out): + aie_d.dma_bd(out, offset=0, len=num_elements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task_out) + aiex_d.dma_await_task(task_out) + aiex_d.dma_free_task(task_in1) + aiex_d.dma_free_task(task_in2) + referenceTemplate = XDNA2AddTemplate() From d8548468a1d4cfd7f1398d1d5a975d5be5e01da5 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 13:26:15 +0100 Subject: [PATCH 05/16] Generate tiled code but too much logic is in the Template --- Deeploy/Targets/XDNA2/Deployer.py | 27 ++++-- 
Deeploy/Targets/XDNA2/Platform.py | 89 ++++++++++++++++++- .../Targets/XDNA2/Templates/AddTemplate.py | 66 ++++++++++++-- Deeploy/Targets/XDNA2/Tiler.py | 16 ++++ DeeployTest/deeployRunner_xdna2.py | 2 +- DeeployTest/generateNetwork_xdna2.py | 58 +++++++++--- DeeployTest/testUtils/core/execution.py | 6 +- DeeployTest/testUtils/deeployRunner.py | 8 ++ DeeployTest/testUtils/platformMapping.py | 5 +- DeeployTest/testUtils/testRunner.py | 25 +++++- 10 files changed, 266 insertions(+), 36 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/Tiler.py diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 7df9a1976d..34fd1b52b5 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -24,6 +24,7 @@ from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MLIRDataTypes import MLIRNodeTemplate +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint class XDNA2Deployer(SignPropDeployer): @@ -67,6 +68,10 @@ def generateMLIR(self) -> str: Iterates over bound layers, calls each template's ``emit()`` to construct AIE operations, adds a ``runtime_sequence`` for host-side DMA, verifies the module, and returns the MLIR text. + + If tiling is enabled (patternMemoryConstraint available), passes + tiling information to templates to generate tiled transfers and + compute kernels. 
Returns ------- @@ -81,13 +86,17 @@ def generateMLIR(self) -> str: mapper = layer.mapper template = mapper.binder.template op_repr = mapper.parser.operatorRepresentation + + # Check if tiling is enabled by looking for patternMemoryConstraint + executionBlock = mapper.binder.executionBlock + tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( f"Node '{node_name}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") - nodes.append((node_name, template, op_repr)) + nodes.append((node_name, template, op_repr, tilingConstraint)) if not nodes: raise RuntimeError("No bound layers found — cannot generate MLIR.") @@ -101,23 +110,25 @@ def _device(): shim_tile = aie_d.tile(0, 0) # Emit each node's operations (ObjectFifos, core, kernel decls) - for node_name, template, op_repr in nodes: - log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'") + for node_name, template, op_repr, tilingConstraint in nodes: + log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" + + (" with tiling" if tilingConstraint else "")) template.emit(op_repr, compute_tile=compute_tile, - shim_tile=shim_tile) # JUNGVI: What should be the interface of the MLIR template emission exactly? + shim_tile=shim_tile, + tilingConstraint=tilingConstraint) # Pass tiling info # Runtime sequence: collect tensor types from all nodes' I/O # For now (single-node), derive from the first node. 
- _, first_template, first_op_repr = nodes[0] - params = first_template.getAIEParams(first_op_repr) + _, first_template, first_op_repr, first_tilingConstraint = nodes[0] + params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint) num_elements = params['num_elements'] tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def _seq(*args): - for _, template, op_repr in nodes: - template.emitRuntimeSequence(op_repr, list(args)) + for _, template, op_repr, tilingConstraint in nodes: + template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint) module = ctx.module assert module.operation.verify(), \ diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py index 82ef1ec3d2..4a186aca7c 100644 --- a/Deeploy/Targets/XDNA2/Platform.py +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -2,19 +2,34 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ - StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ + NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Layers import AddLayer from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.Parsers import AddParser from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings +# Standard mapper for non-tiled deployment XDNA2AddMapper = 
NodeMapper(AddParser(), XDNA2AddBindings) +# Tiling-ready mapper for tiled deployment +XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) + +# Standard mapping (used when tiling is disabled) XDNA2Mapping = { 'Add': AddLayer([XDNA2AddMapper]), } +# Tiling-ready mapping (used when tiling is enabled) +XDNA2TilingMapping = { + 'Add': AddLayer([XDNA2AddTilableMapper]), +} + # Buffer classes reuse Generic templates since XDNA2Deployer manages its own # output format (MLIR + test headers) and these templates are never rendered. @@ -56,6 +71,21 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = super().__init__(name, Mapping, initCode, includeList) +class XDNA2AIECoreEngine(DeploymentEngine): + """AIE core execution engine with L1 local memory as preferred memory level. + + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. + Data is transferred from L3 (shared memory) to L1 as needed. + """ + + def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "", + includeList = None, preferredMemoryLevel: str = "L1") -> None: + if includeList is None: + includeList = [] + super().__init__(name, Mapping, initCode, includeList) + self.preferredMemoryLevel = preferredMemoryLevel + + class XDNA2Platform(DeploymentPlatform): def __init__(self, @@ -67,3 +97,58 @@ def __init__(self, if engines is None: engines = [XDNA2Engine()] super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryXDNA2Platform(MemoryPlatform): + """XDNA2 platform with memory hierarchy support for tiling. 
+ + Defines the memory hierarchy: + - L1: 8KB per AIE core (local memory) + - L3: Shared memory for entire AIE array + """ + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + engines = None, + variableBuffer = XDNA2VariableBuffer, + constantBuffer = XDNA2ConstantBuffer, + structBuffer = XDNA2StructBuffer, + transientBuffer = XDNA2TransientBuffer) -> None: + if engines is None: + engines = [XDNA2AIECoreEngine()] + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node. + + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). + Otherwise use the default target memory level (typically L3). + """ + # Check if node has an engine assignment + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name + + +class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): + """Wrapper for XDNA2Platform with memory-level support.""" + + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel): + assert isinstance(platform, XDNA2Platform), \ + f"Given platform is not an instance of XDNA2Platform. 
Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + """Get the target memory level for a tensor in a given node.""" + if hasattr(node, '_engine_assignment'): + engine = node._engine_assignment + if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): + return engine.preferredMemoryLevel + + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 47dcb41d10..3a62d6f757 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -12,6 +12,8 @@ from typing import TYPE_CHECKING +import numpy as np + from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d from aie.dialects import arith as arith_d @@ -51,19 +53,56 @@ def __init__(self): # Parameter helpers # ------------------------------------------------------------------ - def getAIEParams(self, operatorRepresentation: OperatorRepresentation) -> dict: + def getAIEParams(self, operatorRepresentation: OperatorRepresentation, + tilingConstraint=None) -> dict: """Extract AIE parameters from the operator representation. + + If tilingConstraint is available (tiling enabled), use information + from it. Otherwise fall back to fixed tile sizes. + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Parsed operator representation containing 'size' (total elements). + tilingConstraint : PatternMemoryConstraints, optional + Tiling solution from the solver. If provided, tile size is derived + from the tiling solution. Returns ------- dict - ``num_elements``, ``tile_size`` (clamped to MAX_TILE_SIZE and - ensuring divisibility). + ``num_elements``, ``tile_size`` (from tiling solution if available, + otherwise clamped to MAX_TILE_SIZE). 
""" num_elements = int(operatorRepresentation['size']) - tile_size = min(num_elements, self.MAX_TILE_SIZE) + + # If tiling is enabled, extract tile size from the tiling solution + if tilingConstraint is not None: + # tilingConstraint is a PatternMemoryConstraints with nodeConstraints + nodeConstraint = tilingConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + # Get the first output tensor's L1 memory constraint (tile shape) + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + # Use L1 constraint which holds the tile shape for the AIE core + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tile_size = int(np.prod(l1Constraint.shape)) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + else: + tile_size = min(num_elements, self.MAX_TILE_SIZE) + if num_elements % tile_size != 0: - tile_size = 1 + # Round down to the largest divisor of num_elements that fits + tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0) + return { 'num_elements': num_elements, 'tile_size': tile_size, @@ -81,8 +120,17 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None * ``compute_tile`` — result of ``aie_d.tile(col, row)`` * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` + * ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution + + Parameters + ---------- + operatorRepresentation : OperatorRepresentation + Parsed operator representation with 'size' and other attributes + **kwargs + compute_tile, shim_tile, tilingConstraint (optional) """ - params = self.getAIEParams(operatorRepresentation) + tilingConstraint = kwargs.get('tilingConstraint', None) + params = 
self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) num_elements = params['num_elements'] tile_size = params['tile_size'] num_tiles = num_elements // tile_size @@ -123,7 +171,7 @@ def _core(): scf_d.yield_([]) def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, - seq_args: list) -> None: + seq_args: list, tilingConstraint=None) -> None: """Emit DMA configuration inside a runtime_sequence block. Parameters @@ -133,8 +181,10 @@ def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, seq_args : list Block arguments of the runtime_sequence (memref values for in1, in2, out — in the order matching the ONNX graph I/O). + tilingConstraint : NodeMemoryConstraint, optional + Tiling solution from the solver (currently ignored, for future use). """ - params = self.getAIEParams(operatorRepresentation) + params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) num_elements = params['num_elements'] dims = [ diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py new file mode 100644 index 0000000000..9754aa0688 --- /dev/null +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +"""XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" + +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +# For Add operator, reuse the generic BOP (Binary Operator) tile constraint +# which handles equal-dimension binary operations +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings=XDNA2AddBindings, + tileConstraint=AddTileConstraint() +) diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py index 
9d4f27a477..82be139d46 100644 --- a/DeeployTest/deeployRunner_xdna2.py +++ b/DeeployTest/deeployRunner_xdna2.py @@ -14,4 +14,4 @@ from testUtils.deeployRunner import main if __name__ == '__main__': - sys.exit(main(default_platform="XDNA2", default_simulator="host")) + sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 69af71f429..995eaabbb7 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -22,14 +22,21 @@ import onnx import onnx_graphsurgeon as gs -from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.platformMapping import mapDeployer from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferTypeAndOffset from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import bfloat16_t -from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, XDNA2AIECoreEngine, XDNA2TilingMapping +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def _tilingScheduler(graph: gs.Graph): + """Scheduler that returns List[List[gs.Node]] as required by the tiling framework.""" + return [[node] for node in graph.nodes] def _float32_to_bf16_uint16(arr: np.ndarray) -> np.ndarray: @@ -122,9 +129,6 @@ def generateNetworkXDNA2(args): test_inputs_f32 = [inputs_npz[x] for x in inputs_npz.files] test_outputs_f32 = [outputs_npz[x] for x in outputs_npz.files] - # XDNA2 is a non-signprop platform: signProp = False - platform, signProp = mapPlatform(args.platform) - inputTypes = {} inputOffsets = {} @@ -139,14 +143,45 
@@ def generateNetworkXDNA2(args): _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") - deployer = mapDeployer(platform, + # Define memory hierarchy: L1 (AIE core local) and L3 (shared) + l1_size = int(getattr(args, 'l1', None) or 8192) # 8KB default + l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default + + log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") + + l1_level = MemoryLevel("L1", neighbourNames=["L3"], size=l1_size) + l3_level = MemoryLevel("L3", neighbourNames=["L1"], size=l3_size) + memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) + memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 + + # Create memory-aware platform with AIE core engine + # defaultTargetMemoryLevel=L1 tells the tiling framework that computation + # targets L1, so it must tile data from L3 into L1-sized chunks. + mem_platform = MemoryXDNA2Platform( + memoryHierarchy=memory_hierarchy, + defaultTargetMemoryLevel=l1_level, + engines=[XDNA2AIECoreEngine(Mapping=XDNA2TilingMapping, preferredMemoryLevel="L1")] + ) + + # Create base deployer with memory platform + deployer = mapDeployer(mem_platform, graph, inputTypes, + scheduler=_tilingScheduler, deeployStateDir=_DEEPLOYSTATEDIR, inputOffsets=inputOffsets) - # Prepare the deployer (type checking + binding) - deployer.prepare(_NoVerbosity) + # Wrap with MemoryDeployerWrapper (adds memory level annotation) + deployer = MemoryDeployerWrapper(deployer) + + # Wrap with TilerDeployerWrapper (adds tiling) + deployer = TilerDeployerWrapper(deployer, workDir=_DEEPLOYSTATEDIR) + + # frontEnd() parses the graph; bind() triggers tiling via wrappers + deployer.frontEnd() + deployer.bind() + deployer.prepared = True + log.info("[XDNA2] Tiling completed, proceeding with MLIR generation") # Create output directory os.makedirs(args.dumpdir, exist_ok=True) @@ -180,8 +215,9 @@ def generateNetworkXDNA2(args): if __name__ == '__main__': - parser = 
TestGeneratorArgumentParser(description="Deeploy XDNA2 Code Generation Utility.") - args = parser.parse_args() + parser = TestGeneratorArgumentParser(tiling_arguments=True, + description="Deeploy XDNA2 Code Generation Utility.") + args, _ = parser.parse_known_args() if args.platform != 'XDNA2': parser.error(f"This script is for the XDNA2 platform. Got: {args.platform}") diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 572df44be1..a259c93ad7 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -27,10 +27,10 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent - if config.tiling: - generation_script = script_dir / "testMVP.py" - elif config.platform == "XDNA2": + if config.platform == "XDNA2": generation_script = script_dir / "generateNetwork_xdna2.py" + elif config.tiling: + generation_script = script_dir / "testMVP.py" else: generation_script = script_dir / "generateNetwork.py" diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 78d5ff9cd6..d8b76668dc 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -139,6 +139,12 @@ def __init__(self, type = int, default = 1024000, help = 'L2 size in bytes\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'L3 size in bytes\n') self.add_argument('--randomizedMemoryScheduler', action = "store_true", help = 'Enable randomized memory scheduler\n') @@ -221,6 +227,8 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append(f"--l1={args.l1}") if hasattr(args, 'l2') and args.l2 and args.l2 != 1024000: gen_args_list.append(f"--l2={args.l2}") + if hasattr(args, 'l3') and args.l3: + gen_args_list.append(f"--l3={args.l3}") if hasattr(args, 'randomizedMemoryScheduler') and 
args.randomizedMemoryScheduler: gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 28425393cb..eaa9b2503f 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -30,7 +30,8 @@ from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer -from Deeploy.Targets.XDNA2.Platform import XDNA2Optimizer, XDNA2Platform +from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, XDNA2Optimizer, \ + XDNA2Platform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] _NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] @@ -278,7 +279,7 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, XDNA2Platform): + elif isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): if loweringOptimizer is None: loweringOptimizer = XDNA2Optimizer diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..e233cc9b1d 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -61,7 +61,7 @@ def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int class TestGeneratorArgumentParser(argparse.ArgumentParser): - def __init__(self, description = None): + def __init__(self, tiling_arguments: bool = False, description = None): formatter = _ArgumentDefaultMetavarTypeFormatter @@ -70,6 +70,8 @@ def __init__(self, description = None): else: super().__init__(description = description, formatter_class 
= formatter) + self.tiling_arguments = tiling_arguments + self.add_argument('-t', metavar = '', dest = 'dir', @@ -90,6 +92,27 @@ def __init__(self, description = None): help = 'Set the output dump folder\n') self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n') + # Tiling-related arguments (for XDNA2 and other tiled platforms) + if self.tiling_arguments: + self.add_argument('--l1', + metavar = '', + dest = 'l1', + type = int, + default = None, + help = 'Set L1 memory size in bytes (enables tiling if specified).\n') + self.add_argument('--l3', + metavar = '', + dest = 'l3', + type = int, + default = None, + help = 'Set L3 memory size in bytes.\n') + self.add_argument('--defaultMemLevel', + metavar = '', + dest = 'defaultMemLevel', + type = str, + default = "L3", + help = 'Set default memory level (default: L3)\n') + self.args = None def parse_args(self, args = None, namespace = None) -> argparse.Namespace: From 9f7db2667f4644465c0c3a0bf2d911cbabcf625e Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 14:37:48 +0100 Subject: [PATCH 06/16] Move data movement in passes. 
Template represent for loop and acquire/release --- Deeploy/MLIRDataTypes.py | 140 ++++++++++ Deeploy/Targets/XDNA2/Bindings.py | 28 ++- .../MLIRObjectFifoPass.py | 143 +++++++++++ .../MLIRRuntimeSequencePass.py | 93 +++++++ .../CodeTransformationPasses/__init__.py | 6 + Deeploy/Targets/XDNA2/Deployer.py | 133 ++++++---- .../Targets/XDNA2/Templates/AddTemplate.py | 233 +++++------------- 7 files changed, 537 insertions(+), 239 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 2091307858..642fb6fef1 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -1,22 +1,30 @@ # SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""Base class for MLIR-emitting node templates. - -This module provides :class:`MLIRNodeTemplate`, a :class:`NodeTemplate` -subclass whose ``generate()`` method produces an MLIR string instead of C -code. Concrete subclasses override :meth:`emit` to populate an -``mlir.ir.Module`` using dialect-specific Python bindings (e.g. -``aie.dialects`` for the XDNA2 backend). - -The class is intentionally dialect-agnostic so that future MLIR-based -backends (NVGPU, Linalg, …) can reuse the same base. +"""Base classes for MLIR-emitting node templates and code transformations. + +This module provides: + +* :class:`MLIRNodeTemplate` — a :class:`NodeTemplate` subclass whose + ``emit()`` method populates an ``mlir.ir.Module`` instead of rendering C. +* :class:`MLIRExecutionBlock` — MLIR-specific execution state replacing the + C-oriented :class:`ExecutionBlock` (code-snippet deque) with MLIR builder + state (tile references, ObjectFifo handles, tiling parameters). 
+* :class:`MLIRCodeTransformationPass` — base class for MLIR code + transformation passes that operate on an :class:`MLIRExecutionBlock`. +* :class:`MLIRCodeTransformation` — two-phase pass container + (``devicePasses`` + ``runtimeSequencePasses``) that the deployer + orchestrates inside ``@aie_d.device`` and ``@aiex_d.runtime_sequence`` + regions respectively. + +All classes are intentionally dialect-agnostic so that future MLIR-based +backends (NVGPU, Linalg, …) can reuse them. """ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING +from typing import Any, Dict, List, Optional, TYPE_CHECKING, Tuple from Deeploy.DeeployTypes import NodeTemplate @@ -24,6 +32,116 @@ from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +# ====================================================================== +# MLIRExecutionBlock +# ====================================================================== + +class MLIRExecutionBlock: + """MLIR-specific execution state for a single operator. + + Replaces the C-oriented :class:`ExecutionBlock` (which holds a deque of + :class:`CodeSnippet` objects) with fields that carry MLIR builder state + through the code-transformation pipeline. + + Passes populate fields progressively: + + 1. The deployer sets ``computeTile``, ``shimTile``, + ``operatorRepresentation``, and ``patternMemoryConstraint``. + 2. A device-phase pass (e.g. ``MLIRObjectFifoPass``) fills + ``fifoMap``, ``fifoTypes``, ``tileSize``, ``numTiles``, + ``kernelFuncName``, and ``kernelObjFile``. + 3. The deployer sets ``runtimeSequenceArgs`` before the runtime- + sequence phase. + 4. A runtime-sequence pass (e.g. ``MLIRRuntimeSequencePass``) reads + all of the above to emit DMA configuration. 
+ """ + + def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: + # MLIR tile references (set by deployer) + self.computeTile: Any = computeTile + self.shimTile: Any = shimTile + + # Operator metadata (set by deployer from parser) + self.operatorRepresentation: OperatorRepresentation = {} + + # Tiling constraint from midend solver (may be None) + self.patternMemoryConstraint: Any = None + + # Populated by device-phase passes (e.g. MLIRObjectFifoPass) + self.fifoMap: Dict[str, str] = {} # tensor name → FIFO name + self.fifoTypes: Dict[str, Any] = {} # tensor name → MemRefType + self.tileSize: int = 0 + self.numTiles: int = 0 + self.numElements: int = 0 + self.kernelFuncName: Optional[str] = None + self.kernelObjFile: Optional[str] = None + + # Set by deployer before runtime-sequence phase + self.runtimeSequenceArgs: List[Any] = [] + + # Input / output tensor name lists (set by deployer from parser) + self.inputNames: List[str] = [] + self.outputNames: List[str] = [] + + +# ====================================================================== +# MLIRCodeTransformationPass / MLIRCodeTransformation +# ====================================================================== + +class MLIRCodeTransformationPass: + """Base class for passes that transform an :class:`MLIRExecutionBlock`. + + Subclasses override :meth:`apply` to read / mutate the block's fields + and optionally emit MLIR operations into the current insertion point. + """ + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + return ctxt, mlirBlock + + +class MLIRCodeTransformation: + """Two-phase pass container for MLIR code transformations. + + *devicePasses* run inside an ``@aie_d.device(...)`` region (ObjectFifo + creation, external-kernel declarations, …). + + *runtimeSequencePasses* run inside an ``@aiex_d.runtime_sequence`` + block (DMA configuration, token await, …). 
+ + The deployer calls :meth:`applyDevicePasses` and + :meth:`applyRuntimeSequencePasses` at the appropriate points. + """ + + def __init__(self, + devicePasses: Optional[List[MLIRCodeTransformationPass]] = None, + runtimeSequencePasses: Optional[List[MLIRCodeTransformationPass]] = None) -> None: + self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] + self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] + + def applyDevicePasses(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.devicePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + def applyRuntimeSequencePasses(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + for _pass in self.runtimeSequencePasses: + ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) + return ctxt, mlirBlock + + +# ====================================================================== +# MLIRNodeTemplate +# ====================================================================== + class MLIRNodeTemplate(NodeTemplate): """NodeTemplate subclass that emits MLIR instead of C code. 
diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index 68d7672787..e30bbc6646 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -4,14 +4,32 @@ from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import bfloat16_t -from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass from Deeploy.Targets.XDNA2.Templates import AddTemplate from Deeploy.Targets.XDNA2.TypeCheckers import XDNA2AddChecker -# XDNA2 does not use the standard C code transformation pipeline. -# The deployer generates a holistic MLIR module, not per-node C snippets. -# An empty CodeTransformation is used as a placeholder. 
-XDNA2Transformer = CodeTransformation([]) +_ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] +_ADD_OUTPUT_KEYS = ['data_out'] + +XDNA2Transformer = MLIRCodeTransformation( + devicePasses = [ + MLIRObjectFifoPass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + kernelFuncName = "eltwise_add_bf16_vector", + kernelObjFile = "add.o", + ), + ], + runtimeSequencePasses = [ + MLIRRuntimeSequencePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), + ], +) XDNA2AddBindings = [ NodeBinding( diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py new file mode 100644 index 0000000000..be6b492906 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that creates ObjectFifos and declares external kernels. + +Given an :class:`MLIRExecutionBlock` with ``computeTile``, ``shimTile``, +``operatorRepresentation``, and (optionally) ``patternMemoryConstraint``, +this pass: + +1. Derives ``tileSize`` and ``numTiles`` (from tiling solver or fallback). +2. Creates one ``aie_d.object_fifo`` per input tensor (shim → compute) + and one per output tensor (compute → shim), all with depth 2 + (double-buffering). +3. Declares the external kernel via ``aie_d.external_func``. +4. Stores FIFO names, types, and kernel metadata on the block for + downstream passes and the compute template. + +The pass is operator-agnostic — it only needs the tensor names and a +tile-size derivation function. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +import numpy as np + +from aie.dialects import aie as aie_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + +MAX_TILE_SIZE = 1024 + + +def _deriveTileSize(numElements: int, patternMemoryConstraint) -> int: + """Extract tile size from the tiling solution, or fall back to MAX_TILE_SIZE.""" + tileSize = min(numElements, MAX_TILE_SIZE) + + if patternMemoryConstraint is not None: + try: + nodeConstraint = patternMemoryConstraint.nodeConstraints[0] + outputConstraints = nodeConstraint.outputTensorMemoryConstraints + if outputConstraints: + firstOutputName = list(outputConstraints.keys())[0] + tensorConstraint = outputConstraints[firstOutputName] + if "L1" in tensorConstraint.memoryConstraints: + l1Constraint = tensorConstraint.memoryConstraints["L1"] + if l1Constraint.shape is not None: + tileSize = int(np.prod(l1Constraint.shape)) + except (AttributeError, IndexError, KeyError): + pass + + # Ensure tile_size evenly divides num_elements + if numElements % tileSize != 0: + tileSize = max(d for d in range(1, tileSize + 1) if numElements % d == 0) + + return tileSize + + +class MLIRObjectFifoPass(MLIRCodeTransformationPass): + """Create ObjectFifos and declare the external kernel. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors + (e.g. ``['data_in_1', 'data_in_2']``). + outputTensorKeys : list of str + Keys that name output tensors (e.g. ``['data_out']``). + kernelFuncName : str + Symbol name of the external AIE kernel function. + kernelObjFile : str + Object file to link with the AIE core (e.g. ``"add.o"``). + kernelArgTypes : callable, optional + A callable ``(tile_memref_type) -> list[ir.Type]`` that returns + the kernel's argument types. 
Defaults to + ``[tile_ty, tile_ty, tile_ty, i32]`` (suitable for binary + elementwise ops). + fifoDepth : int + ObjectFifo depth (default 2 for double-buffering). + """ + + def __init__(self, + inputTensorKeys: list, + outputTensorKeys: list, + kernelFuncName: str, + kernelObjFile: str, + kernelArgTypes=None, + fifoDepth: int = 2) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + self.kernelFuncName = kernelFuncName + self.kernelObjFile = kernelObjFile + self._kernelArgTypes = kernelArgTypes + self.fifoDepth = fifoDepth + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + opRepr = mlirBlock.operatorRepresentation + numElements = int(opRepr['size']) + tileSize = _deriveTileSize(numElements, mlirBlock.patternMemoryConstraint) + numTiles = numElements // tileSize + + mlirBlock.tileSize = tileSize + mlirBlock.numTiles = numTiles + mlirBlock.numElements = numElements + mlirBlock.kernelFuncName = self.kernelFuncName + mlirBlock.kernelObjFile = self.kernelObjFile + + tileTy = ir.MemRefType.get((tileSize,), ir.BF16Type.get()) + computeTile = mlirBlock.computeTile + shimTile = mlirBlock.shimTile + + # Create input ObjectFifos (shim → compute) + for idx, key in enumerate(self.inputTensorKeys): + fifoName = f"in{idx + 1}_0" + aie_d.object_fifo(fifoName, shimTile, [computeTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Create output ObjectFifos (compute → shim) + for idx, key in enumerate(self.outputTensorKeys): + fifoName = f"out_{idx}" + aie_d.object_fifo(fifoName, computeTile, [shimTile], self.fifoDepth, tileTy) + mlirBlock.fifoMap[key] = fifoName + mlirBlock.fifoTypes[key] = tileTy + + # Declare external kernel + i32 = ir.IntegerType.get_signless(32) + if self._kernelArgTypes is not None: + argTypes = self._kernelArgTypes(tileTy) + else: + # Default: binary elementwise (in1, in2, out, 
size) + argTypes = [tileTy, tileTy, tileTy, i32] + aie_d.external_func(self.kernelFuncName, argTypes) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py new file mode 100644 index 0000000000..18a4607328 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Runtime-sequence pass that configures shim DMA for L3 ↔ L1 transfers. + +Given an :class:`MLIRExecutionBlock` whose device-phase passes have already +populated ``fifoMap``, ``numElements``, and ``runtimeSequenceArgs``, this +pass emits ``aiex_d.dma_configure_task_for`` / ``dma_start_task`` / +``dma_await_task`` / ``dma_free_task`` operations directly into the current +``@aiex_d.runtime_sequence`` insertion point. + +The pass is operator-agnostic — it iterates over the FIFO map and +runtime-sequence arguments to configure DMA for every input and output +tensor. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import aiex as aiex_d +import aie.ir as ir + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRRuntimeSequencePass(MLIRCodeTransformationPass): + """Emit DMA configuration inside a ``runtime_sequence`` block. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. 
+ """ + + def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + numElements = mlirBlock.numElements + seqArgs = mlirBlock.runtimeSequenceArgs + + dims = [ + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=1, stride=0), + aie_d.bd_dim_layout(size=numElements, stride=1), + ] + + # Build ordered list of (fifoName, seqArg, isOutput) + transfers = [] + allKeys = self.inputTensorKeys + self.outputTensorKeys + for idx, key in enumerate(allKeys): + fifoName = mlirBlock.fifoMap[key] + isOutput = key in self.outputTensorKeys + transfers.append((fifoName, seqArgs[idx], isOutput)) + + inputTasks = [] + outputTasks = [] + + for fifoName, seqArg, isOutput in transfers: + if isOutput: + task = aiex_d.dma_configure_task_for(fifoName, issue_token=True) + else: + task = aiex_d.dma_configure_task_for(fifoName) + block = task.body.blocks.append() + with ir.InsertionPoint(block): + aie_d.dma_bd(seqArg, offset=0, len=numElements, dimensions=dims, burst_length=0) + aie_d.end() + aiex_d.dma_start_task(task) + + if isOutput: + outputTasks.append(task) + else: + inputTasks.append(task) + + # Await output tasks, then free input tasks + for task in outputTasks: + aiex_d.dma_await_task(task) + for task in inputTasks: + aiex_d.dma_free_task(task) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000000..aae227155a --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from 
Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 34fd1b52b5..16cda89891 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -6,6 +6,15 @@ Unlike other Deeploy deployers that generate C code via Mako templates, this deployer constructs an ``mlir.ir.Module`` with AIE dialect operations and returns the verified MLIR text. + +MLIR generation is split into two phases orchestrated by +:class:`MLIRCodeTransformation`: + +1. **Device phase** — inside ``@aie_d.device(npu2)``: for each operator, + run ``devicePasses`` (ObjectFifo creation, external-kernel + declaration) then call ``template.emit()`` (compute core only). +2. **Runtime-sequence phase** — inside ``@aiex_d.runtime_sequence``: + for each operator, run ``runtimeSequencePasses`` (DMA configuration). """ from __future__ import annotations @@ -23,15 +32,20 @@ from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Logging import DEFAULT_LOGGER as log -from Deeploy.MLIRDataTypes import MLIRNodeTemplate -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.MLIRDataTypes import MLIRCodeTransformation, MLIRExecutionBlock, MLIRNodeTemplate class XDNA2Deployer(SignPropDeployer): """Deployer for the XDNA2 (AIE2p) platform. - Generates an mlir-aie MLIR module by calling :meth:`emit` / - :meth:`emitRuntimeSequence` on each bound :class:`MLIRNodeTemplate`. + Generates an mlir-aie MLIR module via two-phase code transformation: + + * **Device phase**: ``MLIRObjectFifoPass`` creates ObjectFifos and + declares external kernels; the bound ``MLIRNodeTemplate`` emits + the compute core. 
+ * **Runtime-sequence phase**: ``MLIRRuntimeSequencePass`` configures + shim DMA for L3 ↔ L1 transfers. + The module is verified via MLIR's built-in verifier before being returned as a string. """ @@ -65,13 +79,16 @@ def __init__(self, def generateMLIR(self) -> str: """Generate an mlir-aie MLIR module for the prepared graph. - Iterates over bound layers, calls each template's ``emit()`` - to construct AIE operations, adds a ``runtime_sequence`` for - host-side DMA, verifies the module, and returns the MLIR text. - - If tiling is enabled (patternMemoryConstraint available), passes - tiling information to templates to generate tiled transfers and - compute kernels. + Iterates over bound layers in two phases: + + 1. **Device phase** — for each node, creates an + :class:`MLIRExecutionBlock`, runs device-phase code- + transformation passes (ObjectFifo creation, kernel + declaration), then calls ``template.emit()`` (compute core). + 2. **Runtime-sequence phase** — opens an + ``@aiex_d.runtime_sequence`` block, sets + ``runtimeSequenceArgs`` on each block, then runs + runtime-sequence passes (DMA configuration). 
Returns ------- @@ -80,60 +97,86 @@ def generateMLIR(self) -> str: """ assert self.prepared, "XDNA2Deployer.generateMLIR() called before prepare()" - # Collect templates and their operator representations + # Collect per-node info from the bound layers nodes = [] - for node_name, layer in self.layerBinding.items(): + for nodeName, layer in self.layerBinding.items(): mapper = layer.mapper - template = mapper.binder.template - op_repr = mapper.parser.operatorRepresentation - - # Check if tiling is enabled by looking for patternMemoryConstraint - executionBlock = mapper.binder.executionBlock + binder = mapper.binder + template = binder.template + opRepr = mapper.parser.operatorRepresentation + codeTransformer = binder.codeTransformer + + # Tiling constraint from the midend solver (may be None) + executionBlock = binder.executionBlock tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): raise RuntimeError( - f"Node '{node_name}' has no MLIRNodeTemplate — " + f"Node '{nodeName}' has no MLIRNodeTemplate — " f"only BF16 Add is supported in this release.") + if not isinstance(codeTransformer, MLIRCodeTransformation): + raise RuntimeError( + f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") - nodes.append((node_name, template, op_repr, tilingConstraint)) + nodes.append({ + 'nodeName': nodeName, + 'template': template, + 'opRepr': opRepr, + 'codeTransformer': codeTransformer, + 'tilingConstraint': tilingConstraint, + }) if not nodes: raise RuntimeError("No bound layers found — cannot generate MLIR.") # Build the MLIR module + mlirBlocks = [] + with mlir_mod_ctx() as ctx: @aie_d.device(aie_d.AIEDevice.npu2) def _device(): - compute_tile = aie_d.tile(0, 2) # JUNGVI: This will have to change when we deploy on the whole array - shim_tile = aie_d.tile(0, 0) - - # Emit each node's operations (ObjectFifos, core, kernel decls) - 
for node_name, template, op_repr, tilingConstraint in nodes: - log.info(f"[XDNA2] Emitting MLIR for node '{node_name}'" + - (" with tiling" if tilingConstraint else "")) - template.emit(op_repr, - compute_tile=compute_tile, - shim_tile=shim_tile, - tilingConstraint=tilingConstraint) # Pass tiling info - - # Runtime sequence: collect tensor types from all nodes' I/O - # For now (single-node), derive from the first node. - _, first_template, first_op_repr, first_tilingConstraint = nodes[0] - params = first_template.getAIEParams(first_op_repr, tilingConstraint=first_tilingConstraint) - num_elements = params['num_elements'] - tensor_ty = ir.MemRefType.get((num_elements,), ir.BF16Type.get()) - - @aiex_d.runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + computeTile = aie_d.tile(0, 2) # TODO: generalize to full array + shimTile = aie_d.tile(0, 0) + + # === Device phase === + for node in nodes: + # Create MLIRExecutionBlock with deployer-level state + eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) + eb.operatorRepresentation = node['opRepr'] + eb.patternMemoryConstraint = node['tilingConstraint'] + + log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + + (" (tiled)" if node['tilingConstraint'] else "")) + + # Run device-phase passes (ObjectFifo creation, kernel decl) + self.ctxt, eb = node['codeTransformer'].applyDevicePasses( + self.ctxt, eb, node['nodeName']) + + # Emit compute core (template reads FIFOs etc. 
from eb) + node['template'].emit(node['opRepr'], executionBlock=eb) + + mlirBlocks.append((node, eb)) + + # === Runtime-sequence phase === + # Derive tensor type from the first node's numElements + _, firstEb = mlirBlocks[0] + numElements = firstEb.numElements + tensorTy = ir.MemRefType.get((numElements,), ir.BF16Type.get()) + + @aiex_d.runtime_sequence(tensorTy, tensorTy, tensorTy) def _seq(*args): - for _, template, op_repr, tilingConstraint in nodes: - template.emitRuntimeSequence(op_repr, list(args), tilingConstraint=tilingConstraint) + for node, eb in mlirBlocks: + eb.runtimeSequenceArgs = list(args) + log.info(f"[XDNA2] Runtime-sequence phase for '{node['nodeName']}'") + self.ctxt, eb = node['codeTransformer'].applyRuntimeSequencePasses( + self.ctxt, eb, node['nodeName']) module = ctx.module assert module.operation.verify(), \ "[XDNA2] Generated MLIR module failed verification" - mlir_str = str(module) - log.info(f"[XDNA2] MLIR module generated ({len(mlir_str)} bytes)") - return mlir_str + mlirStr = str(module) + log.info(f"[XDNA2] MLIR module generated ({len(mlirStr)} bytes)") + return mlirStr diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 3a62d6f757..ab0b72be77 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,21 +1,21 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""XDNA2 MLIR template for BF16 elementwise Add. - -Uses ``aie.dialects`` (from the pip-installed ``mlir-aie`` package) to emit -verified MLIR operations into an existing module context provided by the -:class:`XDNA2Deployer`. +"""XDNA2 MLIR template for BF16 elementwise Add — compute kernel only. + +This template emits only the AIE core compute logic (FIFO +acquire → kernel call → FIFO release). 
ObjectFifo creation, external +kernel declaration, and DMA runtime-sequence configuration are handled +by :class:`MLIRObjectFifoPass` and :class:`MLIRRuntimeSequencePass` +respectively, which populate the :class:`MLIRExecutionBlock` before +this template's :meth:`emit` is called. """ from __future__ import annotations from typing import TYPE_CHECKING -import numpy as np - from aie.dialects import aie as aie_d -from aie.dialects import aiex as aiex_d from aie.dialects import arith as arith_d from aie.dialects import func as func_d from aie.dialects import scf as scf_d @@ -28,197 +28,74 @@ class XDNA2AddTemplate(MLIRNodeTemplate): - """MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + """Compute-only MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). - The :meth:`emit` method constructs a single-core AIE program with: + Emits an ``@aie_d.core`` block containing nested loops that acquire + input/output ObjectFifo elements and call the vectorised + ``eltwise_add_bf16_vector`` kernel. - * Two input ObjectFifos and one output ObjectFifo (depth 2 for - double-buffering). - * A compute core that loops, acquiring / releasing FIFO elements and - calling the vectorised ``eltwise_add_bf16_vector`` kernel. - * A runtime sequence that configures shim DMA for L3 ↔ L1 transfers. - - Parameters are extracted from the *operatorRepresentation* populated - by the parser (``size`` = total number of BF16 elements). + All ObjectFifo creation and DMA configuration is performed by + upstream :class:`MLIRCodeTransformationPass` instances. This + template reads FIFO names, tile size, and kernel metadata from the + :class:`MLIRExecutionBlock` passed through ``kwargs['executionBlock']``. 
""" - KERNEL_FN = "eltwise_add_bf16_vector" - KERNEL_OBJ = "add.o" - MAX_TILE_SIZE = 1024 - def __init__(self): super().__init__() # ------------------------------------------------------------------ - # Parameter helpers - # ------------------------------------------------------------------ - - def getAIEParams(self, operatorRepresentation: OperatorRepresentation, - tilingConstraint=None) -> dict: - """Extract AIE parameters from the operator representation. - - If tilingConstraint is available (tiling enabled), use information - from it. Otherwise fall back to fixed tile sizes. - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Parsed operator representation containing 'size' (total elements). - tilingConstraint : PatternMemoryConstraints, optional - Tiling solution from the solver. If provided, tile size is derived - from the tiling solution. - - Returns - ------- - dict - ``num_elements``, ``tile_size`` (from tiling solution if available, - otherwise clamped to MAX_TILE_SIZE). 
- """ - num_elements = int(operatorRepresentation['size']) - - # If tiling is enabled, extract tile size from the tiling solution - if tilingConstraint is not None: - # tilingConstraint is a PatternMemoryConstraints with nodeConstraints - nodeConstraint = tilingConstraint.nodeConstraints[0] - outputConstraints = nodeConstraint.outputTensorMemoryConstraints - if outputConstraints: - # Get the first output tensor's L1 memory constraint (tile shape) - firstOutputName = list(outputConstraints.keys())[0] - tensorConstraint = outputConstraints[firstOutputName] - # Use L1 constraint which holds the tile shape for the AIE core - if "L1" in tensorConstraint.memoryConstraints: - l1Constraint = tensorConstraint.memoryConstraints["L1"] - if l1Constraint.shape is not None: - tile_size = int(np.prod(l1Constraint.shape)) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - else: - tile_size = min(num_elements, self.MAX_TILE_SIZE) - - if num_elements % tile_size != 0: - # Round down to the largest divisor of num_elements that fits - tile_size = max(d for d in range(1, tile_size + 1) if num_elements % d == 0) - - return { - 'num_elements': num_elements, - 'tile_size': tile_size, - } - - # ------------------------------------------------------------------ - # MLIR emission + # MLIR emission — compute kernel only # ------------------------------------------------------------------ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: - """Add AIE operations for a BF16 Add node into the current device context. - - Must be called inside an ``@aie_d.device(...)`` region (the deployer - sets this up). 
The following keyword arguments are expected: - - * ``compute_tile`` — result of ``aie_d.tile(col, row)`` - * ``shim_tile`` — result of ``aie_d.tile(col, 0)`` - * ``tilingConstraint`` — optional NodeMemoryConstraint for tiled execution - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Parsed operator representation with 'size' and other attributes - **kwargs - compute_tile, shim_tile, tilingConstraint (optional) + """Emit the AIE core compute block for a BF16 Add node. + + Must be called inside an ``@aie_d.device(...)`` region **after** + the device-phase code-transformation passes have populated the + :class:`MLIRExecutionBlock`. + + Expected keyword arguments + -------------------------- + executionBlock : MLIRExecutionBlock + Carries ``computeTile``, ``fifoMap``, ``fifoTypes``, + ``tileSize``, ``numTiles``, ``kernelFuncName``, and + ``kernelObjFile`` — all set by ``MLIRObjectFifoPass``. """ - tilingConstraint = kwargs.get('tilingConstraint', None) - params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) - num_elements = params['num_elements'] - tile_size = params['tile_size'] - num_tiles = num_elements // tile_size + eb = kwargs['executionBlock'] - compute_tile = kwargs['compute_tile'] - shim_tile = kwargs['shim_tile'] + computeTile = eb.computeTile + tileSize = eb.tileSize + numTiles = eb.numTiles + kernelFn = eb.kernelFuncName + kernelObj = eb.kernelObjFile - # MemRef types - tile_ty = ir.MemRefType.get((tile_size,), ir.BF16Type.get()) + # MemRef / scalar types + tileTy = eb.fifoTypes[list(eb.fifoTypes.keys())[0]] i32 = ir.IntegerType.get_signless(32) - # ObjectFifos (depth 2 for double-buffering) - aie_d.object_fifo("in1_0", shim_tile, [compute_tile], 2, tile_ty) - aie_d.object_fifo("in2_0", shim_tile, [compute_tile], 2, tile_ty) - aie_d.object_fifo("out_0", compute_tile, [shim_tile], 2, tile_ty) + # FIFO names (populated by MLIRObjectFifoPass) + in1Fifo = eb.fifoMap['data_in_1'] + in2Fifo = 
eb.fifoMap['data_in_2'] + outFifo = eb.fifoMap['data_out'] - # External kernel declaration - aie_d.external_func(self.KERNEL_FN, [tile_ty, tile_ty, tile_ty, i32]) - - # Compute core - @aie_d.core(compute_tile, link_with=self.KERNEL_OBJ) + @aie_d.core(computeTile, link_with=kernelObj) def _core(): - subview_ty = aie_d.ObjectFifoSubviewType.get(tile_ty) + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): - for _ in scf_d.for_(0, num_tiles, 1): - acq_in1 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in1_0", 1) - elem_in1 = aie_d.objectfifo_subview_access(tile_ty, acq_in1, 0) - acq_in2 = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Consume, "in2_0", 1) - elem_in2 = aie_d.objectfifo_subview_access(tile_ty, acq_in2, 0) - acq_out = aie_d.objectfifo_acquire(subview_ty, aie_d.ObjectFifoPort.Produce, "out_0", 1) - elem_out = aie_d.objectfifo_subview_access(tile_ty, acq_out, 0) - size_val = arith_d.constant(i32, tile_size) - func_d.call([], self.KERNEL_FN, [elem_in1, elem_in2, elem_out, size_val]) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in1_0", 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, "in2_0", 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, "out_0", 1) + for _ in scf_d.for_(0, numTiles, 1): + acqIn1 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in1Fifo, 1) + elemIn1 = aie_d.objectfifo_subview_access(tileTy, acqIn1, 0) + acqIn2 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in2Fifo, 1) + elemIn2 = aie_d.objectfifo_subview_access(tileTy, acqIn2, 0) + acqOut = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, outFifo, 1) + elemOut = aie_d.objectfifo_subview_access(tileTy, acqOut, 0) + sizeVal = arith_d.constant(i32, tileSize) + func_d.call([], kernelFn, [elemIn1, elemIn2, elemOut, sizeVal]) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in1Fifo, 1) + 
aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in2Fifo, 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, outFifo, 1) scf_d.yield_([]) scf_d.yield_([]) - def emitRuntimeSequence(self, operatorRepresentation: OperatorRepresentation, - seq_args: list, tilingConstraint=None) -> None: - """Emit DMA configuration inside a runtime_sequence block. - - Parameters - ---------- - operatorRepresentation : OperatorRepresentation - Node representation (used to extract ``num_elements``). - seq_args : list - Block arguments of the runtime_sequence (memref values for - in1, in2, out — in the order matching the ONNX graph I/O). - tilingConstraint : NodeMemoryConstraint, optional - Tiling solution from the solver (currently ignored, for future use). - """ - params = self.getAIEParams(operatorRepresentation, tilingConstraint=tilingConstraint) - num_elements = params['num_elements'] - - dims = [ - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=num_elements, stride=1), - ] - - in1, in2, out = seq_args[0], seq_args[1], seq_args[2] - - task_in1 = aiex_d.dma_configure_task_for("in1_0") - block_in1 = task_in1.body.blocks.append() - with ir.InsertionPoint(block_in1): - aie_d.dma_bd(in1, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_in1) - - task_in2 = aiex_d.dma_configure_task_for("in2_0") - block_in2 = task_in2.body.blocks.append() - with ir.InsertionPoint(block_in2): - aie_d.dma_bd(in2, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_in2) - - task_out = aiex_d.dma_configure_task_for("out_0", issue_token=True) - block_out = task_out.body.blocks.append() - with ir.InsertionPoint(block_out): - aie_d.dma_bd(out, offset=0, len=num_elements, dimensions=dims, burst_length=0) - aie_d.end() - aiex_d.dma_start_task(task_out) - aiex_d.dma_await_task(task_out) - 
aiex_d.dma_free_task(task_in1) - aiex_d.dma_free_task(task_in2) - referenceTemplate = XDNA2AddTemplate() From 14f3ced77de9fabc630be440d483896df4102abe Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 18 Mar 2026 14:47:33 +0100 Subject: [PATCH 07/16] Template is agnostic of tiling and data movement that are handled by code transformation passes --- Deeploy/MLIRDataTypes.py | 4 + Deeploy/Targets/XDNA2/Bindings.py | 5 + .../MLIRComputeCorePass.py | 116 ++++++++++++++++++ .../CodeTransformationPasses/__init__.py | 1 + Deeploy/Targets/XDNA2/Deployer.py | 9 +- .../Targets/XDNA2/Templates/AddTemplate.py | 97 +++++---------- 6 files changed, 159 insertions(+), 73 deletions(-) create mode 100644 Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 642fb6fef1..1f8a2be446 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -76,6 +76,10 @@ def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: self.kernelFuncName: Optional[str] = None self.kernelObjFile: Optional[str] = None + # The MLIRNodeTemplate for this node (set by deployer, called by + # MLIRComputeCorePass to emit the kernel call inside the core block) + self.template: Optional[Any] = None + # Set by deployer before runtime-sequence phase self.runtimeSequenceArgs: List[Any] = [] diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index e30bbc6646..14b8b0317a 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -6,6 +6,7 @@ from Deeploy.CommonExtensions.DataTypes import bfloat16_t from Deeploy.DeeployTypes import NodeBinding from Deeploy.MLIRDataTypes import MLIRCodeTransformation +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import MLIRComputeCorePass from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import MLIRObjectFifoPass from 
Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import MLIRRuntimeSequencePass from Deeploy.Targets.XDNA2.Templates import AddTemplate @@ -22,6 +23,10 @@ kernelFuncName = "eltwise_add_bf16_vector", kernelObjFile = "add.o", ), + MLIRComputeCorePass( + inputTensorKeys = _ADD_INPUT_KEYS, + outputTensorKeys = _ADD_OUTPUT_KEYS, + ), ], runtimeSequencePasses = [ MLIRRuntimeSequencePass( diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py new file mode 100644 index 0000000000..7d06fab241 --- /dev/null +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Device-phase pass that emits the AIE core block with tiling loops. + +This pass constructs the structural MLIR around the compute kernel: + +1. Opens an ``@aie_d.core`` block linked to the kernel object file. +2. Opens an infinite outer ``scf.for`` loop (streaming). +3. Opens an inner ``scf.for`` tiling loop (``numTiles`` iterations). +4. Acquires input/output ObjectFifo elements. +5. Builds a modified ``operatorRepresentation`` where tensor keys + (e.g. ``data_in_1``) are replaced with the acquired MLIR memref + values and ``size`` is replaced with the tile size — mirroring + how ``TilingVariableReplacement`` rewrites buffer names for C + backends. +6. Calls ``template.emit(modifiedOpRepr)`` — the template only emits + its ``func_d.call`` using values from ``operatorRepresentation``. +7. Releases all FIFO elements and closes loops. + +The pass is operator-agnostic: it only needs the tensor key lists and +reads everything else from the :class:`MLIRExecutionBlock` populated by +prior passes (e.g. :class:`MLIRObjectFifoPass`). 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple + +from aie.dialects import aie as aie_d +from aie.dialects import scf as scf_d + +from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock + +if TYPE_CHECKING: + from Deeploy.DeeployTypes import NetworkContext + + +class MLIRComputeCorePass(MLIRCodeTransformationPass): + """Emit ``@aie_d.core`` with tiling loops and FIFO acquire/release. + + The template stored on ``mlirBlock.template`` is called inside the + inner loop with a *modified* ``operatorRepresentation`` whose tensor + entries point to acquired MLIR memref values instead of buffer name + strings. + + Parameters + ---------- + inputTensorKeys : list of str + Keys in ``operatorRepresentation`` that name input tensors. + outputTensorKeys : list of str + Keys that name output tensors. + """ + + def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> None: + self.inputTensorKeys = inputTensorKeys + self.outputTensorKeys = outputTensorKeys + + def apply(self, + ctxt: NetworkContext, + mlirBlock: MLIRExecutionBlock, + name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: + computeTile = mlirBlock.computeTile + kernelObj = mlirBlock.kernelObjFile + tileSize = mlirBlock.tileSize + numTiles = mlirBlock.numTiles + opRepr = mlirBlock.operatorRepresentation + template = mlirBlock.template + + # Use the first tensor's type as representative tile memref type + firstKey = self.inputTensorKeys[0] + tileTy = mlirBlock.fifoTypes[firstKey] + + @aie_d.core(computeTile, link_with=kernelObj) + def _core(): + subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) + for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): + for _ in scf_d.for_(0, numTiles, 1): + # Acquire all input FIFO elements + acquiredElements = {} + for key in self.inputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire( + subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = 
aie_d.objectfifo_subview_access( + tileTy, acq, 0) + + # Acquire all output FIFO elements + for key in self.outputTensorKeys: + fifoName = mlirBlock.fifoMap[key] + acq = aie_d.objectfifo_acquire( + subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access( + tileTy, acq, 0) + + # Build modified opRepr: replace tensor names with MLIR + # values, replace size with tile size. This mirrors the + # C backend's TilingVariableReplacement pass. + modifiedOpRepr = {**opRepr, 'size': tileSize, **acquiredElements} + + # Call the template — it only emits func_d.call() + template.emit(modifiedOpRepr) + + # Release all inputs + for key in self.inputTensorKeys: + aie_d.objectfifo_release( + aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + # Release all outputs + for key in self.outputTensorKeys: + aie_d.objectfifo_release( + aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + + scf_d.yield_([]) + scf_d.yield_([]) + + return ctxt, mlirBlock diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py index aae227155a..fe25ee3fdf 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -3,4 +3,5 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index 16cda89891..f4a0e0a365 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -146,17 +146,18 @@ def _device(): eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) eb.operatorRepresentation = node['opRepr'] eb.patternMemoryConstraint = 
node['tilingConstraint'] + eb.template = node['template'] log.info(f"[XDNA2] Device phase for '{node['nodeName']}'" + (" (tiled)" if node['tilingConstraint'] else "")) - # Run device-phase passes (ObjectFifo creation, kernel decl) + # Run device-phase passes: + # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel + # 2. MLIRComputeCorePass — opens core + loops, calls + # template.emit() with acquired FIFO elements in opRepr self.ctxt, eb = node['codeTransformer'].applyDevicePasses( self.ctxt, eb, node['nodeName']) - # Emit compute core (template reads FIFOs etc. from eb) - node['template'].emit(node['opRepr'], executionBlock=eb) - mlirBlocks.append((node, eb)) # === Runtime-sequence phase === diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index ab0b72be77..7a13b0625f 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -1,24 +1,25 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 -"""XDNA2 MLIR template for BF16 elementwise Add — compute kernel only. - -This template emits only the AIE core compute logic (FIFO -acquire → kernel call → FIFO release). ObjectFifo creation, external -kernel declaration, and DMA runtime-sequence configuration are handled -by :class:`MLIRObjectFifoPass` and :class:`MLIRRuntimeSequencePass` -respectively, which populate the :class:`MLIRExecutionBlock` before -this template's :meth:`emit` is called. +"""XDNA2 MLIR template for BF16 elementwise Add — pure compute primitive. + +This template emits **only** a ``func_d.call`` to the vectorised +``eltwise_add_bf16_vector`` kernel. It receives its operands (acquired +ObjectFifo element memrefs) and tile size through +``operatorRepresentation``, exactly like a C Mako template receives +buffer-name strings. 
+ +All structural MLIR (``@aie_d.core``, loops, FIFO acquire/release, +ObjectFifo creation, DMA configuration) is handled by +:class:`MLIRCodeTransformationPass` instances upstream. """ from __future__ import annotations from typing import TYPE_CHECKING -from aie.dialects import aie as aie_d from aie.dialects import arith as arith_d from aie.dialects import func as func_d -from aie.dialects import scf as scf_d import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRNodeTemplate @@ -28,74 +29,32 @@ class XDNA2AddTemplate(MLIRNodeTemplate): - """Compute-only MLIR template for BF16 elementwise Add on XDNA2 (AIE2p). + """Pure compute-primitive for BF16 elementwise Add on XDNA2. - Emits an ``@aie_d.core`` block containing nested loops that acquire - input/output ObjectFifo elements and call the vectorised - ``eltwise_add_bf16_vector`` kernel. + ``emit()`` is called by :class:`MLIRComputeCorePass` inside an + already-open ``@aie_d.core`` + tiling-loop context, with + ``operatorRepresentation`` entries replaced by live MLIR values: - All ObjectFifo creation and DMA configuration is performed by - upstream :class:`MLIRCodeTransformationPass` instances. This - template reads FIFO names, tile size, and kernel metadata from the - :class:`MLIRExecutionBlock` passed through ``kwargs['executionBlock']``. + * ``data_in_1``, ``data_in_2``, ``data_out`` — acquired memref + elements (from ObjectFifo acquire). + * ``size`` — tile size (Python int). """ + KERNEL_FN = "eltwise_add_bf16_vector" + def __init__(self): super().__init__() - # ------------------------------------------------------------------ - # MLIR emission — compute kernel only - # ------------------------------------------------------------------ - def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None: - """Emit the AIE core compute block for a BF16 Add node. 
- - Must be called inside an ``@aie_d.device(...)`` region **after** - the device-phase code-transformation passes have populated the - :class:`MLIRExecutionBlock`. - - Expected keyword arguments - -------------------------- - executionBlock : MLIRExecutionBlock - Carries ``computeTile``, ``fifoMap``, ``fifoTypes``, - ``tileSize``, ``numTiles``, ``kernelFuncName``, and - ``kernelObjFile`` — all set by ``MLIRObjectFifoPass``. - """ - eb = kwargs['executionBlock'] - - computeTile = eb.computeTile - tileSize = eb.tileSize - numTiles = eb.numTiles - kernelFn = eb.kernelFuncName - kernelObj = eb.kernelObjFile - - # MemRef / scalar types - tileTy = eb.fifoTypes[list(eb.fifoTypes.keys())[0]] + """Emit a single ``func.call`` to the vectorised Add kernel.""" i32 = ir.IntegerType.get_signless(32) - - # FIFO names (populated by MLIRObjectFifoPass) - in1Fifo = eb.fifoMap['data_in_1'] - in2Fifo = eb.fifoMap['data_in_2'] - outFifo = eb.fifoMap['data_out'] - - @aie_d.core(computeTile, link_with=kernelObj) - def _core(): - subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) - for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): - for _ in scf_d.for_(0, numTiles, 1): - acqIn1 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in1Fifo, 1) - elemIn1 = aie_d.objectfifo_subview_access(tileTy, acqIn1, 0) - acqIn2 = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, in2Fifo, 1) - elemIn2 = aie_d.objectfifo_subview_access(tileTy, acqIn2, 0) - acqOut = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, outFifo, 1) - elemOut = aie_d.objectfifo_subview_access(tileTy, acqOut, 0) - sizeVal = arith_d.constant(i32, tileSize) - func_d.call([], kernelFn, [elemIn1, elemIn2, elemOut, sizeVal]) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in1Fifo, 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, in2Fifo, 1) - aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, outFifo, 1) - scf_d.yield_([]) - scf_d.yield_([]) + sizeVal = 
arith_d.constant(i32, int(operatorRepresentation['size'])) + func_d.call([], self.KERNEL_FN, [ + operatorRepresentation['data_in_1'], + operatorRepresentation['data_in_2'], + operatorRepresentation['data_out'], + sizeVal, + ]) referenceTemplate = XDNA2AddTemplate() From b850b23d20597d8574aadbd5bacf54da02e665a9 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 09:42:55 +0100 Subject: [PATCH 08/16] Add CI on self hosted runner --- .github/workflows/_runner-xdna2.yml | 47 +++++++++++++++++++++++++ .github/workflows/ci-platform-xdna2.yml | 31 ++++++++++++++++ README_XDNA.md | 29 ++++++++++++--- 3 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/_runner-xdna2.yml create mode 100644 .github/workflows/ci-platform-xdna2.yml diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml new file mode 100644 index 0000000000..f48f99c932 --- /dev/null +++ b/.github/workflows/_runner-xdna2.yml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-xdna2 + +"on": + workflow_call: + inputs: + pytest-marker: + required: true + type: string + docker-image: + required: false + type: string + default: "deeploy-xdna:local" + +jobs: + test-runner-xdna2: + runs-on: xdna2-npu + # NOTE: We cannot use the `container:` directive here because + # GitHub Actions does not support `--device` flags required for + # NPU access (/dev/accel/accel0). Instead we use explicit + # `docker run` commands. + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Run Tests in Docker + shell: bash + run: | + docker run --rm \ + --device /dev/accel/accel0 \ + --ulimit memlock=-1 \ + -v /opt/xilinx:/opt/xilinx \ + -v "${{ github.workspace }}":/app/Deeploy \ + -w /app/Deeploy \ + ${{ inputs.docker-image }} \ + bash -c " + pip install -e . 
&& + pip install -r requirements-dev.txt && + cd DeeployTest && + pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' + " diff --git a/.github/workflows/ci-platform-xdna2.yml b/.github/workflows/ci-platform-xdna2.yml new file mode 100644 index 0000000000..ccf455edf7 --- /dev/null +++ b/.github/workflows/ci-platform-xdna2.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • XDNA2 + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image: + description: "XDNA2 Docker image (must be pre-built on the runner)" + required: false + default: "deeploy-xdna:local" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + xdna2-kernels: + uses: ./.github/workflows/_runner-xdna2.yml + with: + pytest-marker: "kernels" + docker-image: ${{ inputs.docker_image || 'deeploy-xdna:local' }} diff --git a/README_XDNA.md b/README_XDNA.md index a96a3550c8..56cfcb1225 100644 --- a/README_XDNA.md +++ b/README_XDNA.md @@ -12,8 +12,6 @@ You need to have XRT installed on your host, once installed it is present in `/o docker run -it \ --device /dev/accel/accel0 \ --ulimit memlock=-1 \ - -v /scratch/jungvi/IRON:/opt/IRON \ - -e IRON_OPERATORS_DIR=/opt/IRON/iron/operators \ -v "$(pwd)":/app/Deeploy \ -v /opt/xilinx:/opt/xilinx \ --name deeploy_dev \ @@ -22,9 +20,32 @@ docker run -it \ Currently I use the IRON repo to generate my MLIR code, hence I have `-v /scratch/jungvi/IRON:/opt/IRON`, and `-e IRON_OPERATORS_DIR=/opt/IRON/iron/operators`. This will be as soon as the midend and backend of Deeploy are updated to support true MLIR generation. 
-Once the container is started you can a simple Add node, from ONNX to execution with: +Once the container is started you can run a simple Add node, from ONNX to execution with: ``` pip install -e ./ && \ cd DeeployTest && \ python deeployRunner_xdna2.py -t ./Tests/Kernels/BF16/Add/Regular/ -``` \ No newline at end of file +``` + +## CI with a Self-Hosted Runner + +XDNA2 tests run on a self-hosted GitHub Actions runner with NPU access. +The Docker image is built locally on the runner (not distributed via GHCR). + +### One-time setup on the runner machine + +1. Build the Docker image: + ``` + docker build -f Container/Dockerfile.deeploy-xdna -t deeploy-xdna:local . + ``` + +2. Register the GitHub Actions runner (Settings → Actions → Runners → New self-hosted runner). + Use the label **`xdna2-npu`** and install as a service: + ``` + ./svc.sh install && ./svc.sh start + ``` + +3. Make sure the runner user has access to `/dev/accel/accel0` (e.g. is in the `render` group). + +Once the runner is registered, pushes and PRs automatically trigger the +`CI • XDNA2` workflow defined in `.github/workflows/ci-platform-xdna2.yml`. \ No newline at end of file From d79e36fba00359f13885144eaad990ca0f96fea8 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 09:52:53 +0100 Subject: [PATCH 09/16] Remove unecessary install --- .github/workflows/_runner-xdna2.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml index f48f99c932..0463c54137 100644 --- a/.github/workflows/_runner-xdna2.yml +++ b/.github/workflows/_runner-xdna2.yml @@ -41,7 +41,6 @@ jobs: ${{ inputs.docker-image }} \ bash -c " pip install -e . 
&& - pip install -r requirements-dev.txt && cd DeeployTest && pytest test_platforms.py -v -m 'xdna2 and ${{ inputs.pytest-marker }}' " From 7c995bca034b29d012825a5fa803db159caeea82 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 19 Mar 2026 10:00:47 +0100 Subject: [PATCH 10/16] Add cleanup step before checkout to fix permission --- .github/workflows/_runner-xdna2.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/_runner-xdna2.yml b/.github/workflows/_runner-xdna2.yml index 0463c54137..d9ba66d8a7 100644 --- a/.github/workflows/_runner-xdna2.yml +++ b/.github/workflows/_runner-xdna2.yml @@ -24,6 +24,14 @@ jobs: # NPU access (/dev/accel/accel0). Instead we use explicit # `docker run` commands. steps: + - name: Fix workspace permissions + shell: bash + run: | + docker run --rm \ + -v "${{ github.workspace }}":/workspace \ + ${{ inputs.docker-image }} \ + chown -R $(id -u):$(id -g) /workspace || true + - name: Checkout Repo uses: actions/checkout@v4 with: From fc2b364c03708c49bc110a38e5918c9c874c9153 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:00:53 +0100 Subject: [PATCH 11/16] aie import is optional to not enforce mlir-aie and llvm-aie package installation --- DeeployTest/testUtils/platformMapping.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index eaa9b2503f..58e2e1c396 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -29,9 +29,6 @@ from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform -from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer -from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, XDNA2Optimizer, \ - 
XDNA2Platform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] _NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "XDNA2"] @@ -80,6 +77,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: Platform = ChimeraPlatform() elif platformName == "XDNA2": + from Deeploy.Targets.XDNA2.Platform import XDNA2Platform Platform = XDNA2Platform() else: @@ -279,7 +277,18 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + else: + # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms + try: + from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer + from Deeploy.Targets.XDNA2.Platform import (MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, + XDNA2Optimizer, XDNA2Platform) + except ImportError: + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + + if not isinstance(platform, (XDNA2Platform, MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper)): + raise RuntimeError(f"Deployer for platform {platform} is not implemented") + if loweringOptimizer is None: loweringOptimizer = XDNA2Optimizer @@ -295,7 +304,4 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - else: - raise RuntimeError(f"Deployer for platform {platform} is not implemented") - return deployer From 1865530ba68770b70e510fca25d2b81908dcd298 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:01:22 +0100 Subject: [PATCH 12/16] Decouple xdna requirements from dev requirements --- Container/Dockerfile.deeploy-xdna | 11 +++-------- requirements-dev.txt | 7 ------- requirements-xdna.txt | 10 ++++++++++ 3 files changed, 13 insertions(+), 15 deletions(-) create mode 100644 requirements-xdna.txt diff --git 
a/Container/Dockerfile.deeploy-xdna b/Container/Dockerfile.deeploy-xdna index fd62657740..16907402df 100644 --- a/Container/Dockerfile.deeploy-xdna +++ b/Container/Dockerfile.deeploy-xdna @@ -40,17 +40,12 @@ ENV LD_LIBRARY_PATH=${XILINX_XRT}/lib WORKDIR /app -COPY pyproject.toml ./ +COPY pyproject.toml requirements-xdna.txt ./ RUN pip install toml-to-requirements && \ toml-to-req --toml-file pyproject.toml && \ pip install -r requirements.txt && \ - rm -f requirements.txt pyproject.toml - -RUN pip install \ - --extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 \ - --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ - "mlir_aie==v1.2.1" \ - llvm-aie + pip install -r requirements-xdna.txt && \ + rm -f requirements.txt pyproject.toml requirements-xdna.txt ENV MLIR_AIE_PYTHON=/usr/bin/python3 diff --git a/requirements-dev.txt b/requirements-dev.txt index 5cbdc0ef64..6d047b4957 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,13 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 ---extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 ---extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly ---extra-index-url https://pypi.org/simple - -mlir_aie==v1.2.1 -llvm-aie - # Quality of life netron debugpy diff --git a/requirements-xdna.txt b/requirements-xdna.txt new file mode 100644 index 0000000000..21204f5987 --- /dev/null +++ b/requirements-xdna.txt @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--extra-index-url https://github.com/Xilinx/mlir-aie/releases/expanded_assets/v1.2.1 +--extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +--extra-index-url https://pypi.org/simple + +mlir_aie==v1.2.1 +llvm-aie From de6f9616d7034f6bf472845e3a23af3a23469a06 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 
17:02:14 +0100 Subject: [PATCH 13/16] Format --- DeeployTest/testUtils/platformMapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 58e2e1c396..9155ed77ae 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -281,8 +281,8 @@ def mapDeployer(platform: DeploymentPlatform, # Lazy-import XDNA2 to avoid requiring mlir-aie on non-XDNA2 platforms try: from Deeploy.Targets.XDNA2.Deployer import XDNA2Deployer - from Deeploy.Targets.XDNA2.Platform import (MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, - XDNA2Optimizer, XDNA2Platform) + from Deeploy.Targets.XDNA2.Platform import MemoryXDNA2Platform, MemoryXDNA2PlatformWrapper, \ + XDNA2Optimizer, XDNA2Platform except ImportError: raise RuntimeError(f"Deployer for platform {platform} is not implemented") From 01d458bfcf7bde410aaf3481e53fcbba20ae1c03 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Tue, 24 Mar 2026 17:02:32 +0100 Subject: [PATCH 14/16] Format --- Deeploy/MLIRDataTypes.py | 20 +- .../MLIRComputeCorePass.py | 24 +- .../MLIRObjectFifoPass.py | 9 +- .../MLIRRuntimeSequencePass.py | 18 +- .../CodeTransformationPasses/__init__.py | 2 +- Deeploy/Targets/XDNA2/Deployer.py | 20 +- Deeploy/Targets/XDNA2/Platform.py | 27 +- .../Targets/XDNA2/Templates/AddTemplate.py | 2 +- Deeploy/Targets/XDNA2/Tiler.py | 7 +- DeeployTest/Platforms/XDNA2/main.cpp | 318 +++++++++--------- DeeployTest/deeployRunner_xdna2.py | 3 +- DeeployTest/generateNetwork_xdna2.py | 28 +- TargetLibraries/XDNA2/kernels/add.cc | 66 ++-- 13 files changed, 261 insertions(+), 283 deletions(-) diff --git a/Deeploy/MLIRDataTypes.py b/Deeploy/MLIRDataTypes.py index 1f8a2be446..8305c26a04 100644 --- a/Deeploy/MLIRDataTypes.py +++ b/Deeploy/MLIRDataTypes.py @@ -24,18 +24,18 @@ from __future__ import annotations from abc import abstractmethod -from typing import Any, Dict, List, Optional, 
TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from Deeploy.DeeployTypes import NodeTemplate if TYPE_CHECKING: from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation - # ====================================================================== # MLIRExecutionBlock # ====================================================================== + class MLIRExecutionBlock: """MLIR-specific execution state for a single operator. @@ -92,6 +92,7 @@ def __init__(self, computeTile: Any = None, shimTile: Any = None) -> None: # MLIRCodeTransformationPass / MLIRCodeTransformation # ====================================================================== + class MLIRCodeTransformationPass: """Base class for passes that transform an :class:`MLIRExecutionBlock`. @@ -99,9 +100,7 @@ class MLIRCodeTransformationPass: and optionally emit MLIR operations into the current insertion point. """ - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: return ctxt, mlirBlock @@ -125,17 +124,13 @@ def __init__(self, self.devicePasses: List[MLIRCodeTransformationPass] = devicePasses or [] self.runtimeSequencePasses: List[MLIRCodeTransformationPass] = runtimeSequencePasses or [] - def applyDevicePasses(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def applyDevicePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: for _pass in self.devicePasses: ctxt, mlirBlock = _pass.apply(ctxt, mlirBlock, name) return ctxt, mlirBlock - def applyRuntimeSequencePasses(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def applyRuntimeSequencePasses(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: for _pass in self.runtimeSequencePasses: ctxt, 
mlirBlock = _pass.apply(ctxt, mlirBlock, name) @@ -146,6 +141,7 @@ def applyRuntimeSequencePasses(self, # MLIRNodeTemplate # ====================================================================== + class MLIRNodeTemplate(NodeTemplate): """NodeTemplate subclass that emits MLIR instead of C code. @@ -188,7 +184,7 @@ def emit(self, operatorRepresentation: OperatorRepresentation, **kwargs) -> None # NodeTemplate overrides # ------------------------------------------------------------------ - def generate(self, operatorRepresentation={}, **kwargs) -> str: + def generate(self, operatorRepresentation = {}, **kwargs) -> str: """Generate an MLIR string for this node. This default implementation is a thin wrapper: it delegates to diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py index 7d06fab241..2f58acc852 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRComputeCorePass.py @@ -56,9 +56,7 @@ def __init__(self, inputTensorKeys: List[str], outputTensorKeys: List[str]) -> N self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: computeTile = mlirBlock.computeTile kernelObj = mlirBlock.kernelObjFile @@ -71,7 +69,7 @@ def apply(self, firstKey = self.inputTensorKeys[0] tileTy = mlirBlock.fifoTypes[firstKey] - @aie_d.core(computeTile, link_with=kernelObj) + @aie_d.core(computeTile, link_with = kernelObj) def _core(): subviewTy = aie_d.ObjectFifoSubviewType.get(tileTy) for _ in scf_d.for_(0, 0x7FFFFFFFFFFFFFFF, 1): @@ -80,18 +78,14 @@ def _core(): acquiredElements = {} for key in self.inputTensorKeys: fifoName = mlirBlock.fifoMap[key] - acq = 
aie_d.objectfifo_acquire( - subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) - acquiredElements[key] = aie_d.objectfifo_subview_access( - tileTy, acq, 0) + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Consume, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) # Acquire all output FIFO elements for key in self.outputTensorKeys: fifoName = mlirBlock.fifoMap[key] - acq = aie_d.objectfifo_acquire( - subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) - acquiredElements[key] = aie_d.objectfifo_subview_access( - tileTy, acq, 0) + acq = aie_d.objectfifo_acquire(subviewTy, aie_d.ObjectFifoPort.Produce, fifoName, 1) + acquiredElements[key] = aie_d.objectfifo_subview_access(tileTy, acq, 0) # Build modified opRepr: replace tensor names with MLIR # values, replace size with tile size. This mirrors the @@ -103,12 +97,10 @@ def _core(): # Release all inputs for key in self.inputTensorKeys: - aie_d.objectfifo_release( - aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Consume, mlirBlock.fifoMap[key], 1) # Release all outputs for key in self.outputTensorKeys: - aie_d.objectfifo_release( - aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) + aie_d.objectfifo_release(aie_d.ObjectFifoPort.Produce, mlirBlock.fifoMap[key], 1) scf_d.yield_([]) scf_d.yield_([]) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py index be6b492906..d49b0e4c03 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRObjectFifoPass.py @@ -23,10 +23,9 @@ from typing import TYPE_CHECKING, Tuple +import aie.ir as ir import numpy as np - from aie.dialects import aie as aie_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock @@ -89,7 +88,7 @@ def __init__(self, 
outputTensorKeys: list, kernelFuncName: str, kernelObjFile: str, - kernelArgTypes=None, + kernelArgTypes = None, fifoDepth: int = 2) -> None: self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys @@ -98,9 +97,7 @@ def __init__(self, self._kernelArgTypes = kernelArgTypes self.fifoDepth = fifoDepth - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: opRepr = mlirBlock.operatorRepresentation numElements = int(opRepr['size']) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py index 18a4607328..6331bd0914 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/MLIRRuntimeSequencePass.py @@ -18,9 +18,9 @@ from typing import TYPE_CHECKING, Tuple +import aie.ir as ir from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRCodeTransformationPass, MLIRExecutionBlock @@ -43,18 +43,16 @@ def __init__(self, inputTensorKeys: list, outputTensorKeys: list) -> None: self.inputTensorKeys = inputTensorKeys self.outputTensorKeys = outputTensorKeys - def apply(self, - ctxt: NetworkContext, - mlirBlock: MLIRExecutionBlock, + def apply(self, ctxt: NetworkContext, mlirBlock: MLIRExecutionBlock, name: str) -> Tuple[NetworkContext, MLIRExecutionBlock]: numElements = mlirBlock.numElements seqArgs = mlirBlock.runtimeSequenceArgs dims = [ - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=1, stride=0), - aie_d.bd_dim_layout(size=numElements, stride=1), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + aie_d.bd_dim_layout(size = 1, stride = 0), + 
aie_d.bd_dim_layout(size = numElements, stride = 1), ] # Build ordered list of (fifoName, seqArg, isOutput) @@ -70,12 +68,12 @@ def apply(self, for fifoName, seqArg, isOutput in transfers: if isOutput: - task = aiex_d.dma_configure_task_for(fifoName, issue_token=True) + task = aiex_d.dma_configure_task_for(fifoName, issue_token = True) else: task = aiex_d.dma_configure_task_for(fifoName) block = task.body.blocks.append() with ir.InsertionPoint(block): - aie_d.dma_bd(seqArg, offset=0, len=numElements, dimensions=dims, burst_length=0) + aie_d.dma_bd(seqArg, offset = 0, len = numElements, dimensions = dims, burst_length = 0) aie_d.end() aiex_d.dma_start_task(task) diff --git a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py index fe25ee3fdf..f7843db7b3 100644 --- a/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/XDNA2/CodeTransformationPasses/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRComputeCorePass import * +from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRObjectFifoPass import * from Deeploy.Targets.XDNA2.CodeTransformationPasses.MLIRRuntimeSequencePass import * diff --git a/Deeploy/Targets/XDNA2/Deployer.py b/Deeploy/Targets/XDNA2/Deployer.py index f4a0e0a365..0ea3e2491d 100644 --- a/Deeploy/Targets/XDNA2/Deployer.py +++ b/Deeploy/Targets/XDNA2/Deployer.py @@ -21,12 +21,11 @@ from typing import Callable, Dict, Optional, Type +import aie.ir as ir import onnx_graphsurgeon as gs - -from aie.extras.context import mlir_mod_ctx from aie.dialects import aie as aie_d from aie.dialects import aiex as aiex_d -import aie.ir as ir +from aie.extras.context import mlir_mod_ctx from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer 
@@ -111,13 +110,11 @@ def generateMLIR(self) -> str: tilingConstraint = getattr(executionBlock, 'patternMemoryConstraint', None) if not isinstance(template, MLIRNodeTemplate): - raise RuntimeError( - f"Node '{nodeName}' has no MLIRNodeTemplate — " - f"only BF16 Add is supported in this release.") + raise RuntimeError(f"Node '{nodeName}' has no MLIRNodeTemplate — " + f"only BF16 Add is supported in this release.") if not isinstance(codeTransformer, MLIRCodeTransformation): - raise RuntimeError( - f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " - f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") + raise RuntimeError(f"Node '{nodeName}' uses a non-MLIR CodeTransformation — " + f"expected MLIRCodeTransformation, got {type(codeTransformer).__name__}.") nodes.append({ 'nodeName': nodeName, @@ -143,7 +140,7 @@ def _device(): # === Device phase === for node in nodes: # Create MLIRExecutionBlock with deployer-level state - eb = MLIRExecutionBlock(computeTile=computeTile, shimTile=shimTile) + eb = MLIRExecutionBlock(computeTile = computeTile, shimTile = shimTile) eb.operatorRepresentation = node['opRepr'] eb.patternMemoryConstraint = node['tilingConstraint'] eb.template = node['template'] @@ -155,8 +152,7 @@ def _device(): # 1. MLIRObjectFifoPass — creates FIFOs, declares kernel # 2. 
MLIRComputeCorePass — opens core + loops, calls # template.emit() with acquired FIFO elements in opRepr - self.ctxt, eb = node['codeTransformer'].applyDevicePasses( - self.ctxt, eb, node['nodeName']) + self.ctxt, eb = node['codeTransformer'].applyDevicePasses(self.ctxt, eb, node['nodeName']) mlirBlocks.append((node, eb)) diff --git a/Deeploy/Targets/XDNA2/Platform.py b/Deeploy/Targets/XDNA2/Platform.py index 4a186aca7c..b54ce8acb9 100644 --- a/Deeploy/Targets/XDNA2/Platform.py +++ b/Deeploy/Targets/XDNA2/Platform.py @@ -9,15 +9,15 @@ from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Layers import AddLayer -from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.Parsers import AddParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.XDNA2.Bindings import XDNA2AddBindings from Deeploy.Targets.XDNA2.Tiler import XDNA2AddTilingReadyBindings # Standard mapper for non-tiled deployment XDNA2AddMapper = NodeMapper(AddParser(), XDNA2AddBindings) -# Tiling-ready mapper for tiled deployment +# Tiling-ready mapper for tiled deployment XDNA2AddTilableMapper = NodeMapper(AddParser(), XDNA2AddTilingReadyBindings) # Standard mapping (used when tiling is disabled) @@ -64,8 +64,7 @@ class XDNA2StructBuffer(StructBuffer): class XDNA2Engine(DeploymentEngine): - def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", - includeList = None) -> None: + def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = "", includeList = None) -> None: if includeList is None: includeList = [] super().__init__(name, Mapping, initCode, includeList) @@ -73,13 +72,17 @@ def __init__(self, name: str = "XDNA2", Mapping = XDNA2Mapping, initCode: str = class 
XDNA2AIECoreEngine(DeploymentEngine): """AIE core execution engine with L1 local memory as preferred memory level. - + The AIE core has 8KB of local memory (L1) for temporary buffers and computation. Data is transferred from L3 (shared memory) to L1 as needed. """ - def __init__(self, name: str = "XDNA2_AIE_Core", Mapping = XDNA2Mapping, initCode: str = "", - includeList = None, preferredMemoryLevel: str = "L1") -> None: + def __init__(self, + name: str = "XDNA2_AIE_Core", + Mapping = XDNA2Mapping, + initCode: str = "", + includeList = None, + preferredMemoryLevel: str = "L1") -> None: if includeList is None: includeList = [] super().__init__(name, Mapping, initCode, includeList) @@ -101,7 +104,7 @@ def __init__(self, class MemoryXDNA2Platform(MemoryPlatform): """XDNA2 platform with memory hierarchy support for tiling. - + Defines the memory hierarchy: - L1: 8KB per AIE core (local memory) - L3: Shared memory for entire AIE array @@ -122,7 +125,7 @@ def __init__(self, def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: """Get the target memory level for a tensor in a given node. - + For XDNA2, if the node is marked to run on AIE core engine, return L1 (preferred level). Otherwise use the default target memory level (typically L3). 
""" @@ -131,14 +134,14 @@ def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkCont engine = node._engine_assignment if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): return engine.preferredMemoryLevel - + return self.defaultTargetMemoryLevel.name class MemoryXDNA2PlatformWrapper(MemoryPlatformWrapper): """Wrapper for XDNA2Platform with memory-level support.""" - def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, + def __init__(self, platform: XDNA2Platform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel): assert isinstance(platform, XDNA2Platform), \ f"Given platform is not an instance of XDNA2Platform. Platform type: {type(platform).__name__}" @@ -150,5 +153,5 @@ def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkCont engine = node._engine_assignment if isinstance(engine, XDNA2AIECoreEngine) and hasattr(engine, 'preferredMemoryLevel'): return engine.preferredMemoryLevel - + return self.defaultTargetMemoryLevel.name diff --git a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py index 7a13b0625f..6c526a9e38 100644 --- a/Deeploy/Targets/XDNA2/Templates/AddTemplate.py +++ b/Deeploy/Targets/XDNA2/Templates/AddTemplate.py @@ -18,9 +18,9 @@ from typing import TYPE_CHECKING +import aie.ir as ir from aie.dialects import arith as arith_d from aie.dialects import func as func_d -import aie.ir as ir from Deeploy.MLIRDataTypes import MLIRNodeTemplate diff --git a/Deeploy/Targets/XDNA2/Tiler.py b/Deeploy/Targets/XDNA2/Tiler.py index 9754aa0688..b2282c34b0 100644 --- a/Deeploy/Targets/XDNA2/Tiler.py +++ b/Deeploy/Targets/XDNA2/Tiler.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """XDNA2 tiling constraints and tiling-ready node bindings for MLIR code generation.""" from 
Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint @@ -10,7 +9,5 @@ # For Add operator, reuse the generic BOP (Binary Operator) tile constraint # which handles equal-dimension binary operations -XDNA2AddTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings=XDNA2AddBindings, - tileConstraint=AddTileConstraint() -) +XDNA2AddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = XDNA2AddBindings, + tileConstraint = AddTileConstraint()) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 046384e4db..20d748265d 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -22,181 +22,185 @@ #include "xrt/xrt_kernel.h" // Generated by Deeploy's generateNetwork_xdna2.py: -// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} defines -// testoutputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_OUTPUT{i} defines +// testinputs.h – uint16_t arrays of BF16 bit patterns + N_ELEMENTS_INPUT{i} +// defines testoutputs.h – uint16_t arrays of BF16 bit patterns + +// N_ELEMENTS_OUTPUT{i} defines #include "testinputs.h" #include "testoutputs.h" // --------------------------------------------------------------------------- // BF16 helpers // --------------------------------------------------------------------------- -static float bf16_to_float(uint16_t bf16) -{ - uint32_t f32_bits = static_cast(bf16) << 16; - float f; - std::memcpy(&f, &f32_bits, sizeof(f)); - return f; +static float bf16_to_float(uint16_t bf16) { + uint32_t f32_bits = static_cast(bf16) << 16; + float f; + std::memcpy(&f, &f32_bits, sizeof(f)); + return f; } -static bool bf16_nearly_equal(uint16_t a, uint16_t b, - float rtol = 0.0f, float atol = 0.0f) -{ - // Default: allow 1 BF16 ULP difference to account for hardware rounding. - // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. 
- float fa = bf16_to_float(a); - float fb = bf16_to_float(b); - float diff = std::fabs(fa - fb); - - // Compute 1 ULP for the reference value's magnitude - uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) - float ulp; - if (ref_exp == 0) - ulp = std::ldexp(1.0f, -133); // subnormal ULP - else - ulp = std::ldexp(1.0f, static_cast(ref_exp) - 127 - 7); // 7 mantissa bits - - float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); - return diff <= tol; +static bool bf16_nearly_equal(uint16_t a, uint16_t b, float rtol = 0.0f, + float atol = 0.0f) { + // Default: allow 1 BF16 ULP difference to account for hardware rounding. + // A BF16 ULP at a given magnitude is the gap between adjacent BF16 values. + float fa = bf16_to_float(a); + float fb = bf16_to_float(b); + float diff = std::fabs(fa - fb); + + // Compute 1 ULP for the reference value's magnitude + uint16_t ref_exp = (b >> 7) & 0xFF; // BF16 exponent (8 bits) + float ulp; + if (ref_exp == 0) + ulp = std::ldexp(1.0f, -133); // subnormal ULP + else + ulp = std::ldexp(1.0f, + static_cast(ref_exp) - 127 - 7); // 7 mantissa bits + + float tol = std::fmax(atol + rtol * std::fabs(fb), ulp); + return diff <= tol; } // --------------------------------------------------------------------------- // Read the NPU instruction binary produced by aiecc.py // --------------------------------------------------------------------------- -static std::vector read_instr_binary(const std::string &path) -{ - std::ifstream file(path, std::ios::binary); - if (!file.is_open()) { - throw std::runtime_error("Cannot open instruction file: " + path); - } - file.seekg(0, std::ios::end); - size_t byte_size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector instr(byte_size / sizeof(uint32_t)); - file.read(reinterpret_cast(instr.data()), byte_size); - return instr; +static std::vector read_instr_binary(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + throw 
std::runtime_error("Cannot open instruction file: " + path); + } + file.seekg(0, std::ios::end); + size_t byte_size = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector instr(byte_size / sizeof(uint32_t)); + file.read(reinterpret_cast(instr.data()), byte_size); + return instr; } -int main(int argc, char **argv) -{ - // Paths to the compiled artefacts: default to the directory containing - // this binary so the test works regardless of the working directory or - // whether it is run inside a container. - std::string bin_dir; - { - std::string argv0(argv[0]); - auto sep = argv0.rfind('/'); - bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); +int main(int argc, char **argv) { + // Paths to the compiled artefacts: default to the directory containing + // this binary so the test works regardless of the working directory or + // whether it is run inside a container. + std::string bin_dir; + { + std::string argv0(argv[0]); + auto sep = argv0.rfind('/'); + bin_dir = (sep == std::string::npos) ? "." : argv0.substr(0, sep); + } + std::string xclbin_path = bin_dir + "/network.xclbin"; + std::string instr_path = bin_dir + "/npu_insts.bin"; + + bool verbose = false; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg == "-v" || arg == "--verbose" || arg == "-vv") { + verbose = true; } - std::string xclbin_path = bin_dir + "/network.xclbin"; - std::string instr_path = bin_dir + "/npu_insts.bin"; - - bool verbose = false; - for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (arg == "-v" || arg == "--verbose" || arg == "-vv") { - verbose = true; - } + } + if (argc >= 2 && argv[1][0] != '-') + xclbin_path = argv[1]; + if (argc >= 3 && argv[2][0] != '-') + instr_path = argv[2]; + + // ----------------------------------------------------------------------- + // 1. 
Open XRT device, register xclbin, create hw_context + // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) + // ----------------------------------------------------------------------- + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(xclbin_path); + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, "MLIR_AIE"); + + // ----------------------------------------------------------------------- + // 2. Read NPU instruction binary + // ----------------------------------------------------------------------- + std::vector instr_v = read_instr_binary(instr_path); + size_t n_instr = instr_v.size(); + + // ----------------------------------------------------------------------- + // 3. Derive element counts from the testinputs/testoutputs header defines. + // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set + // by generateNetwork_xdna2.py. + // ----------------------------------------------------------------------- + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, + "Input 0 and input 1 must have the same number of elements"); + static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, + "Inputs and output must have the same number of elements"); + + const size_t n_elem = N_ELEMENTS_OUTPUT0; + const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes + const size_t buf_bytes = n_elem * elem_size; + + // ----------------------------------------------------------------------- + // 4. 
Allocate XRT buffer objects + // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) + // ----------------------------------------------------------------------- + auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in0 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_in1 = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = + xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + // ----------------------------------------------------------------------- + // 5. Copy data into device buffers + // ----------------------------------------------------------------------- + std::memcpy(bo_instr.map(), instr_v.data(), + n_instr * sizeof(uint32_t)); + std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); + std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ----------------------------------------------------------------------- + // 6. Launch kernel and wait for completion + // opcode 3 = execute NPU instruction stream + // ----------------------------------------------------------------------- + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, static_cast(n_instr), bo_in0, + bo_in1, bo_out); + run.wait(); + + // ----------------------------------------------------------------------- + // 7. 
Sync output back and compare against golden reference + // ----------------------------------------------------------------------- + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + const uint16_t *hw_out = bo_out.map(); + const uint16_t *golden_out = testOutputVector0; + + int errors = 0; + for (size_t i = 0; i < n_elem; ++i) { + bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); + if (!match) { + ++errors; + if (errors <= 10) { + std::cerr << " Mismatch at index " << i + << ": hw=" << bf16_to_float(hw_out[i]) << " (0x" << std::hex + << hw_out[i] << std::dec << ")" + << " ref=" << bf16_to_float(golden_out[i]) << " (0x" + << std::hex << golden_out[i] << std::dec << ")" + << " diff=" + << std::fabs(bf16_to_float(hw_out[i]) - + bf16_to_float(golden_out[i])) + << "\n"; + } } - if (argc >= 2 && argv[1][0] != '-') xclbin_path = argv[1]; - if (argc >= 3 && argv[2][0] != '-') instr_path = argv[2]; - - // ----------------------------------------------------------------------- - // 1. Open XRT device, register xclbin, create hw_context - // (matches mlir-aie test_utils::init_xrt_load_kernel pattern) - // ----------------------------------------------------------------------- - auto device = xrt::device(0); - auto xclbin = xrt::xclbin(xclbin_path); - device.register_xclbin(xclbin); - xrt::hw_context context(device, xclbin.get_uuid()); - auto kernel = xrt::kernel(context, "MLIR_AIE"); - - // ----------------------------------------------------------------------- - // 2. Read NPU instruction binary - // ----------------------------------------------------------------------- - std::vector instr_v = read_instr_binary(instr_path); - size_t n_instr = instr_v.size(); - - // ----------------------------------------------------------------------- - // 3. Derive element counts from the testinputs/testoutputs header defines. - // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set - // by generateNetwork_xdna2.py. 
- // ----------------------------------------------------------------------- - static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, - "Input 0 and input 1 must have the same number of elements"); - static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, - "Inputs and output must have the same number of elements"); - - const size_t n_elem = N_ELEMENTS_OUTPUT0; - const size_t elem_size = sizeof(uint16_t); // BF16 = 2 bytes - const size_t buf_bytes = n_elem * elem_size; - - // ----------------------------------------------------------------------- - // 4. Allocate XRT buffer objects - // Kernel args: (0:opcode, 1:instr_bo, 2:instr_len, 3:in0, 4:in1, 5:out) - // ----------------------------------------------------------------------- - auto bo_instr = xrt::bo(device, n_instr * sizeof(uint32_t), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_in0 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_in1 = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, buf_bytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); - - // ----------------------------------------------------------------------- - // 5. Copy data into device buffers - // ----------------------------------------------------------------------- - std::memcpy(bo_instr.map(), instr_v.data(), n_instr * sizeof(uint32_t)); - std::memcpy(bo_in0.map(), testInputVector0, buf_bytes); - std::memcpy(bo_in1.map(), testInputVector1, buf_bytes); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ----------------------------------------------------------------------- - // 6. 
Launch kernel and wait for completion - // opcode 3 = execute NPU instruction stream - // ----------------------------------------------------------------------- - unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, static_cast(n_instr), - bo_in0, bo_in1, bo_out); - run.wait(); - - // ----------------------------------------------------------------------- - // 7. Sync output back and compare against golden reference - // ----------------------------------------------------------------------- - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - const uint16_t *hw_out = bo_out.map(); - const uint16_t *golden_out = testOutputVector0; - - int errors = 0; - for (size_t i = 0; i < n_elem; ++i) { - bool match = bf16_nearly_equal(hw_out[i], golden_out[i]); - if (!match) { - ++errors; - if (errors <= 10) { - std::cerr << " Mismatch at index " << i - << ": hw=" << bf16_to_float(hw_out[i]) - << " (0x" << std::hex << hw_out[i] << std::dec << ")" - << " ref=" << bf16_to_float(golden_out[i]) - << " (0x" << std::hex << golden_out[i] << std::dec << ")" - << " diff=" << std::fabs(bf16_to_float(hw_out[i]) - bf16_to_float(golden_out[i])) - << "\n"; - } - } - if (verbose) { - float hw_f = bf16_to_float(hw_out[i]); - float ref_f = bf16_to_float(golden_out[i]); - std::cout << "[" << i << "] hw=" << hw_f - << " ref=" << ref_f - << " diff=" << std::fabs(hw_f - ref_f) - << (match ? "" : " *** MISMATCH") - << "\n"; - } + if (verbose) { + float hw_f = bf16_to_float(hw_out[i]); + float ref_f = bf16_to_float(golden_out[i]); + std::cout << "[" << i << "] hw=" << hw_f << " ref=" << ref_f + << " diff=" << std::fabs(hw_f - ref_f) + << (match ? "" : " *** MISMATCH") << "\n"; } + } - // Output format required by testUtils/core/output_parser.py - std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; + // Output format required by testUtils/core/output_parser.py + std::cout << "Errors: " << errors << " out of " << n_elem << "\n"; - return (errors == 0) ? 
0 : 1; + return (errors == 0) ? 0 : 1; } diff --git a/DeeployTest/deeployRunner_xdna2.py b/DeeployTest/deeployRunner_xdna2.py index 82be139d46..2fd1a40418 100644 --- a/DeeployTest/deeployRunner_xdna2.py +++ b/DeeployTest/deeployRunner_xdna2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """Thin wrapper that invokes the shared Deeploy test runner for the XDNA2 platform. Usage (from DeeployTest/): @@ -14,4 +13,4 @@ from testUtils.deeployRunner import main if __name__ == '__main__': - sys.exit(main(default_platform="XDNA2", default_simulator="host", tiling_enabled=True)) + sys.exit(main(default_platform = "XDNA2", default_simulator = "host", tiling_enabled = True)) diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 995eaabbb7..43fd941926 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # # SPDX-License-Identifier: Apache-2.0 - """XDNA2 network generation script. 
JUNGVI: TODO: Move this script to ONNX4Deeploy @@ -16,12 +15,10 @@ """ import os -import struct import numpy as np import onnx import onnx_graphsurgeon as gs - from testUtils.platformMapping import mapDeployer from testUtils.testRunner import TestGeneratorArgumentParser @@ -149,8 +146,8 @@ def generateNetworkXDNA2(args): log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") - l1_level = MemoryLevel("L1", neighbourNames=["L3"], size=l1_size) - l3_level = MemoryLevel("L3", neighbourNames=["L1"], size=l3_size) + l1_level = MemoryLevel("L1", neighbourNames = ["L3"], size = l1_size) + l3_level = MemoryLevel("L3", neighbourNames = ["L1"], size = l3_size) memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 @@ -158,24 +155,23 @@ def generateNetworkXDNA2(args): # defaultTargetMemoryLevel=L1 tells the tiling framework that computation # targets L1, so it must tile data from L3 into L1-sized chunks. mem_platform = MemoryXDNA2Platform( - memoryHierarchy=memory_hierarchy, - defaultTargetMemoryLevel=l1_level, - engines=[XDNA2AIECoreEngine(Mapping=XDNA2TilingMapping, preferredMemoryLevel="L1")] - ) + memoryHierarchy = memory_hierarchy, + defaultTargetMemoryLevel = l1_level, + engines = [XDNA2AIECoreEngine(Mapping = XDNA2TilingMapping, preferredMemoryLevel = "L1")]) # Create base deployer with memory platform deployer = mapDeployer(mem_platform, graph, inputTypes, - scheduler=_tilingScheduler, - deeployStateDir=_DEEPLOYSTATEDIR, - inputOffsets=inputOffsets) + scheduler = _tilingScheduler, + deeployStateDir = _DEEPLOYSTATEDIR, + inputOffsets = inputOffsets) # Wrap with MemoryDeployerWrapper (adds memory level annotation) deployer = MemoryDeployerWrapper(deployer) # Wrap with TilerDeployerWrapper (adds tiling) - deployer = TilerDeployerWrapper(deployer, workDir=_DEEPLOYSTATEDIR) + deployer = TilerDeployerWrapper(deployer, workDir = _DEEPLOYSTATEDIR) # frontEnd() parses the graph; 
bind() triggers tiling via wrappers deployer.frontEnd() @@ -184,7 +180,7 @@ def generateNetworkXDNA2(args): log.info("[XDNA2] Tiling completed, proceeding with MLIR generation") # Create output directory - os.makedirs(args.dumpdir, exist_ok=True) + os.makedirs(args.dumpdir, exist_ok = True) # Write testinputs.h (raw BF16 bit patterns as uint16_t) testInputStr = _generate_xdna2_inputs_header(test_inputs_f32) @@ -215,8 +211,8 @@ def generateNetworkXDNA2(args): if __name__ == '__main__': - parser = TestGeneratorArgumentParser(tiling_arguments=True, - description="Deeploy XDNA2 Code Generation Utility.") + parser = TestGeneratorArgumentParser(tiling_arguments = True, + description = "Deeploy XDNA2 Code Generation Utility.") args, _ = parser.parse_known_args() if args.platform != 'XDNA2': diff --git a/TargetLibraries/XDNA2/kernels/add.cc b/TargetLibraries/XDNA2/kernels/add.cc index 1a53e47398..13b8b54637 100644 --- a/TargetLibraries/XDNA2/kernels/add.cc +++ b/TargetLibraries/XDNA2/kernels/add.cc @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All +// rights reserved. 
SPDX-License-Identifier: Apache-2.0 #define NOCPP @@ -10,45 +10,45 @@ #include #include -template void eltwise_add(T_in *a, T_in *b, T_out *c, int size) -{ - for (int i = 0; i < size; i++) { - c[i] = a[i] + b[i]; - } +template +void eltwise_add(T_in *a, T_in *b, T_out *c, int size) { + for (int i = 0; i < size; i++) { + c[i] = a[i] + b[i]; + } } -template void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) -{ - constexpr int vec_factor = 16; - event0(); - T_in *__restrict pA1 = a; - T_in *__restrict pB1 = b; - T_out *__restrict pC1 = c; - const int F = size / vec_factor; - AIE_PREPARE_FOR_PIPELINING - AIE_LOOP_MIN_ITERATION_COUNT(16) - for (int i = 0; i < F; i++) { - aie::vector A0 = aie::load_v(pA1); - pA1 += vec_factor; - aie::vector B0 = aie::load_v(pB1); - pB1 += vec_factor; - aie::vector cout = aie::add(A0, B0); - aie::store_v(pC1, cout); - pC1 += vec_factor; - } - event1(); +template +void eltwise_vadd(T_in *a, T_in *b, T_out *c, int size) { + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = size / vec_factor; + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < F; i++) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); } extern "C" { -void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) -{ - eltwise_add(a_in, b_in, c_out, size); +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_add(a_in, b_in, c_out, size); } -void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int size) -{ - eltwise_vadd(a_in, b_in, c_out, size); +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, + int size) { + eltwise_vadd(a_in, b_in, c_out, size); } } 
// extern "C" From 4427f5a7b0ac73777a75a30e973c0ea66a7715eb Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 26 Mar 2026 11:49:10 +0100 Subject: [PATCH 15/16] Add general todos for future refactoring --- CMakeLists.txt | 3 --- Deeploy/Targets/XDNA2/Bindings.py | 2 ++ DeeployTest/Platforms/XDNA2/main.cpp | 3 +++ DeeployTest/generateNetwork_xdna2.py | 12 +++++------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c23ccca7b..ffc4d64085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,9 +311,6 @@ if(platform STREQUAL XDNA2) message(STATUS "==============================================================================") message(STATUS "") - # XDNA2 uses its own CMakeLists.txt in DeeployTest/Platforms/XDNA2/ - # which handles the two-step build: xclbin -> host binary. - # AIE kernel compilation is in TargetLibraries/XDNA2/. add_subdirectory(TargetLibraries/XDNA2) add_subdirectory(DeeployTest/Platforms/XDNA2) diff --git a/Deeploy/Targets/XDNA2/Bindings.py b/Deeploy/Targets/XDNA2/Bindings.py index 14b8b0317a..1f0e7f7587 100644 --- a/Deeploy/Targets/XDNA2/Bindings.py +++ b/Deeploy/Targets/XDNA2/Bindings.py @@ -15,6 +15,8 @@ _ADD_INPUT_KEYS = ['data_in_1', 'data_in_2'] _ADD_OUTPUT_KEYS = ['data_out'] +# JUNGVI: TODO: This logic should not be boiled down for 1 operator but should be applied on every nodes of the network +# Likewise the kernelName and object file name should be specified in the node template of each operator. XDNA2Transformer = MLIRCodeTransformation( devicePasses = [ MLIRObjectFifoPass( diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 20d748265d..0cb5186f38 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -122,6 +122,7 @@ int main(int argc, char **argv) { // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set // by generateNetwork_xdna2.py. 
// ----------------------------------------------------------------------- + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs (with respect to the amount of bo available) static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, "Input 0 and input 1 must have the same number of elements"); static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0, @@ -160,6 +161,8 @@ int main(int argc, char **argv) { // 6. Launch kernel and wait for completion // opcode 3 = execute NPU instruction stream // ----------------------------------------------------------------------- + // JUNGVI: TODO: Collect runtime and display it + // JUNGVI: TODO: Enable warmup iterations unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, static_cast(n_instr), bo_in0, bo_in1, bo_out); diff --git a/DeeployTest/generateNetwork_xdna2.py b/DeeployTest/generateNetwork_xdna2.py index 43fd941926..969c41200f 100644 --- a/DeeployTest/generateNetwork_xdna2.py +++ b/DeeployTest/generateNetwork_xdna2.py @@ -3,8 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 """XDNA2 network generation script. -JUNGVI: TODO: Move this script to ONNX4Deeploy - Replaces the generic ``generateNetwork.py`` for the XDNA2 platform. Instead of emitting C code it: @@ -32,7 +30,6 @@ def _tilingScheduler(graph: gs.Graph): - """Scheduler that returns List[List[gs.Node]] as required by the tiling framework.""" return [[node] for node in graph.nodes] @@ -135,13 +132,15 @@ def generateNetworkXDNA2(args): # Force bfloat16_t — BF16 test data stored as float32 in npz would be # inferred as float32_t by minimalFloatType, but the XDNA2 kernel # requires bfloat16_t inputs. + # JUNGVI: TODO: Align minimalFloatType to properly handle bf16 and don't force types. 
inputTypes[f"input_{index}"] = PointerClass(bfloat16_t) inputOffsets[f"input_{index}"] = 0 _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + # JUNGVI: TODO: Extend with the whole NPU array # Define memory hierarchy: L1 (AIE core local) and L3 (shared) - l1_size = int(getattr(args, 'l1', None) or 8192) # 8KB default + l1_size = int(getattr(args, 'l1', None) or 64000) # 64KB default l3_size = int(getattr(args, 'l3', None) or 128 * 1024 * 1024) # 128MB default log.info(f"[XDNA2] Using MemoryXDNA2Platform with L1={l1_size}, L3={l3_size}") @@ -151,9 +150,7 @@ def generateNetworkXDNA2(args): memory_hierarchy = MemoryHierarchy([l1_level, l3_level]) memory_hierarchy.setDefaultMemoryLevel("L3") # Tensors default to L3 - # Create memory-aware platform with AIE core engine - # defaultTargetMemoryLevel=L1 tells the tiling framework that computation - # targets L1, so it must tile data from L3 into L1-sized chunks. + # Create memory-aware platform with AIE core engines mem_platform = MemoryXDNA2Platform( memoryHierarchy = memory_hierarchy, defaultTargetMemoryLevel = l1_level, @@ -187,6 +184,7 @@ def generateNetworkXDNA2(args): with open(f'{args.dumpdir}/testinputs.h', 'w') as f: f.write(testInputStr) + # JUNGVI: TODO: Move this in ONNX4Deeploy # Recompute golden outputs from the actual BF16 inputs the hardware will # see. The original outputs.npz may have been computed in float32 # precision, which can differ by several BF16 ULPs. 
From a82fd526a15fe817dee6e27b4b5589096d6af03c Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Thu, 26 Mar 2026 11:49:34 +0100 Subject: [PATCH 16/16] Format --- DeeployTest/Platforms/XDNA2/main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DeeployTest/Platforms/XDNA2/main.cpp b/DeeployTest/Platforms/XDNA2/main.cpp index 0cb5186f38..7984ef8130 100644 --- a/DeeployTest/Platforms/XDNA2/main.cpp +++ b/DeeployTest/Platforms/XDNA2/main.cpp @@ -122,7 +122,8 @@ int main(int argc, char **argv) { // N_ELEMENTS_INPUT0, N_ELEMENTS_INPUT1, N_ELEMENTS_OUTPUT0 are set // by generateNetwork_xdna2.py. // ----------------------------------------------------------------------- - // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs (with respect to the amount of bo available) + // JUNGVI: TODO: Remove this assert and make it scalable for N I/Os graphs + // (with respect to the amount of bo available) static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_INPUT1, "Input 0 and input 1 must have the same number of elements"); static_assert(N_ELEMENTS_INPUT0 == N_ELEMENTS_OUTPUT0,