diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml new file mode 100644 index 0000000000..d456f9f353 --- /dev/null +++ b/.github/workflows/_runner-gap9-tiled.yml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-gap9-tiled + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-markers: + required: true + type: string + +jobs: + test-runner-gap9-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + pip install -e . || true + deactivate + - name: Cache ccache + uses: actions/cache/restore@v4 + with: + path: /app/.ccache + key: ccache-gap9 + - name: Run Test + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation + export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9 + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -m "${{ inputs.pytest-markers }}" + deactivate + shell: bash diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml new file mode 100644 index 0000000000..d5d8d8e4c0 --- /dev/null +++ b/.github/workflows/_runner-gap9.yml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-gap9 + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + 
required: true + type: string + +jobs: + test-runner-gap9: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + pip install -e . || true + deactivate + - name: Cache ccache + uses: actions/cache/restore@v4 + with: + path: /app/.ccache + key: ccache-gap9 + - name: Run Test + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation + export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9 + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "gap9 and ${{ inputs.pytest-marker }}" + deactivate + shell: bash diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 5493758672..fa3c56197b 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -64,4 +64,4 @@ jobs: - name: Check Licenses shell: bash run: | - python scripts/reuse_skip_wrapper.py $(find . \( -name '*.py' -o -name '*.c' -o -name '*.h' -o -name '*.html' -o -name '*.rst' -o -name '*.yml' -o -name '*.yaml' \) -not -path "*toolchain*" -not -path "*third_party*" -not -path "*.git/*" -not -path "*install/*" -type f) + python scripts/reuse_skip_wrapper.py $(find . 
\( -name '*.py' -o -name '*.c' -o -name '*.h' -o -name '*.html' -o -name '*.rst' -o -name '*.yml' -o -name '*.yaml' \) -not -path "*toolchain*" -not -path "*third_party*" -not -path "*prebuilt*" -not -path "*.git/*" -not -path "*install/*" -type f) diff --git a/.github/workflows/ci-platform-gap9-tiled.yml b/.github/workflows/ci-platform-gap9-tiled.yml new file mode 100644 index 0000000000..0043f8d3e9 --- /dev/null +++ b/.github/workflows/ci-platform-gap9-tiled.yml @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • GAP9 (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy-gap9:latest" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + # Prefer the workflow_dispatch input; otherwise, in pulp-platform/Deeploy, use the published GAP9 image + docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || (github.repository == 'pulp-platform/Deeploy' && 'ghcr.io/pulp-platform/deeploy-gap9:latest') }} + + gap9-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-gap9-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-markers: "gap9_tiled and kernels and singlebuffer and l2" + + gap9-kernels-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-gap9-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-markers: "gap9_tiled and kernels and doublebuffer and l2" + + gap9-models-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-gap9-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image
}} + pytest-markers: "gap9_tiled and models and singlebuffer and l2" + + gap9-models-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-gap9-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-markers: "gap9_tiled and models and doublebuffer and l2" diff --git a/.github/workflows/ci-platform-gap9.yml b/.github/workflows/ci-platform-gap9.yml new file mode 100644 index 0000000000..079f13c2a5 --- /dev/null +++ b/.github/workflows/ci-platform-gap9.yml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • GAP9 + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy-gap9:latest" + + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || (github.repository == 'pulp-platform/Deeploy' && 'ghcr.io/pulp-platform/deeploy-gap9:latest') }} + + gap9-kernels: + needs: select-env + uses: ./.github/workflows/_runner-gap9.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" + + gap9-models: + needs: select-env + uses: ./.github/workflows/_runner-gap9.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models" diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml new file mode 100644 index 0000000000..b189bfd708 --- /dev/null +++ b/.github/workflows/infra-generate-ccache-gap9.yml @@ -0,0 +1,54 @@ +# 
SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: Infrastructure • Generate CCache GAP9 + +"on": + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy GAP9 Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy-gap9:latest" + schedule: + # Runs the workflow on the default branch every day at 02:00 UTC (GitHub evaluates cron schedules in UTC) to keep the cache fresh + - cron: "0 2 * * *" + +jobs: + generate-ccache-gap9: + runs-on: ubuntu-latest + container: + image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:latest' }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + pip install -e . || true + deactivate + + - name: Generate CCache for GAP9 + run: | + source /app/install/gap9-sdk/.gap9-venv/bin/activate + source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true + export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation + export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9 + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest 'test_platforms.py::test_gap9_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_gap9_tiled_kernels_l2_singlebuffer[Kernels/Integer/Add/Large-5000-L2-singlebuffer]' --skipsim + deactivate + shell: bash + - name: Clean and Upload CCache + uses: actions/cache@v4 + with: + path: /app/.ccache + key: ccache-gap9 diff --git a/.gitmodules b/.gitmodules index ea01f2734d..f8bad9fc4e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,10 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 [submodule "pulp-nn-mixed"] - path = TargetLibraries/PULPOpen/third_party/pulp-nn-mixed + path = TargetLibraries/third_party/pulp-nn-mixed url =
https://github.com/pulp-platform/pulp-nn-mixed.git [submodule "pulp-nnx"] - path = TargetLibraries/PULPOpen/third_party/pulp-nnx + path = TargetLibraries/third_party/pulp-nnx url = https://github.com/pulp-platform/pulp-nnx.git [submodule "CMSIS-NN"] path = TargetLibraries/CMSIS/third_party/CMSIS-NN diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8698b5ed..871dcb66cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,15 @@ This file contains the changelog for the Deeploy project. The changelog is divid ### List of Pull Requests +- Deeploy-GAP9 Platform [#143](https://github.com/pulp-platform/Deeploy/pull/143) - Update CLI interface Across Project, Fix Tutorial, and Remove Legacy Test [#157](https://github.com/pulp-platform/Deeploy/pull/157) ### Added -- +- Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows ### Changed +- Update `pulp-nnx` and `pulp-nn-mixed` submodules to their latest versions +- PULP-NN moved to TargetLibraries third-party folder - Aligned CLI commands across the project ### Fixed @@ -50,7 +53,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) ### Added -- The `publish.yml` action to build a branch and push it to PyPi. The action is automatically triggered when a tag with the "v*" format is emitted. +- The `publish.yml` action to build a branch and push it to PyPi. The action is automatically triggered when a tag with the "v*" format is emitted. - I created a release of [Banshee](https://github.com/pulp-platform/banshee/releases/tag/v0.5.0-prebuilt) so we don't need to rebuild it over and over. The `Makefile` now pulls that release depending on the platform. - I bumped the onnx-graphsurgeon version such that we don't need to use NVIDIA's PyPi index anymore. - `_export_graph` assigns their export type to the tensors before export. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 70dec13084..4c8a024c15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.19) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) @@ -14,13 +14,14 @@ set(CMAKE_C_COMPILER_LAUNCHER "ccache") set(CMAKE_CXX_COMPILER_LAUNCHER "ccache") set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE) +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -32,6 +33,16 @@ elseif(platform STREQUAL Siracusa_w_neureka) message(STATUS "Building for platform 'Siracusa_w_neureka'") elseif(platform STREQUAL PULPOpen) message(STATUS "Building for platform 'PULP-Open'") +elseif(platform STREQUAL GAP9) + message(STATUS "Building for platform 'GAP9'") + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + # Fail early with a clear message instead of trying to include "/utils/cmake/setup.cmake" + if(NOT DEFINED ENV{GAP_SDK_HOME}) + message(FATAL_ERROR "GAP_SDK_HOME is not set; source the GAP9 SDK environment before configuring the GAP9 platform") + endif() + # Absolute path, so the value of KCONFIG_CONFIG does not depend on the working directory of whoever reads it + set(ENV{KCONFIG_CONFIG} "${CMAKE_CURRENT_LIST_DIR}/DeeployTest/Platforms/GAP9/sdk.config") + include($ENV{GAP_SDK_HOME}/utils/cmake/setup.cmake) elseif(platform STREQUAL Generic) message(STATUS "Building for platform 'Generic'") elseif(platform STREQUAL Snitch) @@ -46,9 +57,14 @@ endif() # Import useful functions / macros include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake)
-include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake) +# common.cmake is included for every platform except GAP9 +if(NOT platform STREQUAL GAP9) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake) +endif() include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake) +add_library(deeploylib INTERFACE) + message(STATUS "============================= Project Configuration ============================") message(STATUS "[Deeploy] platform = " ${platform}) message(STATUS "[Deeploy] use_dma = " ${use_dma}) @@ -211,6 +227,27 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor endif() +if(platform STREQUAL GAP9) + project(${TESTNAME} LANGUAGES C ASM) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_gvsoc.cmake) + add_compile_options( + -Wno-error=unknown-pragmas + ) + + add_compile_definitions( + DEEPLOY_GAP9_PLATFORM + ) + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(DeeployTest) + setupos(${TESTNAME}) + add_subdirectory(TargetLibraries/GAP9) + target_include_directories(deeploygap9 PUBLIC TargetLibraries/Generic/inc) + + target_link_libraries(deeploylib INTERFACE deeploybasic deeploygap9) + +endif() + if(platform STREQUAL Snitch) if(TOOLCHAIN STREQUAL LLVM) diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py new file mode 100644 index 0000000000..0e7b052f46 --- /dev/null +++ b/Deeploy/Targets/GAP9/Bindings.py @@ -0,0 +1,399 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +GAP9-specific bindings and code-transformation pipelines. + +The tiling passes in this module are configured with the GAP9 DMA back-ends: +GAP9MchanDma for L2 <-> L1 transfers and gap9L3DmaHack for L3 <-> L2 transfers.
+""" + +import itertools + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration, MemoryPassthroughGeneration +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack +from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma +# Import templates from PULPOpen and Generic +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ + FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ + GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ + QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ + SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.PULPOpen.Bindings import ForkClosure, L3MemoryAwareFunctionCallClosure, \ + MemoryAwareForkTransformer, MemoryAwareFunctionCallClosure, TilingCallClosure +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import 
PULPProfileUntiled +from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ + SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ + iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ + PULPRequantShiftChecker +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate + +# GAP9 transformer: tiles L2 <-> L1 with GAP9MchanDma and L3 <-> L2 with gap9L3DmaHack +GAP9Transformer = CodeTransformation([ + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False), + PULPSynchCoresPass(), + ForkClosure(writeback = False, generateStruct = True), + TilingVariableReplacementUpdate("L1"), + PULPClusterTiling("L2", "L1", GAP9MchanDma()), # Use GAP9MchanDma instead of ClDma + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + TilingVariableReplacement("L2"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA + PULPProfileUntiled(), + ArgumentStructGeneration(), + L3MemoryAwareFunctionCallClosure(writeback = False), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), +]) + +# GAP9 cluster transformer: same DMA back-ends as GAP9Transformer, without the PULPSynchCoresPass and ForkClosure passes +GAP9ClusterTransformer = CodeTransformation([ + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False,
generateStruct = True), + TilingVariableReplacementUpdate("L1"), + PULPClusterTiling("L2", "L1", GAP9MchanDma()), # Use GAP9MchanDma instead of ClDma + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + TilingVariableReplacement("L2"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA + PULPProfileUntiled(), + ArgumentStructGeneration(), + L3MemoryAwareFunctionCallClosure(writeback = False), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), +]) + +# Simple transformer for non-tiling cases +GAP9SimpleTransformer = CodeTransformation([ + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), +]) + +# Skip transformer (no DMA operations) +GAP9SkipTransformer = CodeTransformation( + [ArgumentStructGeneration(), + MemoryPassthroughGeneration("L.*"), + MemoryPassthroughGeneration(), + FutureGeneration()]) + +# =============================================================================== +# GAP9-specific bindings; GAP9Transformer / GAP9ClusterTransformer tile with GAP9MchanDma and gap9L3DmaHack +# Exceptions: GAP9DMASliceBindings (MemoryAwareForkTransformer) and GAP9ReshapeBindings (GAP9SkipTransformer) +# =============================================================================== + +GAP9DMASliceBindings = [ + AutoFutureBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + for type in IntegerDataTypes +] + +GAP9SliceBindings = [ + NodeBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PointerClass(type)]), SliceTemplate.referenceTemplate, GAP9Transformer) for type in FloatDataTypes +] + +GAP9ReshapeBindings = [ +
NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, GAP9SkipTransformer) for type in IntegerDataTypes + FloatDataTypes +] + +GAP9RQAddBindings = [ + NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]), + RQAddTemplate.referenceTemplate, GAP9Transformer) + for _type in [int8_t, uint8_t] + for _type2 in [int8_t, uint8_t] + for _type3 in [int8_t, uint8_t] +] + +GAP9AddBindings = [ + NodeBinding(AddChecker([PointerClass(type1), PointerClass(type2)], [PointerClass(int32_t)]), + AddTemplate.referenceTemplate, GAP9Transformer) + for type1 in IntegerDataTypes + for type2 in IntegerDataTypes +] + [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9RQSConv2DBindings = [ + NodeBinding( + PULPConvChecker([ + PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(type2)]), ConvTemplate.PULPConv2D_8_Template, GAP9Transformer) + for type1, type2 in zip([int8_t, int8_t, uint8_t, uint8_t], [int8_t, uint8_t, int8_t, uint8_t]) +] + +GAP9RQSDWConv2DBindings = [ + NodeBinding( + PULPConvChecker([ + PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t) + ], [PointerClass(type2)]), ConvTemplate.PULPDWConv2D_8_Template, GAP9Transformer) + for type1, type2 in zip([int8_t, int8_t, uint8_t, uint8_t], [int8_t, uint8_t, int8_t, uint8_t]) +] + +GAP9RQSGEMM_8_Binding = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), GEMMTemplate.PULPGEMM_8_Template, + GAP9Transformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t]) +] + +GAP9FloatGEMMBindings = [ + 
NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGemmTemplate.referenceTemplate, + GAP9Transformer) +] + +GAP9FloatConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DIm2ColTemplate, + GAP9Transformer) +] + +GAP9FloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + GAP9Transformer) for float_type in FloatDataTypes +] + +GAP9RQSMatrixVecBindings = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), MatrixVectorTemplate.referenceTemplate, + GAP9Transformer) for type1, type2 in zip([int8_t], [int8_t]) +] + +GAP9RQSTallGEMMBindings = [ + NodeBinding( + PULPLinearChecker([PointerClass(type1), + PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type2)]), TallGEMMTemplate.referenceTemplate, + GAP9Transformer) for type1, type2 in zip([int8_t], [int8_t]) +] + +GAP9RQSGEMMBindings = GAP9RQSGEMM_8_Binding + +GAP9MaxPool2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(type)], [PointerClass(type)]), + MaxPool2DTemplate.PULPMaxPool2D_8_Template, GAP9Transformer) for type in [int8_t, uint8_t] +] + [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMaxPoolTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9Conv1DBinding = NodeBinding( + PULPConvChecker( + [PointerClass(int8_t), PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPConv1D_8_Template, GAP9Transformer) + +GAP9DWConv1DBinding = NodeBinding( + PULPConvChecker( + 
[PointerClass(int8_t), PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPDWConv1D_8_Template, GAP9Transformer) + +GAP9MatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + GEMMTemplate.PULPMM_8_Template, GAP9ClusterTransformer) +] + [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMatMulTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9ReduceMeanBindings = [ + NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, + GAP9ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, GAP9ClusterTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes +] + +GAP9ReduceSumBindings = [ + NodeBinding(ReduceMeanChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceSumTemplate.referenceTemplate, GAP9ClusterTransformer) +] + +GAP9UniformRQSBindings = [ + NodeBinding( + PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), + UniformRequantShiftTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes +] + +GAP9RQSBindings = [ + NodeBinding( + PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), + RequantShiftTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes +] + [ + NodeBinding( + PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(uint8_t)]), + RequantShiftTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes +] + +GAP9SoftmaxBindings = [ + 
NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate, + GAP9Transformer) for _type in [int8_t, uint8_t] +] + [ + NodeBinding(SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatSoftmaxTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9SoftmaxGradBindings = [ + NodeBinding(SoftmaxChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatSoftmaxTemplate.referenceGradientTemplate, GAP9Transformer) +] + +GAP9SoftmaxCrossEntropyLossBindings = [ + NodeBinding( + SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), + SoftmaxCrossEntropyLossTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes +] + +GAP9SoftmaxCrossEntropyLossGradBindings = [ + NodeBinding( + SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), + SoftmaxCrossEntropyLossTemplate.referenceGradientTemplate, GAP9Transformer) for type in IntegerDataTypes +] + +GAP9SGDBindings = [ + NodeBinding(SGDChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SGDTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9TransposeBindings = [ + NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate, + GAP9Transformer) for type in IntegerDataTypes +] + [ + NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + TransposeTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9ConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for type in IntegerDataTypes +] + +GAP9iRMSNormBindings = [ + NodeBinding(LayerNormChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]), + iRMSNormTemplate.referenceTemplate, GAP9Transformer) +] + 
+GAP9iHardswishBindings = [ + NodeBinding(HardswishChecker([PointerClass(int8_t)], [PointerClass(int32_t)]), iHardswishTemplate.referenceTemplate, + GAP9ClusterTransformer) +] + +GAP9RQSiHardswishBindings = [ + NodeBinding( + RQHardswishChecker([PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), RQSiHardswishTemplate.referenceTemplate, + GAP9Transformer) +] + +GAP9iRQSGELUBindings = [ + NodeBinding( + GELUChecker([PointerClass(int8_t), + PointerClass(int32_t), + PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(int8_t)]), RQSiGELUTemplate.referenceTemplate, + GAP9ClusterTransformer) +] + +GAP9MulBindings = [ + NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]), + MulTemplate.referenceTemplate, GAP9Transformer) + for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes) +] + [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9ReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReluTemplate.referenceTemplate, GAP9Transformer) + +GAP9LayernormBinding = NodeBinding( + LayerNormChecker( + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, + GAP9Transformer) + +GAP9FloatGELUBinding = NodeBinding( + GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGELUTemplate.referenceTemplate, GAP9Transformer) + +GAP9GatherBindings = [ + NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), + GatherTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes +] + +GAP9QuantBindings = [ + NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]), 
QuantTemplate.referenceTemplate, + GAP9Transformer), +] + +GAP9DequantBindings = [ + NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, + GAP9Transformer), +] + [ + NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, + GAP9Transformer), +] diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py new file mode 100644 index 0000000000..adbf161328 --- /dev/null +++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \ + PerTensorWaitingStrategy + + +class GAP9L3DmaFuture(Future): + """Future tracking one L3 RAM copy through a zero-initialised ``pi_cl_ram_req_t`` request.""" + + _initTemplate = NodeTemplate("pi_cl_ram_req_t ${name} = {0};") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + # The generated wait is guarded by ``size != 0``, so a request still at its ``{0}`` initialiser is not waited on. + _waitTemplate = NodeTemplate(""" + if (${name}.size != 0) { + pi_cl_ram_copy_wait(&${name}); + }""") + + +class GAP9L3Dma(AsyncDma): + """Async DMA description emitting ``pi_cl_ram_copy_2d`` calls; only rank-2 transfers have a template.""" + + _transferTemplates = { + 2: + NodeTemplate( + "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + ) + } + _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + """Run the base-class checks, then require a unit innermost external stride and a contiguous 2-D local tile.""" + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert
strideExt[-1] == 1, \ + "GAP9 RAM API requires contiguous transfers of the innermost dimension for external memory" + assert strideLoc[0] == shape[1] and strideLoc[1] == 1, \ + f"GAP9 RAM API requires contiguous transfers for local memory. Received local shape: {shape}, stride: {strideLoc}" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + """Extend the base-class representation with ``ext2loc``, ``transfer_size``, ``length`` and ``stride``.""" + operatorRepresentation = super().transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, + direction, future) + operatorRepresentation.update({ + "ext2loc": 1 if direction == "ExternalToLocal" else 0, + "transfer_size": math.prod(shape), + "length": shape[1], + "stride": strideExt[0], + }) + return operatorRepresentation + + +# Blocking adapter for L3 DMA (used in GAP9 L3 tiling) +gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma()) diff --git a/Deeploy/Targets/GAP9/DMA/MchanDma.py b/Deeploy/Targets/GAP9/DMA/MchanDma.py new file mode 100644 index 0000000000..14e7eb0930 --- /dev/null +++ b/Deeploy/Targets/GAP9/DMA/MchanDma.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future + + +class MchanTransferFuture(Future): + """Future holding an MCHAN transfer id: -1 until allocated; the wait code also frees the id.""" + _initTemplate = NodeTemplate("int ${name} = -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("${name} = mchan_transfer_get_id();") + + _waitTemplate = NodeTemplate(""" + if (${name} >= 0) { + mchan_transfer_wait(${name}); + mchan_transfer_free(${name}); + } + """) + + +class GAP9MchanDma(AsyncDma): + + _transferTemplates = { + 1: + NodeTemplate( +
"{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext} }; mchan_transfer_push_1d(__mchan_tmp); }" + ), + 2: + NodeTemplate( + "{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext}, .ext_size_1d = ${size_1d}, .ext_stride_1d = ${stride_2d} }; mchan_transfer_push_2d(__mchan_tmp); }" + ), + } + _waitingStrategy = DirectionWaitingStrategy(MchanTransferFuture, "transfer") + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + + transferRank = len(shape) + # MCHAN v7 requires contiguous transfers for innermost dimension in external memory + assert strideExt[ + -1] == 1, "GAP9 MCHAN supports only contiguous transfers of the innermost dimension for external memory" + + # Local memory (TCDM) must also be contiguous + if transferRank == 1: + assert strideLoc[0] == 1, "GAP9 MCHAN supports only contiguous transfers for local memory" + else: + assert strideLoc[0] == shape[1] and strideLoc[ + 1] == 1, "GAP9 MCHAN supports only contiguous transfers for local memory" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation = super().transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, + direction, future) + + transferRank = len(shape) + + # Build MCHAN command using flags from mchan.h + # We construct the cmd value in Python and let the C code use the macros + mchanFlags 
= 0 + mchanFlags += (1 << 0) if direction == "ExternalToLocal" else 0 # direction + mchanFlags += (1 << 1) # increment addresses + mchanFlags += (1 << 2) if transferRank == 2 else 0 # 2d transfer + mchanFlags += (1 << 3) # event enable + + mchanTransferSize = math.prod(shape) + mchanTransferSizeBits = math.ceil(math.log2(mchanTransferSize)) if mchanTransferSize > 0 else 0 + assert mchanTransferSizeBits <= 17, ( + "The transfer size is not representable with 17 bits. " + f"Received transfer size {mchanTransferSize} that requires {mchanTransferSizeBits} bits") + + # cmd = (flags << 17) + size, matching PULPOpen MchanDma pattern + operatorRepresentation["cmd"] = (mchanFlags << 17) + mchanTransferSize + operatorRepresentation["size"] = mchanTransferSize + + if transferRank == 2: + operatorRepresentation["size_1d"] = shape[1] + operatorRepresentation["stride_2d"] = strideExt[0] + + return operatorRepresentation diff --git a/Deeploy/Targets/GAP9/Deployer.py b/Deeploy/Targets/GAP9/Deployer.py new file mode 100644 index 0000000000..f179754c2d --- /dev/null +++ b/Deeploy/Targets/GAP9/Deployer.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +GAP9-specific deployer that uses cl_dma.h API. + +This deployer extends PULPDeployer to use GAP9-specific DMA (ClDma) via +the GAP9Bindings transformers. 
+""" + +from typing import Callable, Dict, Type + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer +from Deeploy.Targets.GAP9.Bindings import GAP9ClusterTransformer, GAP9SimpleTransformer, GAP9Transformer +from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer + +# GAP9-specific L3 RAM allocation and loading templates +_GAP9L3AllocTemplate = NodeTemplate(""" +${locPtr} = cl_ram_malloc(${size}); +""") + +_GAP9L3InitTemplate = NodeTemplate(""" +load_file_to_ram(${locPtr}, "${extName}.hex"); +""") + + +class GAP9Deployer(PULPDeployer): + """ + GAP9-specific deployer using cl_dma.h API. + + This deployer uses GAP9-specific transformers that employ ClDma (cl_dma.h) + instead of the low-level MCHAN API used by PULPDeployer. + + The key differences from PULPDeployer: + - DMA: Uses ClDma (PMSIS cl_dma.h) instead of MchanDma (MCHAN hardware API) + - L3 RAM: Uses GAP9 APS256XXN OctaSPI RAM accessed via pi_cl_ram_* APIs + - File System: Uses ReadFS to load L3 data from flash + """ + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets = {}): + super().__init__(graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets) + + # Override transformers to use GAP9-specific ones with ClDma + self.Transformer = GAP9Transformer + self.ClusterTransformer = GAP9ClusterTransformer + self.SimpleTransformer = 
GAP9SimpleTransformer + + def generateBufferAllocationCode(self) -> str: + retStr = SignPropDeployer.generateBufferAllocationCode(self) + + L3FileStr = "" + globalConstBuffers = [ + buf for key, buf in self.ctxt.globalObjects.items() if isinstance(buf, VariableBuffer) and buf._deploy + ] + nonArenaBuffers = [buf for buf in globalConstBuffers if buf._users != []] + outputBuffNames = [outputBuffer.name for outputBuffer in self.graph.outputs] + + # Find all L3 constant buffers + l3ConstBuffer = [] + for buf in nonArenaBuffers: + if hasattr(buf, "_memoryLevel") and buf._memoryLevel == "L3" and buf.name not in outputBuffNames: + l3ConstBuffer.append(buf) + + # Generate allocation and loading code for each L3 buffer + for idx, buf in enumerate(l3ConstBuffer): + locPtr = str(buf._instance) + extName = str(idx) + buf.extName = extName # This enables hex dump generation + size = np.prod(buf.shape) * (buf._type.referencedType.typeWidth // 8) + + # Allocate L3 RAM space (for constant buffers only) + if isinstance(buf, ConstantBuffer): + L3FileStr += _GAP9L3AllocTemplate.generate({"locPtr": locPtr, "extName": extName, "size": size}) + + # Load data from ReadFS + L3FileStr += _GAP9L3InitTemplate.generate({"locPtr": locPtr, "extName": extName, "size": size}) + + retStr = retStr + L3FileStr + + return retStr diff --git a/Deeploy/Targets/GAP9/Platform.py b/Deeploy/Targets/GAP9/Platform.py new file mode 100644 index 0000000000..d40c2c4440 --- /dev/null +++ b/Deeploy/Targets/GAP9/Platform.py @@ -0,0 +1,306 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ + NodeTemplate, StructBuffer, TransientBuffer, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from 
Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.GAP9.Templates import AllocateTemplate, FreeTemplate +# Import GAP9-specific tiler bindings +from Deeploy.Targets.GAP9.Tiler import GAP9AddTilingReadyBindings, GAP9ConcatTilingReadyBindings, \ + GAP9Conv2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, GAP9FlattenTilingReadyBindings, \ + GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, GAP9GatherTilingReadyBindings, \ + GAP9iHardswishTilingReadyBindings, GAP9iRMSNormTilingReadyBindings, GAP9iRQSGELUTilingReadyBindings, \ + GAP9LayernormTilingReadyBindings, GAP9MatMulTilingReadyBindings, GAP9MaxPool2DTilingReadyBindings, \ + GAP9MulTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, GAP9ReluTilingReadyBindings, \ + GAP9RQAddTilingReadyBindings, GAP9RQSConv2DTilingReadyBindings, GAP9RQSDWConv2DTilingReadyBindings, \ + GAP9RQSGEMMTilingReadyBindings, GAP9RQSiHardswishTilingReadyBindings, GAP9RQSMatrixVecTilingReadyBindings, \ + GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \ + GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \ + GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \ + GAP9UniformRQSTilingReadyBindings +from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ + BasicRQIntegerDivBinding +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \ + LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ + ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ + SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ + TransposeLayer, iHardswishLayer, 
iRMSNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ + GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \ + QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \ + RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ + SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ + TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate +from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ + PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings, PULPSliceBindings +from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer +from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ + PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPTallGEMMParser + +# Create GAP9-specific NodeMappers +GAP9_RQAddMapper = NodeMapper(RQAddParser(), GAP9RQAddTilingReadyBindings) +GAP9_AddMapper = NodeMapper(AddParser(), GAP9AddTilingReadyBindings) +GAP9_FlattenMapper = NodeMapper(FlattenParser(), GAP9FlattenTilingReadyBindings) +GAP9_GELUMapper = NodeMapper(GELUParser(), GAP9FPGELUTilingReadyBindings) +GAP9_GatherMapper = NodeMapper(GatherParser(), GAP9GatherTilingReadyBindings) +GAP9_MulMapper = NodeMapper(MulParser(), GAP9MulTilingReadyBindings) +GAP9_Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) +GAP9_Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) +GAP9_ReshapeMapper = NodeMapper(ReshapeParser(), GAP9FlattenTilingReadyBindings) +GAP9_TransposeMapper = 
NodeMapper(TransposeParser(), GAP9TransposeTilingReadyBindings) +GAP9_UnsqueezeMapper = NodeMapper(UnsqueezeParser(), GAP9FlattenTilingReadyBindings) +GAP9_RequantShiftMapper = NodeMapper(RequantShiftParser(), GAP9RQSTilingReadyBindings) +GAP9_UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), GAP9UniformRQSTilingReadyBindings) +GAP9_ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanBindings) +GAP9_ReduceSumMapper = NodeMapper(ReduceSumParser(), GAP9ReduceSumTilingReadyBindings) +GAP9_MatMulMapper = NodeMapper(MatMulParser(), GAP9MatMulTilingReadyBindings) +GAP9_RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) +GAP9_RQGELU_int8_Mapper = NodeMapper(RQSiGELUParser(), GAP9iRQSGELUTilingReadyBindings) +GAP9_Conv1DMapper = NodeMapper(PULPConv1DParser(), [PULPConv1DBinding]) +GAP9_DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) +GAP9_FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), GAP9Conv2DTilingReadyBindings) +GAP9_Conv2DMapper = NodeMapper(PULPConv2DParser(), GAP9RQSConv2DTilingReadyBindings) +GAP9_FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), GAP9DWConv2DTilingReadyBindings) +GAP9_DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), GAP9RQSDWConv2DTilingReadyBindings) +GAP9_GEMMMapper = NodeMapper(PULPGEMMParser(), GAP9RQSGEMMTilingReadyBindings) +GAP9_FloatGEMMMapper = NodeMapper(GEMMParser(), GAP9FPGEMMTilingReadyBindings) +GAP9_MatrixVecMapper = NodeMapper(PULPMatrixVecParser(), GAP9RQSMatrixVecTilingReadyBindings) +GAP9_TallGEMMMapper = NodeMapper(PULPTallGEMMParser(), GAP9RQSTallGEMMTilingReadyBindings) +GAP9_MaxPool2DMapper = NodeMapper(MaxPool2DParser(), GAP9MaxPool2DTilingReadyBindings) +GAP9_LayerNormMapper = NodeMapper(LayerNormParser(), GAP9LayernormTilingReadyBindings) +GAP9_ReluMapper = NodeMapper(ReluParser(), GAP9ReluTilingReadyBindings) +GAP9_SoftmaxMapper = NodeMapper(SoftmaxParser(), GAP9SoftmaxTilingReadyBindings) +GAP9_SoftmaxGradMapper = 
NodeMapper(SoftmaxGradParser(), GAP9SoftmaxGradTilingReadyBindings) +GAP9_Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), GAP9SoftmaxTilingReadyBindings) +GAP9_ConcatMapper = NodeMapper(ConcatParser(), GAP9ConcatTilingReadyBindings) +GAP9_DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +GAP9_SliceMapper = NodeMapper(SliceParser(), PULPSliceBindings) +GAP9_iRMSNormMapper = NodeMapper(iRMSNormParser(), GAP9iRMSNormTilingReadyBindings) +GAP9_iHardswishMapper = NodeMapper(iHardswishParser(), GAP9iHardswishTilingReadyBindings) +GAP9_RQSiHardswishMapper = NodeMapper(RQSiHardswishParser(), GAP9RQSiHardswishTilingReadyBindings) +GAP9_SoftmaxCrossEntropyLossMapper = NodeMapper(SoftmaxCrossEntropyLossParser(), + GAP9SoftmaxCrossEntropyTilingReadyBindings) +GAP9_SoftmaxCrossEntropyLossGradMapper = NodeMapper(SoftmaxCrossEntropyLossGradParser(), + GAP9SoftmaxCrossEntropyGradTilingReadyBindings) +GAP9_SGDMapper = NodeMapper(SGDParser(), GAP9SGDTilingReadyBindings) +GAP9_QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) +GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) +GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) + +# GAP9-specific mapping using ClDma +GAP9Mapping = { + 'Conv': + ConvLayer([GAP9_FPConv2DMapper, GAP9_FPDWConv2DMapper]), + 'RequantizedConv': + PULPRQSConvLayer([GAP9_Conv2DMapper, GAP9_DWConv2DMapper, GAP9_Conv1DMapper, GAP9_DWConv1DMapper]), + 'RequantizedGemm': + PULPRQSGEMMLayer([GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper]), + 'Gemm': + GEMMLayer([GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper]), + 'Gelu': + GELULayer([GAP9_GELUMapper]), + 'LayerNormalization': + LayerNormLayer([GAP9_LayerNormMapper]), + 'MaxPool': + MaxPoolLayer([GAP9_MaxPool2DMapper]), + 'RequantizediGELU': + RQSiGELULayer([GAP9_RQGELU_int8_Mapper]), + 'RQIntegerDiv': + RQIntegerDivLayer([GAP9_RQIntegerDivMapper]), + 'MatMul': + MatMulLayer([GAP9_MatMulMapper]), + 'IntegerMean': + 
ReduceMeanLayer([GAP9_ReduceMeanMapper]), + 'iSoftmax': + SoftmaxLayer([GAP9_Softmax_int8_Mapper]), + 'Softmax': + SoftmaxLayer([GAP9_SoftmaxMapper]), + 'ReduceMean': + ReduceMeanLayer([GAP9_ReduceMeanMapper]), + 'ReduceSum': + ReduceSumLayer([GAP9_ReduceSumMapper]), + 'RequantShift': + RequantShiftLayer([GAP9_UniformRequantShiftMapper, GAP9_RequantShiftMapper]), + 'Add': + AddLayer([GAP9_AddMapper]), + 'Flatten': + ReshapeLayer([GAP9_FlattenMapper]), + 'Gather': + GatherLayer([GAP9_GatherMapper]), + 'Mul': + MulLayer([GAP9_MulMapper]), + 'Pad': + PadLayer([GAP9_Pad1DMapper, GAP9_Pad2DMapper]), + 'Relu': + ReluLayer([GAP9_ReluMapper]), + 'Reshape': + ReshapeLayer([GAP9_ReshapeMapper]), + 'Squeeze': + ReshapeLayer([GAP9_UnsqueezeMapper]), + 'Transpose': + TransposeLayer([GAP9_TransposeMapper]), + 'Unsqueeze': + ReshapeLayer([GAP9_UnsqueezeMapper]), + 'Slice': + SliceLayer([GAP9_SliceMapper, GAP9_DMASliceMapper]), + 'RequantizedAdd': + AddLayer([GAP9_RQAddMapper]), + 'Concat': + ConcatLayer([GAP9_ConcatMapper]), + 'iRMSNorm': + iRMSNormLayer([GAP9_iRMSNormMapper]), + 'iHardswish': + iHardswishLayer([GAP9_iHardswishMapper]), + 'RequantizediHardswish': + RQSiHardswishLayer([GAP9_RQSiHardswishMapper]), + 'Quant': + QuantLayer([GAP9_QuantMapper]), + 'Dequant': + QuantLayer([GAP9_DequantMapper]), + 'SoftmaxGrad': + SoftmaxGradLayer([GAP9_SoftmaxGradMapper]), + 'SoftmaxCrossEntropyLoss': + SoftmaxCrossEntropyLossLayer([GAP9_SoftmaxCrossEntropyLossMapper]), + 'SoftmaxCrossEntropyLossGrad': + SoftmaxCrossEntropyLossGradLayer([GAP9_SoftmaxCrossEntropyLossGradMapper]), + 'SGD': + SGDLayer([GAP9_SGDMapper]) +} + + +class GAP9VariableBuffer(VariableBuffer): + + initTemplate = AllocateTemplate.gap9L2InitTemplate + # allocTemplate = AllocateTemplate.gap9L2AllocateTemplate + # deallocTemplate = FreeTemplate.gap9L2LocalTemplate + + allocTemplate = AllocateTemplate.gap9GenericAllocate + deallocTemplate = FreeTemplate.gap9GenericFree + + def _bufferRepresentation(self): + + if 
hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + return { + "type": self._instance, + "name": self.name, + "size": int(np.prod(self.shape)), + "_memoryLevel": memoryLevel + } + + +class GAP9TransientBuffer(TransientBuffer): + + initTemplate = AllocateTemplate.gap9L2InitTemplate + allocTemplate = AllocateTemplate.gap9GenericAllocate + deallocTemplate = FreeTemplate.gap9GenericFree + + # allocTemplate = AllocateTemplate.gap9L2AllocateTemplate + # deallocTemplate = FreeTemplate.gap9L2GlobalTemplate + + def _bufferRepresentation(self): + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + return {"type": self._type, "name": self.name, "size": self.size, "_memoryLevel": memoryLevel} + + +class GAP9ConstantBuffer(ConstantBuffer): + + initTemplate = AllocateTemplate.gap9GenericGlobalInitTemplate + allocTemplate = AllocateTemplate.gap9L2GlobalAllocateTemplate + deallocTemplate = FreeTemplate.gap9L2GlobalTemplate + + def _bufferRepresentation(self): + operatorRepresentation = super()._bufferRepresentation() + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + operatorRepresentation["_memoryLevel"] = memoryLevel + + return operatorRepresentation + + +class GAP9StructBuffer(StructBuffer): + + initTemplate = BasicAllocateTemplate.referenceStructInitTemplate + allocTemplate = BasicAllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +_includeList = ["pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h"] + + +class GAP9ClusterEngine(DeploymentEngine): + + def __init__(self, + name: str, + Mapping = GAP9Mapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: + super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores + + +class GAP9Platform(DeploymentPlatform): + + def __init__(self, + engines = 
[GAP9ClusterEngine("GAP9Cluster")], + variableBuffer = GAP9VariableBuffer, + constantBuffer = GAP9ConstantBuffer, + structBuffer = GAP9StructBuffer, + transientBuffer = GAP9TransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class MemoryGAP9Platform(MemoryPlatform): + + untiledOps = ["add"] + + def __init__(self, + memoryHierarchy: MemoryHierarchy, + defaultTargetMemoryLevel: MemoryLevel, + engines = [GAP9ClusterEngine("GAP9Cluster")], + variableBuffer = GAP9VariableBuffer, + constantBuffer = GAP9ConstantBuffer, + structBuffer = GAP9StructBuffer, + transientBuffer = GAP9TransientBuffer) -> None: + super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer, + structBuffer, transientBuffer) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + if node.op in self.untiledOps: + return ctxt.lookup(tensorName)._memoryLevel + return super().getTargetMemoryLevel(node, tensorName, ctxt) + + +class MemoryGAP9PlatformWrapper(MemoryPlatformWrapper): + + untiledOps = [] + + def __init__(self, platform: GAP9Platform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel): + assert isinstance(platform, GAP9Platform), \ + f"Given platform is not an instance of GAP9Platform. 
Platform type: {type(platform).__name__}" + super().__init__(platform, memoryHierarchy, defaultTargetMemoryLevel) + + def getTargetMemoryLevel(self, node: gs.Node, tensorName: str, ctxt: NetworkContext) -> str: + if node.op in self.untiledOps: + return ctxt.lookup(tensorName)._memoryLevel + return super().getTargetMemoryLevel(node, tensorName, ctxt) diff --git a/Deeploy/Targets/GAP9/Templates/AllocateTemplate.py b/Deeploy/Targets/GAP9/Templates/AllocateTemplate.py new file mode 100644 index 0000000000..4d4ddac7c1 --- /dev/null +++ b/Deeploy/Targets/GAP9/Templates/AllocateTemplate.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +gap9L2InitTemplate = NodeTemplate("${type.typeName} ${name};\n") + +gap9L1InitTemplate = NodeTemplate("${type.typeName} ${name};\n") + +gap9L2AllocateTemplate = NodeTemplate( + "${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n") + +gap9L1AllocateTemplate = NodeTemplate( + "${name} = (${type.typeName}) pi_l1_malloc((void *) 0, sizeof(${type.referencedType.typeName}) * ${size});\n") + +gap9L2GlobalInitTemplate = NodeTemplate( + "static PI_L2 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n") + +gap9L1GlobalInitTemplate = NodeTemplate( + "static PI_L1 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n") + +gap9L2GlobalAllocateTemplate = NodeTemplate("") + +gap9L1GlobalAllocateTemplate = NodeTemplate("") + +gap9L2StructInitTemplate = NodeTemplate("""static PI_L2 ${type.typeName} ${name}; +""") + +gap9L2StructAllocateTemplate = NodeTemplate(""" % for key, value in structDict.items(): + ${name}.${key} = ${value}; +% endfor """) + +gap9GenericStructInitTemplate = NodeTemplate(""" +% if _memoryLevel == "L1": +static PI_L1 ${type.typeName} ${name};\n +% elif _memoryLevel == "L2" or _memoryLevel is None: +static PI_L2 ${type.typeName} 
${name};\n +% elif _memoryLevel == "L3": +// ${name} is allocated in L3 \n +% endif +""") + +gap9GenericGlobalInitTemplate = NodeTemplate(""" +% if _memoryLevel == "L1": +static PI_L1 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n +% elif _memoryLevel == "L2" or _memoryLevel is None: +static PI_L2 ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n +% elif _memoryLevel == "L3": +// ${name} is allocated in L3 \n +static PI_L2 ${type.referencedType.typeName}* ${name}; +% endif +""") + +gap9GenericAllocate = NodeTemplate(""" +% if _memoryLevel == "L1": +${name} = (${type.typeName}) pi_l1_malloc((void *) 0, sizeof(${type.referencedType.typeName}) * ${size});\n +% elif _memoryLevel == "L2" or _memoryLevel is None: +${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% elif _memoryLevel == "L3": +${name} = (${type.typeName}) cl_ram_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% else: +//COMPILER BLOCK - MEMORYLEVEL ${_memoryLevel} NOT FOUND \n +${name} = (${type.typeName}) pi_l2_malloc(sizeof(${type.referencedType.typeName}) * ${size});\n +// ${name} with size ${size} allocated in L2! 
+% endif +""") diff --git a/Deeploy/Targets/GAP9/Templates/FreeTemplate.py b/Deeploy/Targets/GAP9/Templates/FreeTemplate.py new file mode 100644 index 0000000000..9604b5f951 --- /dev/null +++ b/Deeploy/Targets/GAP9/Templates/FreeTemplate.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +gap9L2LocalTemplate = NodeTemplate("pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});") +gap9L2GlobalTemplate = NodeTemplate("pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});") +gap9L1FreeTemplate = NodeTemplate("pi_l1_free(${name}, sizeof(${type.referencedType.typeName}) * ${size});\n") +gap9L1GlobalFreeTemplate = NodeTemplate("") + +gap9GenericFree = NodeTemplate(""" +% if _memoryLevel == "L1": +pi_l1_free(${name}, sizeof(${type.referencedType.typeName}) * ${size}); +% elif _memoryLevel == "L2" or _memoryLevel is None: +pi_l2_free(${name}, sizeof(${type.referencedType.typeName}) * ${size}); +% elif _memoryLevel == "L3": +cl_ram_free(${name}, sizeof(${type.referencedType.typeName}) * ${size}); +% else: +//COMPILER BLOCK - MEMORYLEVEL ${_memoryLevel} NOT FOUND \n +% endif +""") diff --git a/Deeploy/Targets/GAP9/Templates/__init__.py b/Deeploy/Targets/GAP9/Templates/__init__.py new file mode 100644 index 0000000000..9cd8ddcb2e --- /dev/null +++ b/Deeploy/Targets/GAP9/Templates/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from . import * diff --git a/Deeploy/Targets/GAP9/Tiler.py b/Deeploy/Targets/GAP9/Tiler.py new file mode 100644 index 0000000000..fefe12b6d7 --- /dev/null +++ b/Deeploy/Targets/GAP9/Tiler.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +GAP9-specific tiler bindings using ClDma instead of MchanDma. 
+ +This module creates GAP9-specific tiling ready bindings that use ClDma +instead of the low-level MCHAN API. +""" + +import copy + +from Deeploy.Targets.GAP9.Bindings import GAP9AddBindings, GAP9ConcatBindings, GAP9FloatConv2DBindings, \ + GAP9FloatDWConv2DBindings, GAP9FloatGELUBinding, GAP9FloatGEMMBindings, GAP9GatherBindings, \ + GAP9iHardswishBindings, GAP9iRMSNormBindings, GAP9iRQSGELUBindings, GAP9LayernormBinding, GAP9MatMulBindings, \ + GAP9MaxPool2DBindings, GAP9MulBindings, GAP9ReduceSumBindings, GAP9ReluBinding, GAP9ReshapeBindings, \ + GAP9RQAddBindings, GAP9RQSBindings, GAP9RQSConv2DBindings, GAP9RQSDWConv2DBindings, GAP9RQSGEMMBindings, \ + GAP9RQSiHardswishBindings, GAP9RQSMatrixVecBindings, GAP9RQSTallGEMMBindings, GAP9SGDBindings, \ + GAP9SoftmaxBindings, GAP9SoftmaxCrossEntropyLossBindings, GAP9SoftmaxCrossEntropyLossGradBindings, \ + GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings +from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint +from Deeploy.Targets.Generic.TileConstraints.MulTileConstraint import MulTileConstraint +from Deeploy.Targets.Generic.TileConstraints.NOPTileConstraint import NOPTileConstraint +from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint +from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint +from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import 
UntiledTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ + RQDWConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ + SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + +# GAP9-specific tiling ready bindings using ClDma +GAP9RQSConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSConv2DBindings, + tileConstraint = RQConv2DTileConstraint()) + +GAP9RQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSDWConv2DBindings, + tileConstraint = RQDWConv2DTileConstraint()) + +GAP9Conv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatConv2DBindings, + tileConstraint = Conv2DTileConstraint()) + +GAP9DWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatDWConv2DBindings, + tileConstraint = DWConv2DTileConstraint()) + +GAP9RQSGEMMTilingReadyBindings = 
TilingReadyNodeBindings(nodeBindings = GAP9RQSGEMMBindings, + tileConstraint = GEMMTileConstraint()) + +GAP9FPGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatGEMMBindings, + tileConstraint = FloatGEMMTileConstraint()) + +GAP9RQSMatrixVecTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSMatrixVecBindings, + tileConstraint = GEMMTileConstraint()) + +GAP9RQSTallGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSTallGEMMBindings, + tileConstraint = GEMMTileConstraint()) + +GAP9MatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9MatMulBindings, + tileConstraint = MatMulTileConstraint()) + +GAP9RQAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQAddBindings, + tileConstraint = AddTileConstraint()) + +GAP9iHardswishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9iHardswishBindings, + tileConstraint = iHardswishTileConstraint()) + +GAP9RQSiHardswishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSiHardswishBindings, + tileConstraint = RQSiHardswishTileConstraint()) + +_GAP9FlattenBindings = copy.deepcopy(GAP9ReshapeBindings) + +GAP9FlattenTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _GAP9FlattenBindings, + tileConstraint = NOPTileConstraint()) + +GAP9MaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9MaxPool2DBindings, + tileConstraint = MaxPoolCTileConstraint()) + +GAP9RQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9RQSBindings, + tileConstraint = RequantShiftTileConstraint()) + +GAP9UniformRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9UniformRQSBindings, + tileConstraint = UnaryTileConstraint()) + +GAP9TransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9TransposeBindings, + tileConstraint = TransposeTileConstraint()) + +GAP9AddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9AddBindings, + tileConstraint = 
AddTileConstraint()) + +GAP9SoftmaxTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SoftmaxBindings, + tileConstraint = iSoftmaxTileConstraint()) + +GAP9ConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9ConcatBindings, + tileConstraint = ConcatTileConstraint()) + +GAP9iRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9iRMSNormBindings, + tileConstraint = iRMSNormTileConstraint()) + +GAP9iRQSGELUTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9iRQSGELUBindings, + tileConstraint = RQSiGELUTileConstraint()) + +GAP9MulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9MulBindings, + tileConstraint = MulTileConstraint()) + +GAP9ReluTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9ReluBinding], + tileConstraint = UnaryTileConstraint()) + +GAP9LayernormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9LayernormBinding], + tileConstraint = LayernormTileConstraint()) + +GAP9FPGELUTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9FloatGELUBinding], + tileConstraint = UnaryTileConstraint()) + +GAP9GatherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9GatherBindings, + tileConstraint = GatherTileConstraint()) + +GAP9SoftmaxCrossEntropyTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9SoftmaxCrossEntropyLossBindings, tileConstraint = SoftmaxCrossEntropyTileConstraint()) + +GAP9SoftmaxCrossEntropyGradTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9SoftmaxCrossEntropyLossGradBindings, tileConstraint = SoftmaxCrossEntropyGradTileConstraint()) + +GAP9SoftmaxGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SoftmaxGradBindings, + tileConstraint = UntiledTileConstraint()) + +GAP9ReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9ReduceSumBindings, + tileConstraint = UntiledTileConstraint()) + +GAP9SGDTilingReadyBindings = 
TilingReadyNodeBindings(nodeBindings = GAP9SGDBindings, + tileConstraint = SGDTileConstraint()) diff --git a/Deeploy/Targets/GAP9/__init__.py b/Deeploy/Targets/GAP9/__init__.py new file mode 100644 index 0000000000..9cd8ddcb2e --- /dev/null +++ b/Deeploy/Targets/GAP9/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from . import * diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index bceea01f4d..0828cb392e 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -13,6 +13,7 @@ from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ PULPNCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer +from Deeploy.Targets.GAP9.Platform import GAP9ClusterEngine from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine @@ -69,7 +70,7 @@ def annotateNCores(self) -> None: node = layer.node engine = self._selectEngine(node) opRepr = layer.mapper.parser.operatorRepresentation - if isinstance(engine, PULPClusterEngine): + if isinstance(engine, (PULPClusterEngine, GAP9ClusterEngine)): opRepr["n_cores"] = engine.n_cores def bind(self) -> bool: diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py index 315481741e..0ce331a2b1 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py @@ -11,9 +11,9 @@ ${data_out}, ${weight}, ${bias}, - ${epsilon}, ${size}, - ${lastDimLength} + 
${lastDimLength}, + ${epsilon} ); """) diff --git a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py index 65c2285e24..64143a9dd6 100644 --- a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py @@ -93,10 +93,13 @@ def alignToContext(self, ctxt: NetworkContext, referenceTemplate = PULPTransposeTemplate(""" // Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) ${tileHeader} +// RW: GCC Segmentation fault +${data_in_type.referencedType.typeName} (*src)${shapeStr} = (${data_in_type.referencedType.typeName} (*)${shapeStr})<%text>${data_in}; +${data_in_type.referencedType.typeName} (*dst)${outShapeStr} = (${data_in_type.referencedType.typeName} (*)${outShapeStr})<%text>${data_out}; % for idx, i in enumerate(perm): ${forLoops[idx]} % endfor -((${data_in_type.referencedType.typeName} (*)${outShapeStr})<%text>${data_out})${outAccessStr} = ((${data_in_type.referencedType.typeName} (*)${shapeStr})<%text>${data_in})${accessStr}; +dst${outAccessStr} = src${accessStr}; % for idx, i in enumerate(perm): } % endfor diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index 0634b4ba0f..b7f3535790 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -50,5 +50,32 @@ elseif(DEEPLOY_ARCH STREQUAL SNITCH) add_subdirectory(Platforms/Snitch) elseif(DEEPLOY_ARCH STREQUAL CHIMERA) add_subdirectory(Platforms/Chimera) +elseif(platform STREQUAL GAP9) + # Search for hex files generated by Python code generator + # These files indicate L3 mode (external memory with readfs) + file(GLOB_RECURSE HEXLIST + "${GENERATED_SOURCE}/hex/*" + ) + + if (NOT HEXLIST) + # L2 mode: No flash/readfs files + # Data lives in L2 memory only + target_compile_options(network PUBLIC -DNOFLASH) + message(STATUS "[Deeploy GAP9] L2 mode: No hex files found, -DNOFLASH set") + message(STATUS "[Deeploy GAP9] If you expect L3 
mode, ensure Python codegen created hex files in ${GENERATED_SOURCE}/hex/") + else() + # L3 mode: Use flash with readfs files + # Data will be loaded from external flash via readfs + list(LENGTH HEXLIST HEXCOUNT) + message(STATUS "[Deeploy GAP9] L3 mode: Found ${HEXCOUNT} hex file(s)") + message(STATUS "==== HEXLIST ====") + foreach(f ${HEXLIST}) + message(STATUS " ${f}") + endforeach() + gvsoc_flags_add_files_to_flash(GAPY_RUNNER_ARGS HEXLIST) + message(STATUS "GAPY_RUNNER_ARGS: ${GAPY_RUNNER_ARGS}") + endif() + + add_subdirectory(Platforms/GAP9) endif() diff --git a/DeeployTest/Platforms/GAP9/CMakeLists.txt b/DeeployTest/Platforms/GAP9/CMakeLists.txt new file mode 100644 index 0000000000..0a7fde9c00 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/CMakeLists.txt @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +set(ProjectId ${TESTNAME}) + + + +set(${SDKCONFIG_FILE} ${CMAKE_CURRENT_LIST_DIR}/sdk.config) + +file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) +# add_executable(${ProjectId} ${SOURCES}) +target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) +target_compile_options(${ProjectId} INTERFACE network) +add_gvsoc_emulation(${ProjectId} "gap9.evk") + +# RW: Waive sign comparison warnings from pulp_nn_utils.h +target_compile_options(network PRIVATE + -Wno-sign-compare + -Wno-pointer-sign + -Wno-unknown-pragmas + -Wno-error + ) + +link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/GAP9/inc/CycleCounter.h b/DeeployTest/Platforms/GAP9/inc/CycleCounter.h new file mode 100644 index 0000000000..e4720af1f1 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/inc/CycleCounter.h @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: 
Apache-2.0 + */ + +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/GAP9/sdk.config b/DeeployTest/Platforms/GAP9/sdk.config new file mode 100644 index 0000000000..8d6fb6b178 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/sdk.config @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +CONFIG_BOARD_GAP9MOD_V1_0_B=y +CONFIG_BOARD_GAP9EVK_V1_3=y + +CONFIG_DRIVER_CLUSTER=y +CONFIG_DRIVER_CLUSTER_CONF_PROPERTY_ICACHE_CONF=0x7FF + +CONFIG_DRIVER_TYPE_FLASH=y +CONFIG_DRIVER_TYPE_RAM=y + +CONFIG_DRIVER_MRAM=y +CONFIG_READFS_FLASH_TYPE_OSPI=y + +CONFIG_DRIVER_READFS=y +# CONFIG_READFS_FLASH_TYPE_MRAM=y + +CONFIG_DRIVER_APS256XXN=y +CONFIG_DRIVER_RAM_API=y + +CONFIG_ENABLE_LIBMATH=y + +# OS float printf +CONFIG_IO_PRINTF_FLOAT_ENABLE=y +CONFIG_IO_PRINTF_FLOAT_EXPONENT_ENABLE=y + +CONFIG_PLATFORM_GVSOC=y +# CONFIG_DRIVER_CLUSTERDECOMPRESSOR=n + +# GAP9 cluster stack size configuration +# Uncomment and adjust these values if you need to modify stack sizes: +# CONFIG_CL_MASTER_CORE_STACK_SIZE=14000 +# CONFIG_CL_SLAVE_CORE_STACK_SIZE=1000 \ No newline at end of file diff --git a/DeeployTest/Platforms/GAP9/src/CycleCounter.c b/DeeployTest/Platforms/GAP9/src/CycleCounter.c new file mode 100644 index 0000000000..820ea05028 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/src/CycleCounter.c @@ -0,0 +1,19 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "CycleCounter.h" +#include "pmsis.h" + +void ResetTimer() { + pi_perf_conf(1 << PI_PERF_CYCLES); + pi_perf_reset(); +} + +void 
StartTimer() { pi_perf_start(); } + +void StopTimer() { pi_perf_stop(); } + +unsigned int getCycles() { return pi_perf_read(PI_PERF_CYCLES); } diff --git a/DeeployTest/Platforms/GAP9/src/deeploytest.c b/DeeployTest/Platforms/GAP9/src/deeploytest.c new file mode 100644 index 0000000000..f4818e0104 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/src/deeploytest.c @@ -0,0 +1,185 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "CycleCounter.h" +#include "Network.h" +#include "dory_mem.h" +#include "pmsis.h" +#include "testinputs.h" +#include "testoutputs.h" + +// RW: Remove MAINSTACKSIZE because gap9-sdk does not use it +#define SLAVESTACKSIZE 3800 + +struct pi_device cluster_dev; +uint32_t total_cycles = 0; + +typedef struct { + void *expected; + void *actual; + int num_elements; + int output_buf_index; + int *err_count; +} FloatCompareArgs; + +void CompareFloatOnCluster(void *args) { + + if (pi_core_id() == 0) { + FloatCompareArgs *compare_args = (FloatCompareArgs *)args; + float *expected = (float *)compare_args->expected; + float *actual = (float *)compare_args->actual; + int num_elements = compare_args->num_elements; + int output_buf_index = compare_args->output_buf_index; + int *err_count = compare_args->err_count; + + int local_err_count = 0; + + for (int i = 0; i < num_elements; i++) { + float expected_val = expected[i]; + float actual_val = actual[i]; + float diff = expected_val - actual_val; + + if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + local_err_count += 1; + + printf("Expected: %10.6f ", expected_val); + printf("Actual: %10.6f ", actual_val); + printf("Diff: %10.6f at Index %12u in Output %u\r\n", diff, i, + output_buf_index); + } + } + + *err_count = local_err_count; + } +} + +void CL_CompareFloat(void *arg) { + pi_cl_team_fork(NUM_CORES, CompareFloatOnCluster, arg); +} + +void InitNetworkWrapper(void *args) { + (void)args; + 
InitNetwork(pi_core_id(), pi_cl_cluster_nb_cores()); +} + +void RunNetworkWrapper(void *args) { + (void)args; + // Initialize performance counter in cluster context + ResetTimer(); + StartTimer(); + RunNetwork(pi_core_id(), pi_cl_cluster_nb_cores()); + total_cycles = getCycles(); + StopTimer(); +} + +int main(void) { +#ifndef CI + uint32_t core_id = pi_core_id(), cluster_id = pi_cluster_id(); + printf("[%d %d] Hello World!\n", cluster_id, core_id); +#endif + struct pi_cluster_conf conf; + + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return -1; + + mem_init(); +#ifndef NOFLASH + open_fs(); +#endif + + printf("Intializing\r\n"); + + struct pi_cluster_task cluster_task; + + pi_cluster_task(&cluster_task, InitNetworkWrapper, NULL); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + +#ifndef CI + printf("Initialized\r\n"); +#endif + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } + } + +#ifndef CI + printf("Input copied\r\n"); +#endif + + pi_cluster_task(&cluster_task, RunNetworkWrapper, NULL); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + +#ifndef CI + printf("Output:\r\n"); +#endif + + uint32_t tot_err, tot_tested; + tot_err = 0; + tot_tested = 0; + void *compbuf; + FloatCompareArgs float_compare_args; + uint32_t float_error_count = 0; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) { + compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]); + ram_read(compbuf, DeeployNetwork_outputs[buf], + DeeployNetwork_outputs_bytes[buf]); + } else { 
+ compbuf = DeeployNetwork_outputs[buf]; + } + + if (ISOUTPUTFLOAT) { + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = (int *)&float_error_count; + + pi_cluster_task(&cluster_task, CL_CompareFloat, &float_compare_args); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; + } else { + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + OUTPUTTYPE diff = expected - actual; + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } + } + } + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) { + pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]); + } + } + + printf("Runtime: %u cycles\r\n", total_cycles); + printf("Errors: %u out of %u \r\n", tot_err, tot_tested); + + return 0; +} \ No newline at end of file diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index 39c24ef6b8..c7077067d9 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -66,6 +66,8 @@ def pytest_configure(config: pytest.Config) -> None: config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)") config.addinivalue_line("markers", "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") + config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") + config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") config.addinivalue_line("markers", "kernels: mark 
test as a kernel test (individual operators)") config.addinivalue_line("markers", "models: mark test as a model test (full networks)") config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") diff --git a/DeeployTest/deeployRunner_gap9.py b/DeeployTest/deeployRunner_gap9.py new file mode 100644 index 0000000000..ace2d8eb25 --- /dev/null +++ b/DeeployTest/deeployRunner_gap9.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add GAP9-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "GAP9", + default_simulator = "gvsoc", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_tiled_gap9.py b/DeeployTest/deeployRunner_tiled_gap9.py new file mode 100644 index 0000000000..cde8e1e1d8 --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_gap9.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add GAP9-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "GAP9", + default_simulator = "gvsoc", + tiling_enabled = True, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/testUtils/core/config.py b/DeeployTest/testUtils/core/config.py index 0c545e1b73..e932c23962 100644 --- a/DeeployTest/testUtils/core/config.py +++ b/DeeployTest/testUtils/core/config.py @@ -19,6 
+19,7 @@ class DeeployTestConfig: build_dir: str toolchain: str = "LLVM" toolchain_install_dir: Optional[str] = None + gvsoc_install_dir: Optional[str] = None cmake_args: List[str] = None gen_args: List[str] = None verbose: int = 0 @@ -31,3 +32,5 @@ def __post_init__(self): self.gen_args = [] if self.toolchain_install_dir is None: self.toolchain_install_dir = os.environ.get('LLVM_INSTALL_DIR') + if self.gvsoc_install_dir is None: + self.gvsoc_install_dir = os.environ.get('GVSOC_INSTALL_DIR') diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 46ed86d303..59644adb2e 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -83,6 +83,10 @@ def configure_cmake(config: DeeployTestConfig) -> None: f"-B{config.build_dir}", ] + # Add GVSOC_INSTALL_DIR if available + if config.gvsoc_install_dir: + cmd.append(f"-DGVSOC_INSTALL_DIR={config.gvsoc_install_dir}") + for arg in config.cmake_args: if not arg.startswith("-D"): arg = "-D" + arg @@ -127,6 +131,10 @@ def build_binary(config: DeeployTestConfig) -> None: config.test_name, ] + # GAP9 requires the 'image' target to generate MRAM .bin files for GVSOC + if config.platform == 'GAP9': + cmd.append("image") + env = os.environ.copy() if config.verbose >= 3: env["VERBOSE"] = "1" diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 6e6f57e049..c17e94d3d7 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -245,6 +245,7 @@ def create_config_from_args(args: argparse.Namespace, build_dir = build_dir, toolchain = args.toolchain, toolchain_install_dir = args.toolchain_install_dir, + gvsoc_install_dir = getattr(args, 'gvsoc_install_dir', None), cmake_args = cmake_args_list, gen_args = gen_args_list, verbose = args.verbose, diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 48c5777905..9d526906f9 
100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -14,6 +14,8 @@ from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform from Deeploy.Targets.CortexM.Deployer import CMSISDeployer from Deeploy.Targets.CortexM.Platform import CMSISOptimizer, CMSISPlatform +from Deeploy.Targets.GAP9.Deployer import GAP9Deployer +from Deeploy.Targets.GAP9.Platform import GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper from Deeploy.Targets.Generic.Deployer import GenericDeployer from Deeploy.Targets.Generic.Platform import GenericOptimizer, GenericPlatform from Deeploy.Targets.MemPool.Deployer import MemPoolDeployer @@ -29,7 +31,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -59,6 +61,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa" or platformName == "PULPOpen": Platform = PULPPlatform() + elif platformName == "GAP9": + Platform = GAP9Platform() + elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() @@ -85,6 +90,8 @@ def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHie weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ if "WeightMemory_SRAM" in memoryHierarchy.memoryLevels else None return MemoryNeurekaPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel, weightMemoryLevel) + if isinstance(platform, GAP9Platform): + return MemoryGAP9PlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) else: return MemoryPlatformWrapper(platform, memoryHierarchy, 
defaultTargetMemoryLevel) @@ -200,6 +207,23 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, (GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper)): + + if loweringOptimizer is None: + loweringOptimizer = PULPOptimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = GAP9Deployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + elif isinstance(platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): if loweringOptimizer is None: diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 53a5c7b9b9..9578c2f26c 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -166,6 +166,12 @@ def __init__(self, tiling_arguments: bool, description = None): type = str, default = os.environ.get('LLVM_INSTALL_DIR'), help = 'Pick compiler install dir\n') + self.add_argument('--gvsoc_install_dir', + metavar = '', + dest = 'gvsoc_install_dir', + type = str, + default = os.environ.get('GVSOC_INSTALL_DIR'), + help = 'Pick gvsoc install dir\n') self.add_argument('--input-type-map', nargs = '*', default = [], @@ -312,7 +318,9 @@ def __init__(self, self._dir_gen_root = f'TEST_{platform.upper()}' assert self._args.toolchain_install_dir is not None, f"Environment variable LLVM_INSTALL_DIR is not set" + assert self._args.gvsoc_install_dir is not None, f"Environment variable GVSOC_INSTALL_DIR is not set" self._dir_toolchain = os.path.normpath(self._args.toolchain_install_dir) + self._dir_gvsoc = os.path.normpath(self._args.gvsoc_install_dir) self._dir_build = f"{self._dir_gen_root}/build" self._dir_gen, self._dir_test, self._name_test = getPaths(self._args.dir, self._dir_gen_root) @@ -373,10 +381,10 @@ def 
configure_cmake_project(self): else: self.cmake_args += " -D gvsoc_simulation=OFF" - command = f"$CMAKE -D TOOLCHAIN={self._args.toolchain} -D TOOLCHAIN_INSTALL_DIR={self._dir_toolchain} -D GENERATED_SOURCE={self._dir_gen} -D platform={self._platform} {self.cmake_args} -B {self._dir_build} -D TESTNAME={self._name_test} .." + command = f"$CMAKE -D TOOLCHAIN={self._args.toolchain} -D GVSOC_INSTALL_DIR={self._dir_gvsoc} -D TOOLCHAIN_INSTALL_DIR={self._dir_toolchain} -D GENERATED_SOURCE={self._dir_gen} -D platform={self._platform} {self.cmake_args} -B {self._dir_build} -D TESTNAME={self._name_test} .." if self._args.verbose >= 3: - command = "VERBOSE=1 " + command + command = "VERBOSE=1 " + command + " --log-level debug" log.debug(f"[TestRunner] Cmake Command: {command}") @@ -385,11 +393,15 @@ def configure_cmake_project(self): raise RuntimeError(f"Configuring cMake project failed on {self._dir_test}") def build_binary(self): - command = f"$CMAKE --build {self._dir_build} --target {self._name_test}" - + command = "$CMAKE" if self._args.verbose >= 3: command = "VERBOSE=1 " + command + if self._platform == 'GAP9': + command += f" --build {self._dir_build} --target {self._name_test} image" + else: + command += f" --build {self._dir_build} --target {self._name_test}" + log.debug(f"[TestRunner] Building Command: {command}") err = os.system(command) diff --git a/DeeployTest/test_gap9_config.py b/DeeployTest/test_gap9_config.py new file mode 100644 index 0000000000..2158287cf4 --- /dev/null +++ b/DeeployTest/test_gap9_config.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for GAP9 platform.""" + +DEFAULT_NUM_CORES = 8 + +KERNEL_TESTS = [ + "Kernels/Integer/Add/Regular", "Kernels/Integer/Add/MultIO", "Kernels/Integer/Pad/Regular_1D", + "Kernels/Integer/Pad/Regular_2D", "Kernels/Integer/MatMul/Regular", "Kernels/Integer/MatMul/Add", + "Kernels/Integer/Conv/DW_2D_RQ", 
"Kernels/Integer/Conv/Regular_2D_RQ", "Kernels/Integer/Softmax/Regular", + "Kernels/Integer/Concat", "Kernels/Integer/Hardswish/Regular", "Others/Backtracking", "Kernels/FP32/Add/Regular", + "Kernels/FP32/GEMM/Regular", "Kernels/FP32/Conv/Regular_2D_Bias", "Kernels/FP32/Conv/Regular_2D_NoBias", + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", "Kernels/FP32/Conv/DW_2D_Bias", "Kernels/FP32/Conv/DW_2D_NoBias", + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias", "Kernels/FP32/LayerNorm", "Kernels/FP32/ReLU", "Kernels/FP32/MaxPool", + "Kernels/FP32/MatMul", "Kernels/FP32/Softmax/Regular", "Kernels/FP32/Transpose", "Kernels/FP32/Mul", + "Kernels/Mixed/Dequant", "Kernels/Mixed/Quant", "Kernels/FP32/ReduceSum", "Kernels/FP32/Reshape/SkipConnection" +] + +MODEL_TESTS = [ + "Models/miniMobileNet", + "Models/miniMobileNetv2", + "Models/MLPerf/KeywordSpotting", + "Models/MLPerf/ImageClassification", + "Models/MLPerf/AnomalyDetection", +] diff --git a/DeeployTest/test_gap9_tiled_config.py b/DeeployTest/test_gap9_tiled_config.py new file mode 100644 index 0000000000..f9bac8dfd7 --- /dev/null +++ b/DeeployTest/test_gap9_tiled_config.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +PLATFORM_NAME = "GAP9" +SIMULATOR = "gvsoc" +DEFAULT_CORES = 8 +DEFAULT_L2 = 1024000 +DEFAULT_MEM_ALLOC_STRATEGY = "MiniMalloc" +DEFAULT_SEARCH_STRATEGY = "random-max" + +L2_SINGLEBUFFER_KERNELS = { + "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], + "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 4000], + "Kernels/Integer/Conv/StriddedPadded_2D_RQ": [600], + "Kernels/Integer/Conv/DW_2D_RQ": [2561], + "Kernels/Integer/Softmax/Regular": [800, 500, 300], + "Kernels/Integer/Concat": [32000, 16000, 8000], + "Kernels/Integer/Hardswish/Regular": [750], + "Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/Conv/Regular_2D_NoBias": [6600], + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600], + 
"Kernels/FP32/Conv/DW_2D_Bias": [7200], + "Kernels/FP32/Conv/DW_2D_NoBias": [7200], + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias": [7200], + "Kernels/FP32/LayerNorm": [2000], + "Kernels/FP32/MaxPool": [2000], + "Kernels/FP32/MatMul": [2000], + "Kernels/FP32/ReLU": [2000], + "Kernels/FP32/Reshape/SkipConnection": [1400], + "Kernels/FP32/Softmax/Regular": [4000], + "Kernels/FP32/Transpose": [2000], + "Kernels/FP32/Mul": [2000], + "Kernels/Integer/GEMM/Batch_RQ": [20000], + "Kernels/Integer/MatMul/Batch": [20000], +} + +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], + "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 5000], + "Kernels/Integer/Conv/DW_2D_RQ": [5121], + "Kernels/Integer/Softmax/Regular": [1600, 1000, 600], + "Kernels/Integer/Concat": [64000, 32000, 16000], + "Kernels/Integer/Hardswish/Regular": [750], + "Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/Conv/Regular_2D_NoBias": [8800], + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [8800], + "Kernels/FP32/Conv/DW_2D_Bias": [9800], + "Kernels/FP32/Conv/DW_2D_NoBias": [10000], + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias": [9800], + "Kernels/FP32/LayerNorm": [2000], + "Kernels/FP32/MaxPool": [5000], + "Kernels/FP32/MatMul": [5000], + "Kernels/FP32/ReLU": [20], + "Kernels/FP32/Reshape/SkipConnection": [2600], + "Kernels/FP32/Softmax/Regular": [8000], + "Kernels/FP32/Transpose": [2000], + "Kernels/FP32/Mul": [2000], +} + +L2_SINGLEBUFFER_MODELS = { + "Models/miniMobileNet": [60000, 12000, 6000, 3000], + "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], + "Models/MLPerf/KeywordSpotting": [64000], + "Models/MLPerf/ImageClassification": [64000], + "Models/MLPerf/AnomalyDetection": [64000], +} + +L2_DOUBLEBUFFER_MODELS = { + "Models/miniMobileNet": [60000, 24000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], + "Models/MLPerf/KeywordSpotting": [64000], + "Models/MLPerf/ImageClassification": [64000], + "Models/MLPerf/AnomalyDetection": [64000], 
+} + +L3_SINGLEBUFFER_MODELS = { + "Models/miniMobileNet": [60000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], + "Models/CCT/FP32/CCT_2_32_32_128": [128000], +} + +L3_DOUBLEBUFFER_MODELS = { + "Models/miniMobileNet": [60000, 24000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], +} diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 794ae6fe7e..577be29cb4 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -8,6 +8,16 @@ from test_chimera_config import MODEL_TESTS as CHIMERA_MODEL_TESTS from test_cortexm_config import KERNEL_TESTS as CORTEXM_KERNEL_TESTS from test_cortexm_config import MODEL_TESTS as CORTEXM_MODEL_TESTS +from test_gap9_config import DEFAULT_NUM_CORES as GAP9_DEFAULT_NUM_CORES +from test_gap9_config import KERNEL_TESTS as GAP9_KERNEL_TESTS +from test_gap9_config import MODEL_TESTS as GAP9_MODEL_TESTS +from test_gap9_tiled_config import DEFAULT_CORES as GAP9_TILED_DEFAULT_CORES +from test_gap9_tiled_config import L2_DOUBLEBUFFER_KERNELS as GAP9_L2_DOUBLEBUFFER_KERNELS +from test_gap9_tiled_config import L2_DOUBLEBUFFER_MODELS as GAP9_L2_DOUBLEBUFFER_MODELS +from test_gap9_tiled_config import L2_SINGLEBUFFER_KERNELS as GAP9_L2_SINGLEBUFFER_KERNELS +from test_gap9_tiled_config import L2_SINGLEBUFFER_MODELS as GAP9_L2_SINGLEBUFFER_MODELS +from test_gap9_tiled_config import L3_DOUBLEBUFFER_MODELS as GAP9_L3_DOUBLEBUFFER_MODELS +from test_gap9_tiled_config import L3_SINGLEBUFFER_MODELS as GAP9_L3_SINGLEBUFFER_MODELS from test_generic_config import KERNEL_TESTS as GENERIC_KERNEL_TESTS from test_generic_config import MODEL_TESTS as GENERIC_MODEL_TESTS from test_mempool_config import DEFAULT_NUM_THREADS as MEMPOOL_DEFAULT_NUM_THREADS @@ -100,6 +110,13 @@ def param_id(param): "model_tests": SNITCH_MODEL_TESTS, "default_num_cores": SNITCH_DEFAULT_NUM_CORES, }, + "gap9": { + "platform": "GAP9", + "simulator": "gvsoc", + "kernel_tests": GAP9_KERNEL_TESTS, 
+ "model_tests": GAP9_MODEL_TESTS, + "default_num_cores": GAP9_DEFAULT_NUM_CORES, + }, } ### Markers summary ### @@ -114,6 +131,8 @@ def param_id(param): # siracusa: tests from the Siracusa platform (untiled) # siracusa_tiled: tests from the Siracusa platform (tiled) # siracusa_neureka_tiled: tests from the Siracusa + Neureka platform (tiled) +# gap9: tests from the GAP9 platform (untiled) +# gap9_tiled: tests from the GAP9 platform (tiled) # Test type markers: # kernels: single kernel (or single layer) tests # models: full model (multiple layer) tests @@ -726,3 +745,245 @@ def test_siracusa_neureka_tiled_models_l3_doublebuffer_wmem(test_params, deeploy gen_args = ["--neureka-wmem"], ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9 +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", GAP9_KERNEL_TESTS, ids = GAP9_KERNEL_TESTS) +def test_gap9_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["gap9"] + + # Forward the platform's default core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={platform_config['default_num_cores']}"] + + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9 +@pytest.mark.models +@pytest.mark.parametrize("test_name", GAP9_MODEL_TESTS, ids = GAP9_MODEL_TESTS) +def test_gap9_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["gap9"] + + # Forward the platform's default core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={platform_config['default_num_cores']}"] + + config = create_test_config(
+ test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_gap9_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_gap9_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, +
cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L2_SINGLEBUFFER_MODELS, "L2-singlebuffer"), + ids = param_id, +) +def test_gap9_tiled_models_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L2_DOUBLEBUFFER_MODELS, "L2-doublebuffer"), + ids = param_id, +) +def test_gap9_tiled_models_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", +
double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L3_SINGLEBUFFER_MODELS, "L3-singlebuffer"), + ids = param_id, +) +def test_gap9_tiled_models_l3_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.gap9_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(GAP9_L3_DOUBLEBUFFER_MODELS, "L3-doublebuffer"), + ids = param_id, +) +def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + test_name, l1, config_name = test_params # config_name is not referenced again in this test body + + # Forward the tiled-run core count to the build as a NUM_CORES CMake argument + gap9_cmake_args = cmake_args + [f"NUM_CORES={GAP9_TILED_DEFAULT_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "GAP9", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = gap9_cmake_args, + tiling = True, + cores = GAP9_TILED_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git
a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt new file mode 100644 index 0000000000..ca4c3ffbeb --- /dev/null +++ b/TargetLibraries/GAP9/CMakeLists.txt @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +file(GLOB_RECURSE SOURCES + "src/**" +) + +# RW: Include PULPOpen sources but exclude dory_mem related files +file(GLOB_RECURSE PULPOPEN_SOURCES "../PULPOpen/src/**") +list(FILTER PULPOPEN_SOURCES EXCLUDE REGEX ".*dory_mem.*") +list(APPEND SOURCES ${PULPOPEN_SOURCES}) + +add_deeploy_library(deeploygap9 STATIC ${SOURCES}) +target_include_directories(deeploygap9 + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/inc +) + +target_compile_definitions(deeploygap9 PUBLIC + NUM_CORES=${NUM_CORES} + ) + +target_compile_options(deeploygap9 PRIVATE + -Wno-sign-conversion + -Wno-sign-compare + -Wno-type-limits + -Wno-attributes +) + +target_link_libraries(deeploygap9 PUBLIC pmsis) + +#RW: Link PULP-NN +#RW: Set PULP-NN version and bitwidth for pulp-nn-mixed +set(PULPNNVERSION XPULPV2) +set(PULPNNBITWIDTH 32) + +# Option to use prebuilt pulp-nn library (default: ON for faster builds) +option(USE_PREBUILT_PULPNN "Use prebuilt pulp-nn-mixed library with 8 cores" ON) + +# The prebuilt archive targets 8 cores; any other NUM_CORES forces a source build +if(USE_PREBUILT_PULPNN AND NOT NUM_CORES EQUAL 8) + message(WARNING "[Deeploy GAP9] Prebuilt pulp-nn library is compiled for 8 cores, but NUM_CORES=${NUM_CORES}.") + message(WARNING "[Deeploy GAP9] Disabling prebuilt library and building from source instead.") + set(USE_PREBUILT_PULPNN OFF) +endif() + +if(USE_PREBUILT_PULPNN) + # Use prebuilt pulp-nn library + set(PREBUILT_PULPNN_DIR "${CMAKE_CURRENT_LIST_DIR}/prebuilt") + + if(EXISTS "${PREBUILT_PULPNN_DIR}/libpulp-nn-mixed.a") + message(STATUS "[Deeploy GAP9] Using prebuilt pulp-nn-mixed library") + + # Create imported target for prebuilt library + add_library(pulp-nn-mixed
STATIC IMPORTED) + set_target_properties(pulp-nn-mixed PROPERTIES + IMPORTED_LOCATION "${PREBUILT_PULPNN_DIR}/libpulp-nn-mixed.a" + INTERFACE_INCLUDE_DIRECTORIES "${PREBUILT_PULPNN_DIR}/include" + ) + else() + message(WARNING "[Deeploy GAP9] Prebuilt pulp-nn library not found at ${PREBUILT_PULPNN_DIR}") + message(WARNING "[Deeploy GAP9] Falling back to building from source") + set(USE_PREBUILT_PULPNN OFF) + endif() +endif() + +if(NOT USE_PREBUILT_PULPNN) + # Build pulp-nn from source + message(STATUS "[Deeploy GAP9] Building pulp-nn-mixed from source") + + add_subdirectory(../third_party/pulp-nn-mixed ${CMAKE_CURRENT_BINARY_DIR}/pulp-nn-mixed) + + #RW: GCC not recognizing -Wno-typedef-redefinition defined in PULP-NN CMakelist + target_compile_options(pulp-nn-mixed PRIVATE -Wno-error) + target_compile_definitions(pulp-nn-mixed PUBLIC NUM_CORES=${NUM_CORES}) + target_link_libraries(pulp-nn-mixed PUBLIC pmsis) +endif() + +target_link_libraries(deeploygap9 PUBLIC pulp-nn-mixed) + +target_link_libraries(deeploygap9 PUBLIC m) + diff --git a/TargetLibraries/GAP9/inc/DeeployGAP9Math.h b/TargetLibraries/GAP9/inc/DeeployGAP9Math.h new file mode 100644 index 0000000000..0efa74c72e --- /dev/null +++ b/TargetLibraries/GAP9/inc/DeeployGAP9Math.h @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_HEADER_ +#define __DEEPLOY_MATH_HEADER_ + +#include +#include +#include +#include +#include + +#define BEGIN_SINGLE_CORE if (pi_core_id() == 8 || pi_core_id() == 0) { +#define END_SINGLE_CORE } +#define SINGLE_CORE if (pi_core_id() == 8 || pi_core_id() == 0) + +#include "DeeployBasicMath.h" + +#include "dory_dma.h" +#include "dory_mem.h" + +#include "pmsis.h" + +#endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/GAP9/inc/DeeployMchan.h b/TargetLibraries/GAP9/inc/DeeployMchan.h new file mode 100644 index 0000000000..0f1110af69 --- /dev/null +++
b/TargetLibraries/GAP9/inc/DeeployMchan.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _DEEPLOY_MCHAN_H +#define _DEEPLOY_MCHAN_H + +/* + * GAP9 MCHAN v7 configuration wrapper for Deeploy + * + * This header configures and includes mchan.h with proper GAP9-specific + * settings. Based on DORY's GAP9 DMA implementation. + */ + +#include "pmsis.h" + +// Define MCHAN base address if not already defined +#ifndef MCHAN_BASE_ADDR +#define MCHAN_BASE_ADDR (CLUSTER_PERIPHERALS_ADDR + CLUSTER_MCHAN_OFFSET) +#endif + +// Define MCHAN version (GAP9 uses v7) +#ifndef MCHAN_VERSION +#define MCHAN_VERSION 7 +#endif + +// Default to event-based sync (recommended for GAP9) if no mode was chosen +#if !defined(MCHAN_POLLED) && !defined(MCHAN_EVENT) +#define MCHAN_EVENT +#endif + +// Define event bit for cluster DMA +#ifdef MCHAN_EVENT +#ifndef MCHAN_EVENT_BIT +#define MCHAN_EVENT_BIT (CLUSTER_IRQ_DMA0) // Typically 8 +#endif +#endif + +// Now include the mchan.h header with all configurations set +#include "mchan.h" + +#endif // _DEEPLOY_MCHAN_H diff --git a/TargetLibraries/GAP9/inc/dory_dma.h b/TargetLibraries/GAP9/inc/dory_dma.h new file mode 100644 index 0000000000..99ab242558 --- /dev/null +++ b/TargetLibraries/GAP9/inc/dory_dma.h @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _DORY_DMA_H +#define _DORY_DMA_H + +typedef struct { + void *ext; + void *loc; + unsigned short hwc_to_chw; + unsigned short stride_2d; + unsigned short number_of_2d_copies; + unsigned short stride_1d; + unsigned short number_of_1d_copies; + unsigned int length_1d_copy; + unsigned int mchan_cmd; + int dir; // 0 l1->l2, 1 l2->l1 + int tid; +} DMA_copy; + +void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy); + +void dory_dma_memcpy_1d_async(DMA_copy *copy); + +void dory_dma_memcpy_2d_async(DMA_copy *copy); + +void dory_dma_memcpy_3d_async(DMA_copy
*copy); + +void dory_dma_memcpy_async(DMA_copy *copy); + +void dory_dma_memcpy_mindims_async(DMA_copy *copy); + +void dory_dma_free(DMA_copy *copy); + +void dory_dma_barrier(DMA_copy *copy); + +int dory_dma_allocate(void); +#endif diff --git a/TargetLibraries/GAP9/inc/dory_mem.h b/TargetLibraries/GAP9/inc/dory_mem.h new file mode 100644 index 0000000000..ccd36a37d6 --- /dev/null +++ b/TargetLibraries/GAP9/inc/dory_mem.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __MEM_H__ +#define __MEM_H__ + +#include <stddef.h> + +extern struct pi_device ram; + +void open_fs(void); +void mem_init(void); +struct pi_device *get_ram_ptr(void); +void *ram_malloc(size_t size); +void ram_free(void *ptr, size_t size); +void ram_read(void *dest, void *src, size_t size); +void ram_write(void *dest, void *src, size_t size); +void *cl_ram_malloc(size_t size); +void cl_ram_free(void *ptr, size_t size); +void cl_ram_read(void *dest, void *src, size_t size); +void cl_ram_write(void *dest, void *src, size_t size); +size_t load_file_to_ram(const void *dest, const char *filename); +size_t load_file_to_local(const void *dest, const char *filename); + +#endif // __MEM_H__ diff --git a/TargetLibraries/GAP9/inc/mchan.h b/TargetLibraries/GAP9/inc/mchan.h new file mode 100644 index 0000000000..085f20b2f2 --- /dev/null +++ b/TargetLibraries/GAP9/inc/mchan.h @@ -0,0 +1,140 @@ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _MCHAN_H +#define _MCHAN_H + +// Requires MCHAN_BASE_ADDR and one of MCHAN_EVENT / MCHAN_POLLED to be defined +#ifndef MCHAN_BASE_ADDR +#error "[mchan.h] MCHAN_BASE_ADDR not defined!" +#endif + +#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) +#error "[mchan.h] Neither MCHAN_EVENT nor MCHAN_POLLED defined!"
+#endif + +#if defined(MCHAN_EVENT) && !defined(MCHAN_EVENT_BIT) +#error \ + "[mchan.h] MCHAN_EVENT_BIT should be defined when using events as signalization!" +#endif + +#include "pmsis.h" + +#define MCHAN_CMD_OFFSET 0 +#define MCHAN_STATUS_OFFSET 4 + +#define MCHAN_CMD_ADDR (MCHAN_BASE_ADDR + MCHAN_CMD_OFFSET) +#define MCHAN_STATUS_ADDR (MCHAN_BASE_ADDR + MCHAN_STATUS_OFFSET) + +#define READ_REG(addr) (*(volatile int *)(addr)) +#define WRITE_REG(addr, value) \ + do { \ + *(volatile int *)(addr) = (int)(value); \ + } while (0) + +#define MCHAN_READ_CMD() READ_REG(MCHAN_CMD_ADDR) +#define MCHAN_WRITE_CMD(value) WRITE_REG(MCHAN_CMD_ADDR, value) + +#define MCHAN_READ_STATUS() READ_REG(MCHAN_STATUS_ADDR) +#define MCHAN_WRITE_STATUS(value) WRITE_REG(MCHAN_STATUS_ADDR, value) + +// MCHAN version 7 has 1 more bit for the transfer length, so all the flag +// offsets are shifted by 1. Also, LOC (TCDM) striding is not supported in v6. +#if MCHAN_VERSION == 7 +#define MCHAN_TRANSFER_LEN_SIZE (17) +#else +#define MCHAN_TRANSFER_LEN_SIZE (16) +#endif + +#define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_INCREMENTAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 1)) +#define MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 2)) +#define MCHAN_CMD_FLAG_EVENT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 3)) +#define MCHAN_CMD_FLAG_INTERRUPT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 4)) +#define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) +#if MCHAN_VERSION == 7 +#define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL \ + (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) // can only be used with MCHAN v7 +#endif +#define MCHAN_CMD_SHIFT_DIRECTION MCHAN_TRANSFER_LEN_SIZE + +#define MCHAN_CMD(len, dir, inc, loc_2d, ext_2d, int_en, event_en, broadcast) \ + (len | dir | inc | loc_2d | ext_2d | broadcast | int_en | event_en) + +typedef enum { +
MCHAN_DMA_TRANSFER_DIRECTION_EXT2LOC = MCHAN_CMD_FLAG_DIRECTION_EXT2LOC, + MCHAN_DMA_TRANSFER_DIRECTION_LOC2EXT = MCHAN_CMD_FLAG_DIRECTION_LOC2EXT +} mchan_dma_transfer_direction_e; + +typedef struct { + int cmd; + int size; + + void *loc; + int loc_size_1d; + int loc_stride_1d; + + void *ext; + int ext_size_1d; + int ext_stride_1d; +} mchan_transfer_t; + +static inline int mchan_transfer_get_id(void) { return MCHAN_READ_CMD(); } + +static inline void mchan_transfer_push_1d(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); +} + +static inline void mchan_transfer_push_2d(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); +// MCHAN version 7 takes 2D "count" (length of 1D transfers) and stride in 2 +// steps, v6 takes it in 1 step with the stride shifted to the upper 16 bits. +#if MCHAN_VERSION == 7 + MCHAN_WRITE_CMD(trans.ext_size_1d); + MCHAN_WRITE_CMD(trans.ext_stride_1d); +#else + MCHAN_WRITE_CMD(trans.ext_size_1d | (trans.ext_stride_1d << 16)); +#endif +} + +static inline void mchan_transfer_push(mchan_transfer_t trans) { + MCHAN_WRITE_CMD(trans.cmd); + MCHAN_WRITE_CMD(trans.loc); + MCHAN_WRITE_CMD(trans.ext); + + if (trans.ext_size_1d < trans.size) { + MCHAN_WRITE_CMD(trans.ext_size_1d); + MCHAN_WRITE_CMD(trans.ext_stride_1d); + } + + if (trans.loc_size_1d < trans.size) { + MCHAN_WRITE_CMD(trans.loc_size_1d); + MCHAN_WRITE_CMD(trans.loc_stride_1d); + } +} + +static inline void mchan_transfer_free(int tid) { MCHAN_WRITE_STATUS(1 << tid); } + +static inline int mchan_transfer_busy(int tid) { + return MCHAN_READ_STATUS() & (1 << tid); +} + +static inline void mchan_transfer_wait(int tid) { +#if defined(MCHAN_EVENT) + while (mchan_transfer_busy(tid)) + eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); +#elif defined(MCHAN_POLLED) + while (mchan_transfer_busy(tid)) + ; +#endif +} + +#endif diff --git a/TargetLibraries/GAP9/prebuilt/include/pulp_nn_kernels.h
b/TargetLibraries/GAP9/prebuilt/include/pulp_nn_kernels.h new file mode 100644 index 0000000000..f6fb952cd2 --- /dev/null +++ b/TargetLibraries/GAP9/prebuilt/include/pulp_nn_kernels.h @@ -0,0 +1,7265 @@ +/* + * pulp_nn_kernels.h + * Nazareno Bruschi + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_KERNELS__ +#define __PULPNN_KERNELS__ + +void pulp_nn_conv_u8_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, 
uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t 
dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t 
out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t 
flag_batchnorm); + +void pulp_nn_conv_i8_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t 
dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, 
uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, 
uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, 
uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, 
uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t 
flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, 
uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t 
*pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t 
flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u8_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, 
+ uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i8_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t 
*pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u4_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i4_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, 
+ uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t 
*pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_u2_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_conv_i2_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, 
int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t 
stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, 
uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, 
int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, 
uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void 
pulp_nn_pointwise_u8_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, 
uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, 
uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u8_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + 
uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i8_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t 
ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, 
+ int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t 
padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t 
ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i4_i4( + uint8_t 
*pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t 
padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t 
out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t 
flag_batchnorm); + +void pulp_nn_pointwise_u4_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u4_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i4_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, 
int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, 
uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t 
dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u8_i2( + int8_t *pIn, int8_t 
*pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t 
padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t 
out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t 
flag_batchnorm); + +void pulp_nn_pointwise_u2_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t 
dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, 
int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t 
stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_u2_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_pointwise_i2_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, uint16_t out_mul, + uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, uint16_t ch_in, + uint16_t dim_out_x, 
uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u8_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i8_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u8_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i8_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u8_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i8_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t 
*pulp_nn_matmul_i8_u8_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i8_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u8_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i8_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u8_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i8_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u4_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i4_i8(uint8_t 
*pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u4_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i4_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u4_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i4_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u4_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i4_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u4_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + 
uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i4_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u4_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i4_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u2_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i2_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u2_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i2_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + 
int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u2_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i2_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u2_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i2_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_u2_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_u8_i2_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_u2_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + uint8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, 
+ uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +uint8_t *pulp_nn_matmul_i8_i2_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pOut2, int8_t *pWeight, + int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t num_col_im2col, uint16_t ch_out, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u8_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i8_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u4_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i4_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i8_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i8_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i8_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i8_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i8_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i8_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i4_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i4_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i4_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i4_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i4_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i4_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i2_i8( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i2_i8( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i2_i4( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i2_i4( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_u2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_u2_i2_i2( + uint8_t *pIn, uint8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_u2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_depthwise_i2_i2_i2( + int8_t *pIn, int8_t *pIm2ColBuffer, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int8_t *pWtBuffer, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, uint16_t dim_in_x, uint16_t dim_in_y, + uint16_t ch_in, uint16_t dim_out_x, uint16_t dim_out_y, uint16_t ch_out, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, uint16_t padding_y_top, + uint16_t padding_y_bottom, uint16_t 
padding_x_left, + uint16_t padding_x_right, uint16_t stride_x, uint16_t stride_y, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i32_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i8_i32_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u8_i32_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i8_i32_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u8_i32_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i8_i32_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u4_i32_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i4_i32_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u4_i32_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i4_i32_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u4_i32_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i4_i32_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u2_i32_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i2_i32_i8(int8_t *pIn, int8_t 
*pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u2_i32_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i2_i32_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u2_i32_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_i2_i32_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, uint16_t dim_vec, + uint16_t num_o_neurons); + +void pulp_nn_linear_u8_u8_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i8_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u8_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i8_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u8_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i8_i4(uint8_t *pIn, int8_t 
*pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u8_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i8_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u8_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i8_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u8_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i8_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u4_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, 
uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i4_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u4_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i4_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u4_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i4_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u4_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i4_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u4_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t 
out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i4_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u4_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i4_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u2_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i2_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u2_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i2_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u2_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t 
*pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i2_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u2_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i2_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_u2_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u8_i2_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_u2_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i8_i2_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void 
pulp_nn_linear_u4_u8_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i8_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u8_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i8_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u8_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i8_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u8_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i8_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, 
uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u8_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i8_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u8_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i8_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u4_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i4_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u4_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i4_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u4_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i4_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u4_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i4_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u4_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i4_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u4_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void 
pulp_nn_linear_i4_i4_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u2_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i2_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u2_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i2_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u2_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i2_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u2_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, 
uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i2_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_u2_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u4_i2_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_u2_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i4_i2_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u8_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i8_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u8_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, 
int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i8_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u8_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i8_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u8_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i8_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u8_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i8_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u8_i2(int8_t 
*pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i8_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u4_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i4_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u4_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i4_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u4_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i4_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t 
flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u4_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i4_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u4_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i4_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u4_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i4_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u2_i8(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i2_i8(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, 
uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u2_i8(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i2_i8(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u2_i4(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i2_i4(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u2_i4(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i2_i4(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_u2_i2(uint8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_u2_i2_i2(uint8_t *pIn, int8_t *pBias, int8_t *pOut, + 
int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_u2_i2(int8_t *pIn, int8_t *pBias, uint8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_linear_i2_i2_i2(int8_t *pIn, int8_t *pBias, int8_t *pOut, + int8_t *pWeight, int32_t *pKappa, int32_t *pLambda, + uint16_t out_mul, uint16_t out_shift, + uint16_t dim_vec, uint16_t num_o_neurons, + uint8_t flag_relu, uint8_t flag_batchnorm); + +void pulp_nn_maxpool_u8(uint8_t *pIn, uint8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_maxpool_i8(int8_t *pIn, int8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_maxpool_u4(uint8_t *pIn, uint8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_maxpool_i4(int8_t *pIn, int8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t 
padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_maxpool_u2(uint8_t *pIn, uint8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_maxpool_i2(int8_t *pIn, int8_t *pOut, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in, + uint16_t dim_im_out_x, uint16_t dim_im_out_y, + uint16_t dim_kernel_x, uint16_t dim_kernel_y, + uint16_t padding_t, uint16_t padding_b, + uint16_t padding_l, uint16_t padding_r, + uint16_t stride_x, uint16_t stride_y); + +void pulp_nn_avgpool_u8_u8(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u8_i8(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i8_u8(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int 
flag_requant); + +void pulp_nn_avgpool_i8_i8(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u8_u4(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u8_i4(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i8_u4(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i8_i4(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t 
padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u8_u2(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u8_i2(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i8_u2(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i8_i2(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u4_u8(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + 
uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); +/* pulp_nn_avgpool_<in>_<out> prototypes: one shared parameter list; only the pIn/pOut pointer types vary (int8_t for an 'i' tag, uint8_t for a 'u' tag in the declarations here). NOTE(review): the digit in each tag is presumably the element bit width - confirm against the kernel sources. */ +void pulp_nn_avgpool_u4_i8(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i4_u8(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i4_i8(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u4_u4(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u4_i4(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t 
out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); +/* pulp_nn_avgpool_<in>_<out> prototypes: one shared parameter list; only the pIn/pOut pointer types vary (int8_t for an 'i' tag, uint8_t for a 'u' tag in the declarations here). NOTE(review): the digit in each tag is presumably the element bit width - confirm against the kernel sources. */ +void pulp_nn_avgpool_i4_u4(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i4_i4(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u4_u2(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u4_i2(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void 
pulp_nn_avgpool_i4_u2(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); +/* pulp_nn_avgpool_<in>_<out> prototypes: one shared parameter list; only the pIn/pOut pointer types vary (int8_t for an 'i' tag, uint8_t for a 'u' tag in the declarations here). NOTE(review): the digit in each tag is presumably the element bit width - confirm against the kernel sources. */ +void pulp_nn_avgpool_i4_i2(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u2_u8(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u2_i8(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i2_u8(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t 
padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); +/* pulp_nn_avgpool_<in>_<out> prototypes: one shared parameter list; only the pIn/pOut pointer types vary (int8_t for an 'i' tag, uint8_t for a 'u' tag in the declarations here). NOTE(review): the digit in each tag is presumably the element bit width - confirm against the kernel sources. */ +void pulp_nn_avgpool_i2_i8(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u2_u4(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u2_i4(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i2_u4(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i2_i4(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, 
uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); +/* pulp_nn_avgpool_<in>_<out> prototypes: one shared parameter list; only the pIn/pOut pointer types vary (int8_t for an 'i' tag, uint8_t for a 'u' tag in the declarations here). NOTE(review): the digit in each tag is presumably the element bit width - confirm against the kernel sources. */ +void pulp_nn_avgpool_u2_u2(uint8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_u2_i2(uint8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i2_u2(int8_t *pIn, uint8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_avgpool_i2_i2(int8_t *pIn, int8_t *pOut, int32_t lambda, + uint16_t out_shift, int32_t out_add, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, uint16_t dim_im_out_x, + uint16_t dim_im_out_y, uint16_t dim_kernel_x, + uint16_t dim_kernel_y, uint16_t padding_t, + uint16_t padding_b, uint16_t padding_l, + uint16_t padding_r, uint16_t stride_x, + uint16_t stride_y, int flag_requant); + +void pulp_nn_add_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t 
out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers for every tag combination declared here; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + 
uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t 
out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_i8_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t 
out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_i8_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t 
out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Two-tag add variant: besides the three pointers and the dimensions it takes only out_mult1, out_mult2 and out_shift - no add terms and no requant flag. */ +void pulp_nn_add_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int 
out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t 
ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_i8_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t 
dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototype: a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_i8_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Two-tag add variant: besides the three pointers and the dimensions it takes only out_mult1, out_mult2 and out_shift - no add terms and no requant flag. */ +void pulp_nn_add_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); + +void pulp_nn_add_u8_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_u2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t 
*pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u8_i2_i4(uint8_t *pIn1, uint8_t 
*pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_u8_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_u2_i4(uint8_t 
*pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); +/* Three-tag pulp_nn_add_* prototypes: one shared parameter list - a mult/add/shift triple for each input and for the output, the image dimensions, the channel count and out_requant_flag. NOTE(review): pIn1, pIn2 and pOut are uint8_t pointers even where a name tag starts with 'i'; confirm this matches the kernel definitions. */ +void pulp_nn_add_i8_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void 
pulp_nn_add_i8_i2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i8_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int 
out_requant_flag); + +void pulp_nn_add_u4_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t 
ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t 
dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); + +void pulp_nn_add_u4_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t 
*pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_i8(uint8_t *pIn1, uint8_t 
*pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_i8(uint8_t 
*pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void 
pulp_nn_add_i4_i4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); + +void pulp_nn_add_u4_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, 
+ int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t 
in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u4_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t 
in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t 
in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i4_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, 
+ int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_u2(uint8_t *pIn1, uint8_t *pIn2, 
uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_u2(uint8_t *pIn1, 
uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i8_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i8_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void 
pulp_nn_add_i2_i8_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i8_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i8_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i8_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int 
out_requant_flag); + +void pulp_nn_add_u2_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t 
ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, 
+ uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t 
dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i4_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + uint16_t out_mult1, uint16_t out_mult2, + uint16_t out_shift, uint16_t dim_im_in_x, + uint16_t dim_im_in_y, uint16_t ch_im_in); + +void pulp_nn_add_u2_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t 
*pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_u4(uint8_t *pIn1, uint8_t 
*pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_u2_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_u4(uint8_t 
*pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_u2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i2_u8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void 
pulp_nn_add_i2_i2_u4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i2_u2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i2_i8(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i2_i4(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +void pulp_nn_add_i2_i2_i2(uint8_t *pIn1, uint8_t *pIn2, uint8_t *pOut, + int32_t in_mult1, int32_t in_add1, uint16_t in_shift1, + int32_t in_mult2, int32_t in_add2, uint16_t in_shift2, + int32_t out_mult, int32_t out_add, uint16_t out_shift, + uint16_t dim_im_in_x, uint16_t dim_im_in_y, + uint16_t ch_im_in, int out_requant_flag); + +#endif \ No newline at end of file diff --git a/TargetLibraries/GAP9/prebuilt/include/pulp_nn_utils.h b/TargetLibraries/GAP9/prebuilt/include/pulp_nn_utils.h new file mode 100644 index 0000000000..0aaa2935f1 --- /dev/null +++ b/TargetLibraries/GAP9/prebuilt/include/pulp_nn_utils.h @@ -0,0 +1,1789 @@ +/* + * pulp_nn_utils.h + * 
Nazareno Bruschi + * Alessandro Nadalini + * Georg Rutishauser + * + * Copyright (C) 2019-2020 ETH Zurich & University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PULPNN_UTILS__ +#define __PULPNN_UTILS__ + +#include "pmsis.h" + +#define bitext(x, size, off) __builtin_pulp_bextract(x, size, off) +#define bitextu(x, size, off) __builtin_pulp_bextractu(x, size, off) +#ifdef __clang__ +#define bitins(dst, not_mask_imm, src, mask_imm, off) \ + __builtin_binsert(dst, not_mask_imm, src, mask_imm, off) +#else +#define bitins(dst, not_mask_imm, src, mask_imm, off) \ + __builtin_pulp_binsert(dst, not_mask_imm, src, mask_imm, off) +#endif +#define pack(x, y, z, t) __builtin_pulp_pack4(x, y, z, t) +#define max4(a, b) __builtin_pulp_maxu4(a, b) +#define maxs4(a, b) __builtin_pulp_max4(a, b) +#define max8(a, b) __builtin_pulp_maxu8(a, b) +#define maxs8(a, b) __builtin_pulp_max8(a, b) +#define max16(a, b) __builtin_pulp_maxu16(a, b) +#define maxs16(a, b) __builtin_pulp_max16(a, b) +#define max32(a, b) __builtin_pulp_maxusi(a, b) +#define maxs32(a, b) __builtin_pulp_maxsi(a, b) +#define min32(a, b) __builtin_pulp_minusi(a, b) +#define mins32(a, b) __builtin_pulp_minsi(a, b) +#define min4(a, b) __builtin_pulp_minu4(a, b) +#define mins4(a, b) __builtin_pulp_min4(a, b) +#define min8(a, b) __builtin_pulp_minu8(a, b) +#define mins8(a, b) __builtin_pulp_min8(a, b) +#define min16(a, b) __builtin_pulp_minu16(a, b) +#define mins16(a, b) 
__builtin_pulp_min16(a, b) +#define avg4(a, b) __builtin_pulp_avgu4(a, b) +#define avg8(a, b) __builtin_pulp_avgu8(a, b) +#define avg16(a, b) __builtin_pulp_avgu16(a, b) +#define log2(x) __builtin_pulp_fl1(x) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define SumDotp4(a, b, c) __builtin_pulp_sdotusp4(a, b, c) +#define SumDotp8(a, b, c) __builtin_pulp_sdotusp8(a, b, c) +#define SumDotp16(a, b, c) __builtin_pulp_sdotusp16(a, b, c) +#define SumDotps4(a, b, c) __builtin_pulp_sdotsp4(a, b, c) +#define SumDotps8(a, b, c) __builtin_pulp_sdotsp8(a, b, c) +#define SumDotps16(a, b, c) __builtin_pulp_sdotsp16(a, b, c) +#define clip4(x) __builtin_pulp_clipu_r(x, 15) +#define clip2(x) __builtin_pulp_clipu_r(x, 3) +#define clip8(x) __builtin_pulp_clipu_r(x, 255) + +#define clips4(x) __builtin_pulp_clip_r(x, 7) +#define clips2(x) __builtin_pulp_clip_r(x, 1) +#define clips8(x) __builtin_pulp_clip_r(x, 127) +#define MacLoadInit(a_update, b_update, a_reg, b_reg, ptr) \ + __builtin_pulp_mlinitspr_v3(a_update, b_update, a_reg, b_reg, ptr) +#define MacLoadUpdate(ptr) __builtin_pulp_mlupdatespr_v3(ptr) +#define MacLoadAssign(ptr) __builtin_pulp_mlassignspr_v3(ptr) +#define MacLoad4(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsup4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad8(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsup8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoad16(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsup16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads4(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsp4_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads8(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsp8_v3(a_update, b_update, a_reg, b_reg, ptr, sum) +#define MacLoads16(a_update, b_update, a_reg, b_reg, ptr, sum) \ + __builtin_pulp_mlsdotsp16_v3(a_update, b_update, a_reg, b_reg, ptr, sum) 
+ +#define PACK_INT8_SIZE(x) (x) +#define PACK_INT4_SIZE(x) ((x) >> 1) +#define PACK_INT2_SIZE(x) ((x) >> 2) + +#define MemoryFence() asm volatile("" ::: "memory") + +#define LEGACY_MODE(x) asm volatile("csrwi 0x010," x) +#define IVEC_FMT(x) asm volatile("csrwi 0x00D," x) +#define MIXED_SKIP(x) asm volatile("csrwi 0x00F," x) +#define A_ADDRESS(x) asm volatile("csrw 0x100, %0" ::"r"(x)) +#define W_ADDRESS(x) asm volatile("csrw 0x101, %0" ::"r"(x)) +#define A_STRIDE(x) asm volatile("csrw 0x102, %0" ::"r"(x)) +#define W_STRIDE(x) asm volatile("csrw 0x103, %0" ::"r"(x)) +#define A_ROLLBACK(x) asm volatile("csrw 0x104, %0" ::"r"(x)) +#define W_ROLLBACK(x) asm volatile("csrw 0x105, %0" ::"r"(x)) +#define A_SKIP(x) asm volatile("csrwi 0x106," x) +#define W_SKIP(x) asm volatile("csrwi 0x107," x) + +static uint8_t __attribute__((noinline)) +pulp_nn_quant_u2(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_bn_quant_u2(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip2(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_add_quant_u2(uint8_t pix1, uint8_t pix2, int16_t m1, int16_t m2, + int8_t d) { + uint32_t integer_image = pix1 * m1 + pix2 * m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip2(x); + return res; +} + +static int8_t __attribute__((noinline)) +pulp_nn_quant_i2(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_bn_quant_i2(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips2(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_add_quant_i2(int8_t pix1, int8_t pix2, int16_t m1, 
int16_t m2, + int8_t d) { + int32_t integer_image = pix1 * m1 + pix2 * m2; + int32_t x = (integer_image) >> d; + int8_t res = clips2(x); + return res; +} + +static uint8_t __attribute__((noinline)) +pulp_nn_quant_u4(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_bn_quant_u4(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip4(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_add_quant_u4(uint8_t pix1, uint8_t pix2, int16_t m1, int16_t m2, + int8_t d) { + uint32_t integer_image = pix1 * m1 + pix2 * m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip4(x); + return res; +} + +static int8_t __attribute__((noinline)) +pulp_nn_quant_i4(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_bn_quant_i4(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips4(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_add_quant_i4(int8_t pix1, int8_t pix2, int16_t m1, int16_t m2, + int8_t d) { + int32_t integer_image = pix1 * m1 + pix2 * m2; + int32_t x = (integer_image) >> d; + int8_t res = clips4(x); + return res; +} + +static uint8_t __attribute__((noinline)) +pulp_nn_quant_u8(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_bn_quant_u8(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + uint8_t res = clip8(x); + return res; +} +static uint8_t __attribute__((noinline)) +pulp_nn_add_quant_u8(uint8_t 
pix1, uint8_t pix2, int16_t m1, int16_t m2, + int8_t d) { + uint32_t integer_image = pix1 * m1 + pix2 * m2; + uint32_t x = (integer_image) >> d; + uint8_t res = clip8(x); + return res; +} + +static int8_t __attribute__((noinline)) +pulp_nn_quant_i8(int32_t phi, int16_t m, int8_t d) { + int32_t x = (m * phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_bn_quant_i8(int32_t phi, int32_t k, int32_t lambda, int8_t d) { + int32_t integer_image_phi = (k * phi) + lambda; + int32_t x = (integer_image_phi) >> d; + int8_t res = clips8(x); + return res; +} +static int8_t __attribute__((noinline)) +pulp_nn_add_quant_i8(int8_t pix1, int8_t pix2, int16_t m1, int16_t m2, + int8_t d) { + int32_t integer_image = pix1 * m1 + pix2 * m2; + int32_t x = (integer_image) >> d; + int8_t res = clips8(x); + return res; +} + +static uint8_t __attribute__((noinline)) +pulp_nn_u4_quant(int input, int16_t *pThr) { + if (input <= pThr[7]) { + if (input <= pThr[3]) { + if (input <= pThr[1]) { + if (input <= pThr[0]) + return 0; + else + return 1; + } else { + if (input <= pThr[2]) + return 2; + else + return 3; + } + } else { + if (input <= pThr[5]) { + if (input <= pThr[4]) + return 4; + else + return 5; + } else { + if (input <= pThr[6]) + return 6; + else + return 7; + } + } + } else { + if (input <= pThr[11]) { + if (input <= pThr[9]) { + if (input <= pThr[8]) + return 8; + else + return 9; + } else { + if (input <= pThr[10]) + return 10; + else + return 11; + } + } else { + if (input <= pThr[13]) { + if (input <= pThr[12]) + return 12; + else + return 13; + } else { + if (input <= pThr[14]) + return 14; + else + return 15; + } + } + } +} + +static uint8_t __attribute__((noinline)) +pulp_nn_u2_quant(int input, int16_t *pThr) { + if (input <= pThr[1]) { + if (input <= pThr[0]) { + return 0; + } else { + return 1; + } + } else { + if (input <= pThr[2]) { + return 2; + } else { + return 3; + } + } +} + +/* + * Common + */ + +static v4s 
__attribute__((noinline)) pulp_nn_i4_to_i8_r(int8_t *pSrc) { + v4s Src = *((v4s *)pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t)bitext((int)Src, 4, 0); + bext2 = (int8_t)bitext((int)Src, 4, 4); + bext3 = (int8_t)bitext((int)Src, 4, 8); + bext4 = (int8_t)bitext((int)Src, 4, 12); + v4s res = pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u4_to_u8_r(uint8_t *pSrc) { + v4u Src = *((v4u *)pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t)bitextu((unsigned int)Src, 4, 0); + bext2 = (uint8_t)bitextu((unsigned int)Src, 4, 4); + bext3 = (uint8_t)bitextu((unsigned int)Src, 4, 8); + bext4 = (uint8_t)bitextu((unsigned int)Src, 4, 12); + v4u res = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i8_r(int8_t *pSrc) { + v4s Src = *((v4s *)pSrc); + int8_t bext1, bext2, bext3, bext4; + + bext1 = (int8_t)bitext((int)Src, 2, 0); + bext2 = (int8_t)bitext((int)Src, 2, 2); + bext3 = (int8_t)bitext((int)Src, 2, 4); + bext4 = (int8_t)bitext((int)Src, 2, 6); + v4s res = pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u8_r(uint8_t *pSrc) { + v4u Src = *((v4u *)pSrc); + uint8_t bext1, bext2, bext3, bext4; + + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 0); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 2); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 4); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 6); + v4u res = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + + return res; +} + +static v4s __attribute__((noinline)) pulp_nn_i2_to_i4_r(int8_t *pSrc) { + int8_t mask = 0xf0; + int8_t n_mask = ~mask; + int8_t off = 0x04; + + v4s Src = *((v4s *)pSrc); + int8_t bext1, bext2, bext3, bext4; + int8_t out1, out2, out3, out4; + + bext1 = 
(int8_t)bitextu((int)Src, 2, 0); + bext2 = (int8_t)bitextu((int)Src, 2, 2); + bext3 = (int8_t)bitextu((int)Src, 2, 4); + bext4 = (int8_t)bitextu((int)Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (int8_t)bitextu((int)Src, 2, 8); + bext2 = (int8_t)bitextu((int)Src, 2, 10); + bext3 = (int8_t)bitextu((int)Src, 2, 12); + bext4 = (int8_t)bitextu((int)Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4s res = pack((int8_t)out1, (int8_t)out2, (int8_t)out3, (int8_t)out4); + + return res; +} + +static v4u __attribute__((noinline)) pulp_nn_u2_to_u4_r(uint8_t *pSrc) { + int8_t mask = 0xf0; + int8_t n_mask = ~mask; + int8_t off = 0x04; + + v4u Src = *((v4u *)pSrc); + uint8_t bext1, bext2, bext3, bext4; + uint8_t out1, out2, out3, out4; + + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 0); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 2); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 4); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 6); + + out1 = bitins(bext1, n_mask, bext2, mask, off); + out2 = bitins(bext3, n_mask, bext4, mask, off); + + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 8); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 10); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 12); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 14); + + out3 = bitins(bext1, n_mask, bext2, mask, off); + out4 = bitins(bext3, n_mask, bext4, mask, off); + + v4u res = pack((uint8_t)out1, (uint8_t)out2, (uint8_t)out3, (uint8_t)out4); + + return res; +} + +static int8_t *__attribute__((always_inline)) +pulp_nn_i4_to_i8(int8_t *pSrc, int8_t *pDst) { + v4s Src = *((v4s *)pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc += 4; + + bext1 = (int8_t)bitext((int)Src, 4, 0); + bext2 = (int8_t)bitext((int)Src, 4, 4); + bext3 = (int8_t)bitext((int)Src, 4, 8); + bext4 = (int8_t)bitext((int)Src, 4, 12); + *((v4s *)pDst) = + 
pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (int8_t)bitext((int)Src, 4, 16); + bext2 = (int8_t)bitext((int)Src, 4, 20); + bext3 = (int8_t)bitext((int)Src, 4, 24); + bext4 = (int8_t)bitext((int)Src, 4, 28); + *((v4s *)pDst) = + pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) +pulp_nn_u4_to_u8(uint8_t *pSrc, uint8_t *pDst) { + v4u Src = *((v4u *)pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc += 4; + + bext1 = (uint8_t)bitextu((unsigned int)Src, 4, 0); + bext2 = (uint8_t)bitextu((unsigned int)Src, 4, 4); + bext3 = (uint8_t)bitextu((unsigned int)Src, 4, 8); + bext4 = (uint8_t)bitextu((unsigned int)Src, 4, 12); + *((v4u *)pDst) = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (uint8_t)bitextu((unsigned int)Src, 4, 16); + bext2 = (uint8_t)bitextu((unsigned int)Src, 4, 20); + bext3 = (uint8_t)bitextu((unsigned int)Src, 4, 24); + bext4 = (uint8_t)bitextu((unsigned int)Src, 4, 28); + *((v4u *)pDst) = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + + return pSrc; +} + +static int8_t *__attribute__((always_inline)) +pulp_nn_i2_to_i8(int8_t *pSrc, int8_t *pDst) { + v4s Src = *((v4s *)pSrc); + int8_t bext1, bext2, bext3, bext4; + + pSrc += 4; + + bext1 = (int8_t)bitext((int)Src, 2, 0); + bext2 = (int8_t)bitext((int)Src, 2, 2); + bext3 = (int8_t)bitext((int)Src, 2, 4); + bext4 = (int8_t)bitext((int)Src, 2, 6); + *((v4s *)pDst) = + pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (int8_t)bitext((int)Src, 2, 8); + bext2 = (int8_t)bitext((int)Src, 2, 10); + bext3 = (int8_t)bitext((int)Src, 2, 12); + bext4 = (int8_t)bitext((int)Src, 2, 14); + *((v4s *)pDst) = + pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = 
(int8_t)bitext((int)Src, 2, 16); + bext2 = (int8_t)bitext((int)Src, 2, 18); + bext3 = (int8_t)bitext((int)Src, 2, 20); + bext4 = (int8_t)bitext((int)Src, 2, 22); + *((v4s *)pDst) = + pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (int8_t)bitext((int)Src, 2, 24); + bext2 = (int8_t)bitext((int)Src, 2, 26); + bext3 = (int8_t)bitext((int)Src, 2, 28); + bext4 = (int8_t)bitext((int)Src, 2, 30); + *((v4s *)pDst) = + pack((int8_t)bext1, (int8_t)bext2, (int8_t)bext3, (int8_t)bext4); + + return pSrc; +} + +static uint8_t *__attribute__((always_inline)) +pulp_nn_u2_to_u8(uint8_t *pSrc, uint8_t *pDst) { + v4u Src = *((v4u *)pSrc); + uint8_t bext1, bext2, bext3, bext4; + + pSrc += 4; + + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 0); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 2); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 4); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 6); + *((v4u *)pDst) = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 8); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 10); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 12); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 14); + *((v4u *)pDst) = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 16); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 18); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 20); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 22); + *((v4u *)pDst) = + pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4); + MemoryFence(); + pDst += 4; + bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 24); + bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 26); + bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 28); + bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 30); + *((v4u *)pDst) = + 
pack((uint8_t)bext1, (uint8_t)bext2, (uint8_t)bext3, (uint8_t)bext4);
+
+  return pSrc;
+}
+
+/*
+ * Signed 2-bit -> 4-bit widening of one 32-bit source word.
+ *
+ * Each pair of neighbouring 2-bit fields (read with bitext()) is merged
+ * into one output byte by bitins(lo, n_mask, hi, mask, off) with
+ * mask = 0xf0 and off = 4 -- presumably "insert hi into the upper nibble of
+ * lo"; confirm against the bitins definition. Sixteen input fields yield
+ * eight output bytes, stored as two pack()ed vectors at pDst and pDst + 4.
+ *
+ * Returns pSrc advanced by the four bytes consumed.
+ */
+static int8_t *__attribute__((always_inline))
+pulp_nn_i2_to_i4(int8_t *pSrc, int8_t *pDst) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  v4s Src = *((v4s *)pSrc);
+  int8_t bext1, bext2, bext3, bext4;
+  int8_t out1, out2, out3, out4;
+
+  pSrc += 4;
+
+  bext1 = (int8_t)bitext((int)Src, 2, 0);
+  bext2 = (int8_t)bitext((int)Src, 2, 2);
+  bext3 = (int8_t)bitext((int)Src, 2, 4);
+  bext4 = (int8_t)bitext((int)Src, 2, 6);
+
+  out1 = bitins(bext1, n_mask, bext2, mask, off);
+  out2 = bitins(bext3, n_mask, bext4, mask, off);
+  MemoryFence();
+
+  bext1 = (int8_t)bitext((int)Src, 2, 8);
+  bext2 = (int8_t)bitext((int)Src, 2, 10);
+  bext3 = (int8_t)bitext((int)Src, 2, 12);
+  bext4 = (int8_t)bitext((int)Src, 2, 14);
+
+  out3 = bitins(bext1, n_mask, bext2, mask, off);
+  out4 = bitins(bext3, n_mask, bext4, mask, off);
+
+  // First output vector: fields 0..7 of the source word.
+  *((v4s *)pDst) = pack((int8_t)out1, (int8_t)out2, (int8_t)out3, (int8_t)out4);
+  MemoryFence();
+
+  pDst += 4;
+  bext1 = (int8_t)bitext((int)Src, 2, 16);
+  bext2 = (int8_t)bitext((int)Src, 2, 18);
+  bext3 = (int8_t)bitext((int)Src, 2, 20);
+  bext4 = (int8_t)bitext((int)Src, 2, 22);
+
+  out1 = bitins(bext1, n_mask, bext2, mask, off);
+  out2 = bitins(bext3, n_mask, bext4, mask, off);
+  MemoryFence();
+
+  bext1 = (int8_t)bitext((int)Src, 2, 24);
+  bext2 = (int8_t)bitext((int)Src, 2, 26);
+  bext3 = (int8_t)bitext((int)Src, 2, 28);
+  bext4 = (int8_t)bitext((int)Src, 2, 30);
+
+  out3 = bitins(bext1, n_mask, bext2, mask, off);
+  out4 = bitins(bext3, n_mask, bext4, mask, off);
+
+  // Second output vector: fields 8..15 of the source word.
+  *((v4s *)pDst) = pack((int8_t)out1, (int8_t)out2, (int8_t)out3, (int8_t)out4);
+
+  return pSrc;
+}
+
+/*
+ * Unsigned counterpart of the 2-bit -> 4-bit widening: same structure, but
+ * fields are read with bitextu() and stored through v4u.
+ */
+static uint8_t *__attribute__((always_inline))
+pulp_nn_u2_to_u4(uint8_t *pSrc, uint8_t *pDst) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  v4u Src = *((v4u *)pSrc);
+  uint8_t bext1, bext2, bext3, bext4;
+  uint8_t out1, out2, out3, out4;
+
+  pSrc
+= 4;
+
+  bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 0);
+  bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 2);
+  bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 4);
+  bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 6);
+
+  out1 = bitins(bext1, n_mask, bext2, mask, off);
+  out2 = bitins(bext3, n_mask, bext4, mask, off);
+  MemoryFence();
+
+  bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 8);
+  bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 10);
+  bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 12);
+  bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 14);
+
+  out3 = bitins(bext1, n_mask, bext2, mask, off);
+  out4 = bitins(bext3, n_mask, bext4, mask, off);
+
+  *((v4u *)pDst) =
+      pack((uint8_t)out1, (uint8_t)out2, (uint8_t)out3, (uint8_t)out4);
+  MemoryFence();
+
+  pDst += 4;
+  bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 16);
+  bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 18);
+  bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 20);
+  bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 22);
+
+  out1 = bitins(bext1, n_mask, bext2, mask, off);
+  out2 = bitins(bext3, n_mask, bext4, mask, off);
+  MemoryFence();
+
+  bext1 = (uint8_t)bitextu((unsigned int)Src, 2, 24);
+  bext2 = (uint8_t)bitextu((unsigned int)Src, 2, 26);
+  bext3 = (uint8_t)bitextu((unsigned int)Src, 2, 28);
+  bext4 = (uint8_t)bitextu((unsigned int)Src, 2, 30);
+
+  out3 = bitins(bext1, n_mask, bext2, mask, off);
+  out4 = bitins(bext3, n_mask, bext4, mask, off);
+
+  *((v4u *)pDst) =
+      pack((uint8_t)out1, (uint8_t)out2, (uint8_t)out3, (uint8_t)out4);
+
+  return pSrc;
+}
+
+/*
+ * XpulpV2
+ */
+
+/*
+ * Zero `size` bytes starting at pBuffer.
+ *
+ * The bulk is cleared four bytes at a time through a v4u store (pBuffer is
+ * dereferenced as v4u, so it is presumably expected to be 4-byte aligned --
+ * confirm with callers); the remaining size & 3 bytes are cleared one by
+ * one.
+ */
+static void __attribute__((noinline))
+pulp_zero_mem(uint8_t *pBuffer, unsigned int size) {
+  int lfover = size & 0x3;
+  for (int i = 0; i < (size >> 2); i++) {
+    *((v4u *)pBuffer) = (v4u){0, 0, 0, 0};
+    MemoryFence();
+    pBuffer += 4;
+  }
+  while (lfover) {
+    *pBuffer++ = 0;
+    lfover--;
+  }
+}
+
+/*
+ * im2col copy that widens unsigned 2-bit elements to bytes. blockSize is
+ * counted in elements: blockSize >> 4 full 16-element words go through
+ * pulp_nn_u2_to_u8(), the remaining blockSize & 0x0f elements are handled
+ * one input byte (four elements) at a time.
+ */
+static void __attribute__((noinline))
+pulp_nn_im2col_u2_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                        unsigned int blockSize) {
+
unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_u2_to_u8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 16;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 4 per pass, so a remainder
+  // that is not a multiple of 4 would step past zero and never terminate.
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 0);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 2);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 4);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 6);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+/*
+ * im2col copy that widens signed 2-bit elements to bytes. blockSize is
+ * counted in elements: full groups of 16 go through pulp_nn_i2_to_i8(), the
+ * remaining blockSize & 0x0f elements are expanded one input byte (four
+ * elements) per pass. The tail always writes whole groups of four bytes.
+ */
+static void __attribute__((noinline))
+pulp_nn_im2col_i2_to_i8(int8_t *pInput, int8_t *pOutput,
+                        unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_i2_to_i8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 16;
+  }
+  // Signed comparison so a remainder that is not a multiple of 4 still ends.
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 0);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 2);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 4);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 6);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+/*
+ * im2col copy that widens unsigned 4-bit elements to bytes. Full groups of
+ * 8 elements go through pulp_nn_u4_to_u8(); the remaining blockSize & 0x07
+ * elements are expanded one input byte (two elements) per pass.
+ */
+static void __attribute__((noinline))
+pulp_nn_im2col_u4_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                        unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize & 0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_u4_to_u8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // Signed comparison so an odd remainder still terminates (step is 2).
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 4, 0);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 4, 4);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+static void __attribute__((noinline))
+pulp_nn_im2col_i4_to_i8(int8_t *pInput, int8_t *pOutput,
+                        unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize &
0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_i4_to_i8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 2 per pass, so an odd
+  // remainder would step past zero and never terminate.
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 4, 0);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 4, 4);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+/*
+ * Plain byte copy of blockSize unsigned 8-bit elements: whole 4-byte words
+ * first, then the remaining blockSize & 3 bytes one at a time.
+ */
+static void __attribute__((noinline))
+pulp_nn_im2col_u8_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                        unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 2u;
+  int lfover = blockSize & 0x03;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4u *)pOutput) = *((v4u *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  while (lfover) {
+    *((uint8_t *)pOutput) = *((uint8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover--;
+  }
+}
+/* Signed twin of the 8-bit copy above; identical structure through v4s. */
+static void __attribute__((noinline))
+pulp_nn_im2col_i8_to_i8(int8_t *pInput, int8_t *pOutput,
+                        unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 2u;
+  int lfover = blockSize & 0x03;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4s *)pOutput) = *((v4s *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  while (lfover) {
+    *((int8_t *)pOutput) = *((int8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover--;
+  }
+}
+
+/*
+ * Copy blockSize packed 2-bit elements without changing their width.
+ * blockSize is counted in elements: sixteen per 4-byte word in the main
+ * loop, four per byte in the tail.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u2_to_u2(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4u *)pOutput) = *((v4u *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  // Signed comparison: a remainder that is not a multiple of 4 must still
+  // end the loop (the last, partially used byte is copied whole).
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = *((uint8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+static void __attribute__((noinline))
+xpulp_nn_im2col_i2_to_i2(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4s *)pOutput) = *((v4s *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  // Counter drops by 4 per pass; `> 0` keeps the loop finite for any size.
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = *((int8_t
*)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+/*
+ * im2col copy that widens unsigned 2-bit elements to 4-bit elements.
+ * blockSize is counted in elements: full groups of 16 (four input bytes ->
+ * eight output bytes) go through pulp_nn_u2_to_u4(); the remainder is
+ * handled in groups of 8 by pulp_nn_u2_to_u4_r(), which is defined outside
+ * this chunk.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u2_to_u4(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_u2_to_u4(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // The counter drops by 8 per pass. Testing `> 0` instead of `!= 0` keeps
+  // the loop finite when the remainder is not a multiple of 8 (it would
+  // otherwise go negative and run on).
+  while (lfover > 0) {
+    // TODO: this is potentially dangerous/wrong if the number of channels
+    // is not a multiple of 8! (A full 8-element group is still read and
+    // written for a partial remainder.)
+    *((v4u *)pOutput) = pulp_nn_u2_to_u4_r(pInput);
+    pInput += 2;
+    pOutput += 4;
+    lfover -= 8;
+  }
+}
+static void __attribute__((noinline))
+xpulp_nn_im2col_i2_to_i4(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_i2_to_i4(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // Signed comparison so a remainder that is not a multiple of 8 still ends.
+  while (lfover > 0) {
+    // TODO: this is potentially dangerous/wrong if the number of channels
+    // is not a multiple of 8!
+    *((v4s *)pOutput) = pulp_nn_i2_to_i4_r(pInput);
+    pInput += 2;
+    pOutput += 4;
+    lfover -= 8;
+  }
+}
+/*
+ * im2col copy that widens unsigned 2-bit elements to bytes. Full groups of
+ * 16 elements go through pulp_nn_u2_to_u8(); the remaining
+ * blockSize & 0x0f elements are expanded one input byte (four elements) per
+ * pass.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u2_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_u2_to_u8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 16;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 4 per pass, so a remainder
+  // that is not a multiple of 4 would step past zero and never terminate.
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 0);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 2);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 4);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 2, 6);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+/* Signed twin of the 2-bit -> 8-bit im2col copy (bitext / v4s). */
+static void __attribute__((noinline))
+xpulp_nn_im2col_i2_to_i8(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 4u;
+  int lfover = blockSize & 0x0f;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_i2_to_i8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 16;
+  }
+  // Signed comparison so a remainder that is not a multiple of 4 still ends.
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 0);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 2);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 4);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 2, 6);
+    pOutput++;
+    pInput++;
+    lfover -= 4;
+  }
+}
+/*
+ * Copy blockSize packed 4-bit elements without changing their width: eight
+ * elements per 4-byte word in the main loop, two per byte in the tail.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u4_to_u4(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize & 0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4u *)pOutput) = *((v4u *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  // Signed comparison so an odd remainder still terminates (step is 2).
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = *((uint8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+static void
__attribute__((noinline))
+xpulp_nn_im2col_i4_to_i4(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize & 0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4s *)pOutput) = *((v4s *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 2 per pass, so an odd
+  // remainder would step past zero and never terminate.
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = *((int8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+/*
+ * im2col copy that widens unsigned 4-bit elements to bytes. Full groups of
+ * 8 elements go through pulp_nn_u4_to_u8(); the remaining
+ * blockSize & 0x07 elements are expanded one input byte (two elements) per
+ * pass.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u4_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize & 0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_u4_to_u8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // Signed comparison so an odd remainder still terminates (step is 2).
+  while (lfover > 0) {
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 4, 0);
+    pOutput++;
+    *((uint8_t *)pOutput) = (uint8_t)bitextu((unsigned int)*pInput, 4, 4);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+/* Signed twin of the 4-bit -> 8-bit im2col copy (bitext / v4s). */
+static void __attribute__((noinline))
+xpulp_nn_im2col_i4_to_i8(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 3u;
+  int lfover = blockSize & 0x07;
+  for (int i = 0; i < blkCnt; i++) {
+    pInput = pulp_nn_i4_to_i8(pInput, pOutput);
+    MemoryFence();
+    pOutput += 8;
+  }
+  // Signed comparison so an odd remainder still terminates (step is 2).
+  while (lfover > 0) {
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 4, 0);
+    pOutput++;
+    *((int8_t *)pOutput) = (int8_t)bitext((int)*pInput, 4, 4);
+    pOutput++;
+    pInput++;
+    lfover -= 2;
+  }
+}
+/*
+ * Plain byte copy of blockSize unsigned 8-bit elements: whole 4-byte words
+ * first, then the remaining blockSize & 3 bytes one at a time.
+ */
+static void __attribute__((noinline))
+xpulp_nn_im2col_u8_to_u8(uint8_t *pInput, uint8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 2u;
+  int lfover = blockSize & 0x03;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4u *)pOutput) = *((v4u *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  while (lfover) {
+    *((uint8_t *)pOutput) = *((uint8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 1;
+  }
+}
+static void
__attribute__((noinline))
+xpulp_nn_im2col_i8_to_i8(int8_t *pInput, int8_t *pOutput,
+                         unsigned int blockSize) {
+  unsigned int blkCnt = blockSize >> 2u;
+  int lfover = blockSize & 0x03;
+  for (int i = 0; i < blkCnt; i++) {
+    *((v4s *)pOutput) = *((v4s *)pInput);
+    pInput += 4;
+    pOutput += 4;
+  }
+  while (lfover) {
+    *((int8_t *)pOutput) = *((int8_t *)pInput);
+    pOutput++;
+    pInput++;
+    lfover -= 1;
+  }
+}
+
+/*
+ * In-place element-wise maximum of two unsigned byte arrays:
+ * base[i] = max(base[i], target[i]) for i in [0, length).
+ * Whole 4-byte groups go through max4(); the remaining length & 3 bytes are
+ * compared one at a time.
+ */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_u8(uint8_t *base, uint8_t *target,
+                                         uint16_t length) {
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+  v4u inp;
+  v4u com;
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    inp = *((v4u *)pIn);
+    com = *((v4u *)pCom);
+
+    *((v4u *)pIn) = max4(inp, com);
+
+    pCom += 4;
+    pIn += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    if (*pIn < *pCom)
+      *pIn = *pCom;
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/* Signed twin of the byte-wise in-place maximum (maxs4 / v4s). */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_i8(int8_t *base, int8_t *target,
+                                         uint16_t length) {
+  int8_t *pIn = base;
+  int8_t *pCom = target;
+  v4s inp;
+  v4s com;
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    inp = *((v4s *)pIn);
+    com = *((v4s *)pCom);
+
+    *((v4s *)pIn) = maxs4(inp, com);
+
+    pCom += 4;
+    pIn += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    if (*pIn < *pCom)
+      *pIn = *pCom;
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/*
+ * In-place truncating average of two unsigned byte arrays:
+ * base[i] = (base[i] + target[i]) >> 1 for i in [0, length).
+ */
+static void __attribute__((noinline))
+pulp_nn_avg_and_replace_u8(uint8_t *base, uint8_t *target, uint16_t length) {
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+
+  while (length > 0u) {
+    *pIn = ((*pIn + *pCom) >> 1);
+
+    pIn++;
+    pCom++;
+    length--;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed unsigned 4-bit elements. length
+ * is counted in bytes: each main-loop pass consumes four bytes of both
+ * arrays, the tail handles the remaining length & 3 bytes.
+ */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_u4(uint8_t *base, uint8_t *target,
+                                         uint16_t length) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+  v4u inp[2];
+  v4u com[2];
+  // `out` used to be an uninitialised pointer that the loop stored through.
+  // Give it a properly aligned 4-byte object to point at.
+  v4u out_vec;
+  uint8_t *out = (uint8_t *)&out_vec;
+
int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    pulp_nn_u4_to_u8(pIn, (uint8_t *)inp);
+    pulp_nn_u4_to_u8(pCom, (uint8_t *)com);
+
+    *((v4u *)out) = max4(inp[0], com[0]);
+
+    *((uint8_t *)pIn) = bitins(*out, n_mask, *(out + 1), mask, off);
+    pIn++;
+    *((uint8_t *)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off);
+    pIn++;
+
+    *((v4u *)out) = max4(inp[1], com[1]);
+
+    *((uint8_t *)pIn) = bitins(*out, n_mask, *(out + 1), mask, off);
+    pIn++;
+    *((uint8_t *)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off);
+    pIn++;
+
+    pCom += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 4, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 4, 4);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 4, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 4, 4);
+
+    if (inA0 < inB0)
+      inA0 = inB0;
+
+    if (inA1 < inB1)
+      inA1 = inB1;
+
+    *((uint8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed signed 4-bit elements. length is
+ * counted in bytes. Each main-loop pass widens four bytes of both arrays to
+ * eight int8 values with pulp_nn_i4_to_i8(), takes maxs4() per 4-element
+ * vector and re-packs pairs of results into bytes with bitins(). The tail
+ * handles the remaining length & 3 bytes one byte (two elements) at a time.
+ */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_i4(int8_t *base, int8_t *target,
+                                         uint16_t length) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  int8_t *pIn = base;
+  int8_t *pCom = target;
+  v4s inp[2];
+  v4s com[2];
+  // `out` used to be an uninitialised pointer that the loop stored through.
+  // Give it a properly aligned 4-byte object to point at.
+  v4s out_vec;
+  int8_t *out = (int8_t *)&out_vec;
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    pulp_nn_i4_to_i8(pIn, (int8_t *)inp);
+    pulp_nn_i4_to_i8(pCom, (int8_t *)com);
+
+    *((v4s *)out) = maxs4(inp[0], com[0]);
+
+    *((int8_t *)pIn) = bitins(*out, n_mask, *(out + 1), mask, off);
+    pIn++;
+    *((int8_t *)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off);
+    pIn++;
+
+    *((v4s *)out) = maxs4(inp[1], com[1]);
+
+    *((int8_t *)pIn) = bitins(*out, n_mask, *(out + 1), mask, off);
+    pIn++;
+    *((int8_t *)pIn) = bitins(*(out + 2), n_mask, *(out + 3), mask, off);
+    pIn++;
+
+    pCom += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0);
+    int8_t inA1 =
(int8_t)bitext((int)*pIn, 4, 4);
+    int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0);
+    int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4);
+
+    if (inA0 < inB0)
+      inA0 = inB0;
+
+    if (inA1 < inB1)
+      inA1 = inB1;
+
+    *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/*
+ * In-place truncating average of two arrays of packed unsigned 4-bit
+ * elements. length is counted in bytes; every byte holds two elements that
+ * are averaged independently as (a + b) >> 1 and re-packed with bitins().
+ */
+static void __attribute__((noinline))
+pulp_nn_avg_and_replace_u4(uint8_t *base, uint8_t *target, uint16_t length) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+
+  while (length > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 4, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 4, 4);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 4, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 4, 4);
+
+    inA0 = ((inA0 + inB0) >> 1);
+    inA1 = ((inA1 + inB1) >> 1);
+
+    *((uint8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    length--;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed unsigned 2-bit elements. length
+ * is counted in bytes. Each main-loop pass widens four bytes of both arrays
+ * to sixteen bytes with pulp_nn_u2_to_u8(), takes max4() per 4-element
+ * vector and re-packs every four results into one byte with three bitins()
+ * calls (masks 0x0c / 0x30 / 0xc0 at offsets 2 / 4 / 6).
+ */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_u2(uint8_t *base, uint8_t *target,
+                                         uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+  int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+  v4u inp[4];
+  v4u com[4];
+  // `out` used to be an uninitialised pointer that the loops stored through.
+  // Give it a properly aligned 4-byte object to point at.
+  v4u out_vec;
+  uint8_t *out = (uint8_t *)&out_vec;
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    // The converter takes uint8_t *; cast the v4u scratch arrays explicitly,
+    // as the 4-bit variant of this routine already does.
+    pulp_nn_u2_to_u8(pIn, (uint8_t *)inp);
+    pulp_nn_u2_to_u8(pCom, (uint8_t *)com);
+
+    *((v4u *)out) = max4(inp[0], com[0]);
+
+    uint8_t inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    *((v4u *)out) = max4(inp[1], com[1]);
+
+    inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3),
mask6, off6);
+    pIn++;
+
+    *((v4u *)out) = max4(inp[2], com[2]);
+
+    inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    *((v4u *)out) = max4(inp[3], com[3]);
+
+    inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    pCom += 4;
+    cnt--;
+  }
+
+  // Tail: one byte (four 2-bit elements) per pass.
+  int left = length & 0x3;
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 2, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 2, 2);
+    uint8_t inA2 = (uint8_t)bitextu((unsigned int)*pIn, 2, 4);
+    uint8_t inA3 = (uint8_t)bitextu((unsigned int)*pIn, 2, 6);
+    v4u inA4 = pack((uint8_t)inA0, (uint8_t)inA1, (uint8_t)inA2, (uint8_t)inA3);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 2, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 2, 2);
+    uint8_t inB2 = (uint8_t)bitextu((unsigned int)*pCom, 2, 4);
+    uint8_t inB3 = (uint8_t)bitextu((unsigned int)*pCom, 2, 6);
+    v4u inB4 = pack((uint8_t)inB0, (uint8_t)inB1, (uint8_t)inB2, (uint8_t)inB3);
+
+    *((v4u *)out) = max4(inA4, inB4);
+
+    uint8_t inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed signed 2-bit elements. length is
+ * counted in bytes: each main-loop pass consumes four bytes of both arrays,
+ * the tail handles the remaining length & 3 bytes.
+ */
+static void __attribute__((noinline))
+pulp_nn_compare_and_replace_if_larger_i2(int8_t *base, int8_t *target,
+                                         uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+  int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  int8_t *pIn = base;
+  int8_t *pCom = target;
+  v4s inp[4];
+  v4s com[4];
+  // `out` used to be an uninitialised pointer that the loops stored through.
+  // Give it a properly aligned 4-byte object to point at.
+  v4s out_vec;
+  int8_t *out = (int8_t *)&out_vec;
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+
pulp_nn_i2_to_i8(pIn, (int8_t *)inp);
+    // The converter takes int8_t *; the v4s scratch arrays are cast
+    // explicitly, matching the 4-bit variant of this routine.
+    pulp_nn_i2_to_i8(pCom, (int8_t *)com);
+
+    *((v4s *)out) = maxs4(inp[0], com[0]);
+
+    // Re-pack four 2-bit results into one byte: three successive inserts at
+    // bit offsets 2, 4 and 6.
+    int8_t inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    *((v4s *)out) = maxs4(inp[1], com[1]);
+
+    inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    *((v4s *)out) = maxs4(inp[2], com[2]);
+
+    inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    *((v4s *)out) = maxs4(inp[3], com[3]);
+
+    inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+    pIn++;
+
+    pCom += 4;
+    cnt--;
+  }
+
+  // Tail: one byte (four 2-bit elements) per pass.
+  int left = length & 0x3;
+  while (left > 0u) {
+    int8_t inA0 = (int8_t)bitext((unsigned int)*pIn, 2, 0);
+    int8_t inA1 = (int8_t)bitext((unsigned int)*pIn, 2, 2);
+    int8_t inA2 = (int8_t)bitext((unsigned int)*pIn, 2, 4);
+    int8_t inA3 = (int8_t)bitext((unsigned int)*pIn, 2, 6);
+    v4s inA4 = pack((int8_t)inA0, (int8_t)inA1, (int8_t)inA2, (int8_t)inA3);
+    int8_t inB0 = (int8_t)bitext((unsigned int)*pCom, 2, 0);
+    int8_t inB1 = (int8_t)bitext((unsigned int)*pCom, 2, 2);
+    int8_t inB2 = (int8_t)bitext((unsigned int)*pCom, 2, 4);
+    int8_t inB3 = (int8_t)bitext((unsigned int)*pCom, 2, 6);
+    v4s inB4 = pack((int8_t)inB0, (int8_t)inB1, (int8_t)inB2, (int8_t)inB3);
+
+    *((v4s *)out) = maxs4(inA4, inB4);
+
+    int8_t inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/*
+ * In-place truncating average of two arrays of packed unsigned 2-bit
+ * elements. length is counted in bytes; every byte holds four elements that
+ * are averaged independently as (a + b) >> 1 and re-packed with bitins().
+ */
+static void __attribute__((noinline))
+pulp_nn_avg_and_replace_u2(uint8_t *base, uint8_t *target, uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+  int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+
+  while (length > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 2, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 2, 2);
+    uint8_t inA2 = (uint8_t)bitextu((unsigned int)*pIn, 2, 4);
+    uint8_t inA3 = (uint8_t)bitextu((unsigned int)*pIn, 2, 6);
+
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 2, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 2, 2);
+    uint8_t inB2 = (uint8_t)bitextu((unsigned int)*pCom, 2, 4);
+    uint8_t inB3 = (uint8_t)bitextu((unsigned int)*pCom, 2, 6);
+
+    inA0 = ((inA0 + inB0) >> 1);
+    inA1 = ((inA1 + inB1) >> 1);
+    inA2 = ((inA2 + inB2) >> 1);
+    inA3 = ((inA3 + inB3) >> 1);
+
+    uint8_t inA = (uint8_t)bitins(inA0, n_mask2, inA1, mask2, off2);
+    inA = bitins(inA, n_mask4, inA2, mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, inA3, mask6, off6);
+
+    pIn++;
+    pCom++;
+    length--;
+  }
+}
+
+/*
+ * XpulpNN
+ */
+
+/*
+ * Zero `size` 8-bit elements starting at pBuffer: whole 4-byte words first,
+ * then the remaining size & 3 bytes one at a time.
+ */
+static void __attribute__((noinline))
+xpulp_nn_zero_mem_u8(uint8_t *pBuffer, unsigned int size) {
+  int lfover = size & 0x3;
+  for (int i = 0; i < (size >> 2); i++) {
+    *((v4u *)pBuffer) = (v4u){0, 0, 0, 0};
+    MemoryFence();
+    pBuffer += 4;
+  }
+  while (lfover) {
+    *pBuffer++ = 0;
+    lfover--;
+  }
+}
+
+/*
+ * Zero `size` packed 4-bit elements starting at pBuffer. size is counted in
+ * elements: eight per 4-byte word in the main loop, two per byte in the
+ * tail. An odd remainder clears the whole last byte.
+ */
+static void __attribute__((noinline))
+xpulp_nn_zero_mem_u4(uint8_t *pBuffer, unsigned int size) {
+  int lfover = size & 0x7;
+  for (int i = 0; i < (size >> 3); i++) {
+    *((v4u *)pBuffer) = (v4u){0, 0, 0, 0};
+    MemoryFence();
+    pBuffer += 4;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 2 per pass, so an odd
+  // size would step past zero and keep clearing memory past the buffer.
+  while (lfover > 0) {
+    *pBuffer++ = 0;
+    lfover -= 2;
+  }
+}
+
+static void __attribute__((noinline))
+xpulp_nn_zero_mem_u2(uint8_t *pBuffer, unsigned int size) {
+  int lfover =
size & 0xf;
+  for (int i = 0; i < (size >> 4); i++) {
+    *((v4u *)pBuffer) = (v4u){0, 0, 0, 0};
+    MemoryFence();
+    pBuffer += 4;
+  }
+  // `> 0` rather than `!= 0`: the counter drops by 4 per pass, so a size that
+  // is not a multiple of 4 would step past zero and keep clearing memory
+  // past the buffer.
+  while (lfover > 0) {
+    *pBuffer++ = 0;
+    lfover -= 4;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed unsigned 4-bit elements using
+ * the max8() intrinsic on whole 32-bit words. length is counted in bytes:
+ * length >> 2 words in the main loop, the remaining length & 3 bytes are
+ * unpacked, compared and re-packed one byte at a time.
+ */
+static void __attribute__((noinline))
+xpulp_nn_compare_and_replace_if_larger_u4(uint8_t *base, uint8_t *target,
+                                          uint16_t length) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    *((uint32_t *)pIn) = max8(*((uint32_t *)pIn), *((uint32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 4, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 4, 4);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 4, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 4, 4);
+
+    if (inA0 < inB0)
+      inA0 = inB0;
+
+    if (inA1 < inB1)
+      inA1 = inB1;
+
+    *((uint8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+/* Signed twin of the packed 4-bit in-place maximum (maxs8 / bitext). */
+static void __attribute__((noinline))
+xpulp_nn_compare_and_replace_if_larger_i4(int8_t *base, int8_t *target,
+                                          uint16_t length) {
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  int8_t *pIn = base;
+  int8_t *pCom = target;
+
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    *((int32_t *)pIn) = maxs8(*((int32_t *)pIn), *((int32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    int8_t inA0 = (int8_t)bitext((int)*pIn, 4, 0);
+    int8_t inA1 = (int8_t)bitext((int)*pIn, 4, 4);
+    int8_t inB0 = (int8_t)bitext((int)*pCom, 4, 0);
+    int8_t inB1 = (int8_t)bitext((int)*pCom, 4, 4);
+
+    if (inA0 < inB0)
+      inA0 = inB0;
+
+    if (inA1 < inB1)
+      inA1 = inB1;
+
+    *((int8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+static void __attribute__((noinline))
+xpulp_nn_avg_and_replace_u4(uint8_t *base,
uint8_t *target, uint16_t length) {
+  // In-place truncating average of two arrays of packed unsigned 4-bit
+  // elements. length is counted in bytes: length >> 2 whole words go
+  // through avg8(), the remaining length & 3 bytes are averaged per nibble.
+  int8_t mask = 0xf0;
+  int8_t n_mask = ~mask;
+  int8_t off = 0x04;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    *((uint32_t *)pIn) = avg8(*((uint32_t *)pIn), *((uint32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 4, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 4, 4);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 4, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 4, 4);
+
+    inA0 = ((inA0 + inB0) >> 1);
+    inA1 = ((inA1 + inB1) >> 1);
+
+    *((uint8_t *)pIn) = bitins(inA0, n_mask, inA1, mask, off);
+
+    pIn++;
+    pCom++;
+    // Decrement the tail counter that the loop condition tests. This used
+    // to decrement `length`, so `left` never changed and the loop ran on
+    // past the end of both arrays whenever length & 3 was non-zero.
+    left--;
+  }
+}
+
+/*
+ * In-place maximum of two arrays of packed unsigned 2-bit elements using
+ * the max16() intrinsic on whole 32-bit words. length is counted in bytes:
+ * length >> 2 words in the main loop, the remaining length & 3 bytes are
+ * unpacked, compared with max4() and re-packed one byte at a time.
+ */
+static void __attribute__((noinline))
+xpulp_nn_compare_and_replace_if_larger_u2(uint8_t *base, uint8_t *target,
+                                          uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+  int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+  // `out` used to be an uninitialised pointer that the tail loop stored
+  // through. Give it a properly aligned 4-byte object to point at.
+  v4u out_vec;
+  uint8_t *out = (uint8_t *)&out_vec;
+
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    *((uint32_t *)pIn) = max16(*((uint32_t *)pIn), *((uint32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 2, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 2, 2);
+    uint8_t inA2 = (uint8_t)bitextu((unsigned int)*pIn, 2, 4);
+    uint8_t inA3 = (uint8_t)bitextu((unsigned int)*pIn, 2, 6);
+    v4u inA4 = pack((uint8_t)inA0, (uint8_t)inA1, (uint8_t)inA2, (uint8_t)inA3);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 2, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 2, 2);
+    uint8_t inB2 = (uint8_t)bitextu((unsigned int)*pCom, 2, 4);
+    uint8_t inB3 = (uint8_t)bitextu((unsigned int)*pCom, 2, 6);
+    v4u inB4 =
pack((uint8_t)inB0, (uint8_t)inB1, (uint8_t)inB2, (uint8_t)inB3);
+
+    *((v4u *)out) = max4(inA4, inB4);
+
+    uint8_t inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+/*
+ * In-place maximum of two arrays of packed signed 2-bit elements using the
+ * maxs16() intrinsic on whole 32-bit words. length is counted in bytes:
+ * length >> 2 words in the main loop, the remaining length & 3 bytes are
+ * unpacked with bitext(), compared with maxs4() and re-packed with three
+ * bitins() calls, one byte (four elements) at a time.
+ */
+static void __attribute__((noinline))
+xpulp_nn_compare_and_replace_if_larger_i2(int8_t *base, int8_t *target,
+                                          uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+  int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  int8_t *pIn = base;
+  int8_t *pCom = target;
+  // `out` used to be an uninitialised pointer that the tail loop stored
+  // through. Give it a properly aligned 4-byte object to point at.
+  v4s out_vec;
+  int8_t *out = (int8_t *)&out_vec;
+
+  int cnt = length >> 2;
+
+  while (cnt > 0u) {
+    *((int32_t *)pIn) = maxs16(*((int32_t *)pIn), *((int32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+    cnt--;
+  }
+
+  int left = length & 0x3;
+  while (left > 0u) {
+    int8_t inA0 = (int8_t)bitext((int)*pIn, 2, 0);
+    int8_t inA1 = (int8_t)bitext((int)*pIn, 2, 2);
+    int8_t inA2 = (int8_t)bitext((int)*pIn, 2, 4);
+    int8_t inA3 = (int8_t)bitext((int)*pIn, 2, 6);
+    v4s inA4 = pack((int8_t)inA0, (int8_t)inA1, (int8_t)inA2, (int8_t)inA3);
+    int8_t inB0 = (int8_t)bitext((int)*pCom, 2, 0);
+    int8_t inB1 = (int8_t)bitext((int)*pCom, 2, 2);
+    int8_t inB2 = (int8_t)bitext((int)*pCom, 2, 4);
+    int8_t inB3 = (int8_t)bitext((int)*pCom, 2, 6);
+    v4s inB4 = pack((int8_t)inB0, (int8_t)inB1, (int8_t)inB2, (int8_t)inB3);
+
+    *((v4s *)out) = maxs4(inA4, inB4);
+
+    int8_t inA = (int8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((int8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+static void __attribute__((noinline))
+xpulp_nn_avg_and_replace_u2(uint8_t *base, uint8_t *target, uint16_t length) {
+  int8_t mask2 = 0x0c;
+  int8_t n_mask2 = ~mask2;
+  int8_t mask4 = 0x30;
+  int8_t n_mask4 = ~mask4;
+
int8_t mask6 = 0xc0;
+  int8_t n_mask6 = ~mask6;
+  int8_t off2 = 2;
+  int8_t off4 = 4;
+  int8_t off6 = 6;
+
+  uint8_t *pIn = base;
+  uint8_t *pCom = target;
+  // `out` used to be an uninitialised pointer that the tail loop stored
+  // through. Give it a properly aligned 4-byte object to point at.
+  v4u out_vec;
+  uint8_t *out = (uint8_t *)&out_vec;
+
+  int cnt = length >> 2;
+
+  // Whole 32-bit words (sixteen 2-bit elements each) go through avg16().
+  while (cnt > 0u) {
+    *((uint32_t *)pIn) = avg16(*((uint32_t *)pIn), *((uint32_t *)pCom));
+
+    pIn += 4;
+    pCom += 4;
+    cnt--;
+  }
+
+  // Remaining length & 3 bytes: unpack, average with avg4(), re-pack.
+  int left = length & 0x3;
+  while (left > 0u) {
+    uint8_t inA0 = (uint8_t)bitextu((unsigned int)*pIn, 2, 0);
+    uint8_t inA1 = (uint8_t)bitextu((unsigned int)*pIn, 2, 2);
+    uint8_t inA2 = (uint8_t)bitextu((unsigned int)*pIn, 2, 4);
+    uint8_t inA3 = (uint8_t)bitextu((unsigned int)*pIn, 2, 6);
+    v4u inA4 = pack((uint8_t)inA0, (uint8_t)inA1, (uint8_t)inA2, (uint8_t)inA3);
+    uint8_t inB0 = (uint8_t)bitextu((unsigned int)*pCom, 2, 0);
+    uint8_t inB1 = (uint8_t)bitextu((unsigned int)*pCom, 2, 2);
+    uint8_t inB2 = (uint8_t)bitextu((unsigned int)*pCom, 2, 4);
+    uint8_t inB3 = (uint8_t)bitextu((unsigned int)*pCom, 2, 6);
+    v4u inB4 = pack((uint8_t)inB0, (uint8_t)inB1, (uint8_t)inB2, (uint8_t)inB3);
+
+    *((v4u *)out) = avg4(inA4, inB4);
+
+    uint8_t inA = (uint8_t)bitins(*out, n_mask2, *(out + 1), mask2, off2);
+    inA = bitins(inA, n_mask4, *(out + 2), mask4, off4);
+    *((uint8_t *)pIn) = bitins(inA, n_mask6, *(out + 3), mask6, off6);
+
+    pIn++;
+    pCom++;
+    left--;
+  }
+}
+
+#endif
diff --git a/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a b/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a
new file mode 100644
index 0000000000..d63369b6c9
Binary files /dev/null and b/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a differ
diff --git a/TargetLibraries/GAP9/src/Util.c b/TargetLibraries/GAP9/src/Util.c
new file mode 100644
index 0000000000..710b9ed521
--- /dev/null
+++ b/TargetLibraries/GAP9/src/Util.c
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployGAP9Math.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Overwrite weak function from
/*
 * Overrides the weak deeploy_log from DeeployBasicLibs.
 * Forwards the format string and variadic arguments to vprintf() and returns
 * whatever vprintf() returns.
 */
int deeploy_log(const char *__restrict fmt, ...) {
  va_list ap;
  int written;

  va_start(ap, fmt);
  written = vprintf(fmt, ap);
  va_end(ap);

  return written;
}

/* Allocates `size` bytes with malloc() and returns the result unchanged. */
void *deeploy_malloc(const size_t size) {
  void *block = malloc(size);
  return block;
}

/* Releases a block with free(). */
void deeploy_free(void *const ptr) {
  free(ptr);
}
(a) : (b)) + +void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy) { + int core_id = pi_core_id(); + int Log2Core = log2(log2(NUM_CORES)); + int number_of_copies_per_core = + (copy->length_1d_copy >> Log2Core) + + ((copy->length_1d_copy & (NUM_CORES - 1)) != 0); + int start_pixel, stop_pixel; // "pixel" is a misnomer; the CHANNELS are + // divided between the cores + // this function assumes that a DW tile is always as wide as the complete + // feature map (this is enforced by DORY's tiler) + start_pixel = MIN(number_of_copies_per_core * core_id, copy->length_1d_copy); + stop_pixel = + MIN(start_pixel + number_of_copies_per_core, copy->length_1d_copy); + void *ext = copy->ext + start_pixel; + void *loc = copy->loc + copy->number_of_1d_copies * + copy->number_of_2d_copies * start_pixel; + const int size_2d = copy->number_of_1d_copies * copy->number_of_2d_copies; + + for (int i = start_pixel; i < stop_pixel; i++) { + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = ext, + .loc = loc, + .ext_size_1d = 1, // one byte at a time... 
+ .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + dory_dma_barrier(copy); +#endif + ext += 1; // next channel + loc += copy->number_of_1d_copies * copy->number_of_2d_copies; + } +} + +void dory_dma_memcpy_1d_async(DMA_copy *copy) { + if (pi_core_id() == 0) { + mchan_transfer_t trans = { + .cmd = copy->length_1d_copy * copy->number_of_1d_copies * + copy->number_of_2d_copies | + (copy->dir << MCHAN_CMD_SHIFT_DIRECTION) | MCHAN_FLAGS_1D, + .size = copy->length_1d_copy * copy->number_of_1d_copies * + copy->number_of_2d_copies, + .ext = copy->ext, + .loc = copy->loc}; + mchan_transfer_push_1d(trans); + } +} + +void dory_dma_memcpy_2d_async(DMA_copy *copy) { + if (pi_core_id() == 0) { + const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy * + copy->number_of_2d_copies; + const int stride = + (copy->number_of_2d_copies == 1) ? copy->stride_1d : copy->stride_2d; + const int size_1d = (copy->number_of_2d_copies == 1) + ? 
copy->length_1d_copy + : copy->length_1d_copy * copy->number_of_1d_copies; + + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = copy->ext, + .loc = copy->loc, + .ext_size_1d = size_1d, + .ext_stride_1d = stride}; + mchan_transfer_push_2d(trans); + } +} + +void dory_dma_memcpy_3d_async(DMA_copy *copy) { + int core_id = pi_core_id(); + if (core_id == 0) { + int Log2Core = log2(1); + int number_of_2d_copies_per_core = (copy->number_of_2d_copies >> Log2Core) + + ((copy->number_of_2d_copies & (0)) != 0); + int start_pixel, stop_pixel; + start_pixel = + MIN(number_of_2d_copies_per_core * core_id, copy->number_of_2d_copies); + stop_pixel = MIN(start_pixel + number_of_2d_copies_per_core, + copy->number_of_2d_copies); + void *ext = copy->ext + copy->stride_2d * start_pixel; + void *loc = copy->loc + + copy->length_1d_copy * copy->number_of_1d_copies * start_pixel; + const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy; + + for (int i = start_pixel; i < stop_pixel; i++) { + mchan_transfer_t trans = {.cmd = size_2d | + copy->dir << MCHAN_CMD_SHIFT_DIRECTION | + MCHAN_FLAGS_2D, + .size = size_2d, + .ext = ext, + .loc = loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + // dory_dma_barrier(copy); +#endif + loc += size_2d; + ext += copy->stride_2d; + } + } +} + +void dory_dma_memcpy_async(DMA_copy *copy) { + if (copy->hwc_to_chw == 1) { + dory_dma_memcpy_hwc_to_chw(copy); + } else if ((copy->number_of_2d_copies == 1 && + copy->number_of_1d_copies == 1) || + (copy->stride_1d == copy->length_1d_copy && + copy->number_of_1d_copies * copy->length_1d_copy == + copy->stride_2d) || + (copy->number_of_2d_copies == 1 && + copy->length_1d_copy == copy->stride_1d)) { + dory_dma_memcpy_1d_async(copy); + } else if ((copy->number_of_2d_copies == 1) || + 
(copy->length_1d_copy == copy->stride_1d)) { // wrong! + dory_dma_memcpy_2d_async(copy); + } else { + dory_dma_memcpy_3d_async(copy); + } +} + +void dory_dma_memcpy_1d_mindims_async(DMA_copy *copy) { + mchan_transfer_t trans = { + .cmd = copy->mchan_cmd, .ext = copy->ext, .loc = copy->loc}; + mchan_transfer_push_1d(trans); +} + +void dory_dma_memcpy_2d_mindims_async(DMA_copy *copy) { + mchan_transfer_t trans = {.cmd = copy->mchan_cmd, + .ext = copy->ext, + .loc = copy->loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); +} + +void dory_dma_memcpy_3d_mindims_async(DMA_copy *copy) { + void *ext = copy->ext; + void *loc = copy->loc; + const int length_2d_copy = + copy->mchan_cmd & ((1 << MCHAN_TRANSFER_LEN_SIZE) - 1); + + for (int i = 0; i < copy->number_of_2d_copies; i++) { + mchan_transfer_t trans = {.cmd = copy->mchan_cmd, + .ext = ext, + .loc = loc, + .ext_size_1d = copy->length_1d_copy, + .ext_stride_1d = copy->stride_1d}; + mchan_transfer_push_2d(trans); + loc += length_2d_copy; + ext += copy->stride_2d; +#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board + // dory_dma_barrier(copy); +#endif + } +} + +void dory_dma_memcpy_mindims_async(DMA_copy *copy) { + if (copy->number_of_2d_copies == 1 && copy->number_of_1d_copies == 1) { + dory_dma_memcpy_1d_mindims_async(copy); + } else if (copy->number_of_2d_copies == 1) { + dory_dma_memcpy_2d_mindims_async(copy); + } else { + dory_dma_memcpy_3d_mindims_async(copy); + } +} + +void dory_dma_free(DMA_copy *copy) { mchan_transfer_free(copy->tid); } + +void dory_dma_barrier(DMA_copy *copy) { mchan_transfer_wait(copy->tid); } + +int dory_dma_allocate() { return mchan_transfer_get_id(); } diff --git a/TargetLibraries/GAP9/src/dory_mem.c b/TargetLibraries/GAP9/src/dory_mem.c new file mode 100644 index 0000000000..7564d2ee8d --- /dev/null +++ b/TargetLibraries/GAP9/src/dory_mem.c @@ -0,0 +1,176 @@ +/* + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of 
Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "dory_mem.h" +#include "bsp/bsp.h" +// #include "bsp/flash.h" +#include "bsp/fs.h" +#include "bsp/fs/readfs.h" +// #include "bsp/ram.h" +#include "pmsis.h" + +#ifdef USE_HYPERFLASH +#include "bsp/flash/hyperflash.h" +typedef struct pi_hyperflash_conf flash_conf_t; +#define flash_conf_init(conf) pi_hyperflash_conf_init(conf) +#elif defined USE_SPIFLASH +#include "bsp/flash/spiflash.h" +typedef struct pi_spiflash_conf flash_conf_t; +#define flash_conf_init(conf) pi_spiflash_conf_init(conf) +#elif defined USE_MRAM +typedef struct pi_mram_conf flash_conf_t; +#define flash_conf_init(conf) pi_mram_conf_init(conf) +#else +typedef struct pi_default_flash_conf flash_conf_t; +#define flash_conf_init(conf) pi_default_flash_conf_init(conf) +#endif + +#ifdef USE_HYPERRAM +#include "bsp/ram/hyperram.h" +typedef struct pi_hyperram_conf ram_conf_t; +#define ram_conf_init(conf) pi_hyperram_conf_init(conf) +#else +typedef struct pi_default_ram_conf ram_conf_t; +#define ram_conf_init(conf) pi_default_ram_conf_init(conf) +#endif + +#define BUFFER_SIZE 2048 // 128 +static uint8_t buffer[BUFFER_SIZE]; + +static struct pi_device flash; +static flash_conf_t flash_conf; + +static struct pi_device fs; +static struct pi_readfs_conf fs_conf; + +struct pi_device ram; +static ram_conf_t ram_conf; + +void open_fs() { + // SCHEREMO: Fix FS + // Open filesystem on flash. + pi_readfs_conf_init(&fs_conf); + fs_conf.fs.flash = &flash; + pi_open_from_conf(&fs, &fs_conf); + if (pi_fs_mount(&fs)) { + printf("ERROR: Cannot mount filesystem! Exiting...\n"); + pmsis_exit(-2); + } +} + +void mem_init() { + flash_conf_init(&flash_conf); + pi_open_from_conf(&flash, &flash_conf); + if (pi_flash_open(&flash)) { + printf("ERROR: Cannot open flash! Exiting...\n"); + pmsis_exit(-1); + } + + ram_conf_init(&ram_conf); + pi_open_from_conf(&ram, &ram_conf); + if (pi_ram_open(&ram)) { + printf("ERROR: Cannot open ram! 
Exiting...\n"); + pmsis_exit(-3); + } +} + +struct pi_device *get_ram_ptr() { return &ram; } + +void *ram_malloc(size_t size) { + void *ptr = NULL; + pi_ram_alloc(&ram, (uint32_t *)&ptr, size); + return ptr; +} + +void ram_free(void *ptr, size_t size) { + pi_ram_free(&ram, (uint32_t)ptr, size); +} + +void ram_read(void *dest, void *src, const size_t size) { + pi_ram_read(&ram, (uint32_t)src, dest, size); +} + +void ram_write(void *dest, void *src, const size_t size) { + pi_ram_write(&ram, (uint32_t)dest, src, size); +} + +void *cl_ram_malloc(size_t size) { + uint32_t addr; + pi_cl_ram_alloc_req_t req; + pi_cl_ram_alloc(&ram, size, &req); + pi_cl_ram_alloc_wait(&req, &addr); + return (void *)addr; +} + +void cl_ram_free(void *ptr, size_t size) { + pi_cl_ram_free_req_t req; + pi_cl_ram_free(&ram, (uint32_t)ptr, size, &req); + pi_cl_ram_free_wait(&req); +} + +void cl_ram_read(void *dest, void *src, const size_t size) { + pi_cl_ram_req_t req; + pi_cl_ram_read(&ram, (uint32_t)src, dest, size, &req); + pi_cl_ram_read_wait(&req); +} + +void cl_ram_write(void *dest, void *src, const size_t size) { + pi_cl_ram_req_t req; + pi_cl_ram_write(&ram, (uint32_t)dest, src, size, &req); + pi_cl_ram_write_wait(&req); +} + +size_t load_file_to_ram(const void *dest, const char *filename) { + pi_fs_file_t *fd = pi_fs_open(&fs, filename, 0); + if (fd == NULL) { + printf("ERROR: Cannot open file %s! Exiting...", filename); + pmsis_exit(-4); + } + + size_t size = fd->size; + size_t load_size = 0; + size_t remaining_size = size; + + size_t offset = 0; + do { + + remaining_size = size - offset; + load_size = BUFFER_SIZE < remaining_size ? 
BUFFER_SIZE : remaining_size; + + pi_cl_fs_req_t req; + pi_cl_fs_read(fd, buffer, load_size, &req); + pi_cl_fs_wait(&req); + cl_ram_write(dest + offset, buffer, load_size); + offset += load_size; + } while (offset < size); + + return offset; +} + +size_t load_file_to_local(const void *dest, const char *filename) { + pi_fs_file_t *fd = pi_fs_open(&fs, filename, 0); + if (fd == NULL) { + printf("ERROR: Cannot open file %s! Exiting...", filename); + pmsis_exit(-4); + } + + const size_t size = fd->size; + size_t remaining_size = size; + size_t offset = 0; + pi_cl_fs_req_t req; + + while (offset < size) { + remaining_size = size - offset; + size_t load_size = + BUFFER_SIZE < remaining_size ? BUFFER_SIZE : remaining_size; + pi_cl_fs_read(fd, buffer, load_size, &req); + pi_cl_fs_wait(&req); + memcpy(dest + offset, buffer, load_size); + offset += load_size; + } + + return offset; +} diff --git a/TargetLibraries/Generic/src/BatchNorm_fp32.c b/TargetLibraries/Generic/src/BatchNorm_fp32.c index 9b30a30207..1e94d63dbb 100644 --- a/TargetLibraries/Generic/src/BatchNorm_fp32.c +++ b/TargetLibraries/Generic/src/BatchNorm_fp32.c @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "DeeployBasicMath.h" +#include void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, const float32_t *beta, const float32_t *mean, diff --git a/TargetLibraries/Generic/src/Sqrt_fp32.c b/TargetLibraries/Generic/src/Sqrt_fp32.c index 06327fda4e..e206893cde 100644 --- a/TargetLibraries/Generic/src/Sqrt_fp32.c +++ b/TargetLibraries/Generic/src/Sqrt_fp32.c @@ -5,6 +5,7 @@ */ #include "DeeployBasicMath.h" +#include void Sqrt_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) { for (int i = 0; i < size; i++) { diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 0b9a3247e7..1a510c945b 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -39,7 +39,7 @@ 
target_compile_options(deeploypulp PRIVATE target_include_directories(deeploypulp PUBLIC ${PULP_SDK_INCLUDES}) target_compile_options(deeploypulp PUBLIC ${PULP_SDK_COMPILE_FLAGS}) -add_subdirectory(third_party/pulp-nn-mixed) +add_subdirectory(../third_party/pulp-nn-mixed ${CMAKE_CURRENT_BINARY_DIR}/pulp-nn-mixed) target_include_directories(pulp-nn-mixed PUBLIC ${PULP_SDK_INCLUDES}) target_compile_options(pulp-nn-mixed PUBLIC ${PULP_SDK_COMPILE_FLAGS}) @@ -54,7 +54,7 @@ if (platform IN_LIST PULP_NNX_PLATFORMS) else() message(FATAL_ERROR "Missing accelerator flags for platform ${platform}") endif() - add_subdirectory(third_party/pulp-nnx) + add_subdirectory(../third_party/pulp-nnx ${CMAKE_CURRENT_BINARY_DIR}/pulp-nnx) target_include_directories(pulp-nnx PUBLIC ${PULP_SDK_INCLUDES}) target_compile_options(pulp-nnx PUBLIC ${PULP_SDK_COMPILE_FLAGS}) target_compile_options(pulp-nnx PRIVATE diff --git a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h index 43e9c55cf4..cb56152bd6 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h @@ -10,8 +10,7 @@ #include "DeeployPULPMath.h" void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, - float32_t *scale, float32_t *bias, - float32_t epsilon, uint32_t size, - uint32_t lastDimLength); + float32_t *scale, float32_t *bias, uint32_t size, + uint32_t lastDimLength, float32_t epsilon); #endif // __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER__ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c index 3565fc664d..58b37d94d6 100644 --- a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -6,6 +6,7 @@ #include "DeeployPULPMath.h" #include "pmsis.h" +#include void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, 
uint32_t C, diff --git a/TargetLibraries/PULPOpen/src/GELU.c b/TargetLibraries/PULPOpen/src/GELU.c index 5a0a4fa3f0..27176e1c27 100644 --- a/TargetLibraries/PULPOpen/src/GELU.c +++ b/TargetLibraries/PULPOpen/src/GELU.c @@ -7,6 +7,7 @@ #include "pmsis.h" #include "DeeployPULPMath.h" +#include #define M_PI 3.14159265358979323846 diff --git a/TargetLibraries/PULPOpen/src/Layernorm.c b/TargetLibraries/PULPOpen/src/Layernorm.c index f8387ab5e2..9324ff19ee 100644 --- a/TargetLibraries/PULPOpen/src/Layernorm.c +++ b/TargetLibraries/PULPOpen/src/Layernorm.c @@ -8,10 +8,11 @@ #include "DeeployPULPMath.h" +#include + void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, - float32_t *scale, float32_t *bias, - float32_t epsilon, uint32_t size, - uint32_t lastDimLength) { + float32_t *scale, float32_t *bias, uint32_t size, + uint32_t lastDimLength, float32_t epsilon) { int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); diff --git a/TargetLibraries/PULPOpen/src/Softmax.c b/TargetLibraries/PULPOpen/src/Softmax.c index 3fd60111fe..00678be825 100644 --- a/TargetLibraries/PULPOpen/src/Softmax.c +++ b/TargetLibraries/PULPOpen/src/Softmax.c @@ -6,6 +6,7 @@ #include "DeeployPULPMath.h" #include "pmsis.h" +#include void PULPSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed deleted file mode 160000 index a9b4aaf597..0000000000 --- a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a9b4aaf597c030ce24bf65a00b5f3ec84a1528c4 diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nnx b/TargetLibraries/PULPOpen/third_party/pulp-nnx deleted file mode 160000 index 234971fca4..0000000000 --- a/TargetLibraries/PULPOpen/third_party/pulp-nnx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 234971fca4a0eba5e8b703e9ccb62b7764dac7fa diff --git 
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

# Mark that GAP9-specific gvsoc emulation is defined
set(GAP9_GVSOC_DEFINED TRUE)

#[[
add_gvsoc_emulation(<name> <target>)

Creates the custom target gvsoc_<name>, which depends on <name> and simulates
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/<name> inside ${CMAKE_BINARY_DIR}/gvsoc_workdir.

Inputs read from the calling scope / environment:
  GAP_SDK_HOME (environment, required)  root of the GAP9 SDK.
  GAPY_RUNNER_ARGS (optional list)      extra gapy arguments; when non-empty
                                        the "L3" flow is used (gapy with the
                                        multi-readfs flash layout), otherwise
                                        the "L2" flow runs gvsoc directly.
  GVSOC_INSTALL_DIR (L2 flow only)      gvsoc installation prefix; taken from
                                        the CMake variable, falling back to the
                                        environment variable of the same name.

This is a macro, so the helper variables it sets (GVSOC_WORKDIR, GVSOC_BINARY,
GAPY_CMD, GVSOC_CMD, ...) are visible to the caller afterwards.

NOTE(review): the L3 flow always passes --target=gap9.evk and does not use
<target>; confirm that this is intended.
]]
macro(add_gvsoc_emulation name target)

  set(GVSOC_WORKDIR "${CMAKE_BINARY_DIR}/gvsoc_workdir")
  file(MAKE_DIRECTORY "${GVSOC_WORKDIR}")
  set(GVSOC_BINARY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}")

  # GAP9 SDK paths
  set(GAP9_SDK_HOME "$ENV{GAP_SDK_HOME}")
  if(NOT GAP9_SDK_HOME)
    message(FATAL_ERROR "Environment variable GAP_SDK_HOME not set")
  endif()

  # Stage any *.bin files from the build directory next to the simulation.
  # The glob and the `|| true` (no matching file is not an error) need a shell,
  # so the command is wrapped in `bash -c`; that keeps it working with VERBATIM.
  set(GVSOC_STAGE_BINS_CMD
    bash -c
    "${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ 2>/dev/null || true"
  )

  # A non-empty GAPY_RUNNER_ARGS indicates L3 with readfs files
  if(GAPY_RUNNER_ARGS)
    # L3 mode: Use gapy with flash layout and readfs
    message(STATUS "[Deeploy GAP9] L3 mode: using gapy with readfs")

    set(GAPY "${GAP9_SDK_HOME}/utils/gapy_v2/bin/gapy")
    set(FLASH_LAYOUT "${GAP9_SDK_HOME}/utils/layouts/default_layout_multi_readfs.json")
    set(FSBL_BINARY "${GAP9_SDK_HOME}/install/target/bin/fsbl")
    set(SSBL_BINARY "${GAP9_SDK_HOME}/install/target/bin/ssbl")

    list(LENGTH GAPY_RUNNER_ARGS num_readfs_files)
    message(STATUS "[Deeploy GAP9] Adding ${num_readfs_files} readfs file(s)")

    # Argument order matters: application image, readfs files, boot loaders,
    # then the gapy sub-commands.
    set(GAPY_CMD
      ${GAPY}
      --target=gap9.evk
      --target-dir=${GAP9_SDK_HOME}/install/workstation/generators
      --model-dir=${GAP9_SDK_HOME}/install/workstation/models
      --platform=gvsoc
      --work-dir=${GVSOC_WORKDIR}
      --target-property=boot.flash_device=mram
      --target-property=boot.mode=flash
      --multi-flash-content=${FLASH_LAYOUT}
      --flash-property=${GVSOC_BINARY}@mram:app:binary
      ${GAPY_RUNNER_ARGS}
      --flash-property=${FSBL_BINARY}@mram:fsbl:binary
      --flash-property=${SSBL_BINARY}@mram:ssbl:binary
      --py-stack
      image flash run
      --binary=${GVSOC_BINARY}
    )

    # Convert list to string for printing
    string(REPLACE ";" " " GAPY_CMD_STR "${GAPY_CMD}")

    # POST_BUILD is a keyword of add_custom_command(TARGET ...), not of
    # add_custom_target, so it is intentionally absent here.
    add_custom_target(gvsoc_${name}
      DEPENDS ${name}
      WORKING_DIRECTORY ${GVSOC_WORKDIR}
      COMMAND ${GVSOC_STAGE_BINS_CMD}
      COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${GAP9_SDK_HOME}/utils/efuse/GAP9/efuse_hyper_preload.data
        ${GVSOC_WORKDIR}/chip.efuse_preload.data
      COMMAND ${CMAKE_COMMAND} -E echo "=========================================="
      COMMAND ${CMAKE_COMMAND} -E echo "[Deeploy GAP9] Executing gapy command (L3 mode with readfs):"
      COMMAND ${CMAKE_COMMAND} -E echo "${GAPY_CMD_STR}"
      COMMAND ${CMAKE_COMMAND} -E echo "=========================================="
      COMMAND ${GAPY_CMD}
      COMMENT "Simulating ${name} with gapy for GAP9 (L3 mode)"
      USES_TERMINAL
      VERBATIM
    )

  else()
    # L2 mode: Use traditional gvsoc command directly (no flash/readfs)
    message(STATUS "[Deeploy GAP9] L2 mode: using traditional gvsoc without flash")

    # Accept either the CMake variable or the environment variable, and fail
    # at configure time instead of producing a target that runs "/bin/gvsoc".
    if(NOT DEFINED GVSOC_INSTALL_DIR AND DEFINED ENV{GVSOC_INSTALL_DIR})
      set(GVSOC_INSTALL_DIR "$ENV{GVSOC_INSTALL_DIR}")
    endif()
    if(NOT GVSOC_INSTALL_DIR)
      message(FATAL_ERROR
        "GVSOC_INSTALL_DIR not set. Pass -DGVSOC_INSTALL_DIR=... or export it in the environment")
    endif()

    set(GVSOC_EXECUTABLE "${GVSOC_INSTALL_DIR}/bin/gvsoc")

    set(GVSOC_CMD
      ${GVSOC_EXECUTABLE}
      --target=${target}
      --binary ${GVSOC_BINARY}
      --work-dir=${GVSOC_WORKDIR}
      image flash run
    )

    # Convert list to string for printing
    string(REPLACE ";" " " GVSOC_CMD_STR "${GVSOC_CMD}")

    add_custom_target(gvsoc_${name}
      DEPENDS ${name}
      WORKING_DIRECTORY ${GVSOC_WORKDIR}
      COMMAND ${GVSOC_STAGE_BINS_CMD}
      COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${GAP9_SDK_HOME}/utils/efuse/GAP9/efuse_hyper_preload.data
        ${GVSOC_WORKDIR}/chip.efuse_preload.data
      COMMAND ${CMAKE_COMMAND} -E echo "=========================================="
      COMMAND ${CMAKE_COMMAND} -E echo "[Deeploy GAP9] Executing gvsoc command - L2 mode:"
      COMMAND ${CMAKE_COMMAND} -E echo "${GVSOC_CMD_STR}"
      COMMAND ${CMAKE_COMMAND} -E echo "=========================================="
      COMMAND ${GVSOC_CMD}
      COMMENT "Simulating ${name} with gvsoc for GAP9 (L2 mode)"
      USES_TERMINAL
      VERBATIM
    )
  endif()
endmacro()
#[[
add_gvsoc_emulation(<name> <target>)

Creates the custom target gvsoc_<name>, which depends on <name> and runs
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/<name> on the gvsoc simulator for the given
gvsoc <target>, using ${CMAKE_BINARY_DIR}/gvsoc_workdir as working directory.

GVSOC_INSTALL_DIR  gvsoc installation prefix. Read from the CMake variable
                   (-DGVSOC_INSTALL_DIR=...); if that is not defined, the
                   environment variable of the same name is used, as earlier
                   versions of this macro did.
GVSOC_EXTRA_FLAGS  optional list of extra arguments for the gvsoc command.

This is a macro, so GVSOC_WORKDIR, GVSOC_EXECUTABLE and GVSOC_BINARY remain
set in the caller's scope.
]]
macro(add_gvsoc_emulation name target)
  # Keep environment-only setups working after the switch to a CMake variable.
  if(NOT DEFINED GVSOC_INSTALL_DIR AND DEFINED ENV{GVSOC_INSTALL_DIR})
    set(GVSOC_INSTALL_DIR "$ENV{GVSOC_INSTALL_DIR}")
  endif()
  if(NOT DEFINED GVSOC_INSTALL_DIR)
    message(FATAL_ERROR "CMake variable GVSOC_INSTALL_DIR not set. Please specify it with -DGVSOC_INSTALL_DIR=")
  endif()

  set(GVSOC_WORKDIR "${CMAKE_BINARY_DIR}/gvsoc_workdir")
  file(MAKE_DIRECTORY "${GVSOC_WORKDIR}")
  set(GVSOC_EXECUTABLE "${GVSOC_INSTALL_DIR}/bin/gvsoc")
  set(GVSOC_BINARY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}")

  # The *.bin glob and the `|| true` (no matching file is not an error) need a
  # shell, so that step is wrapped in `bash -c`; every command can then be
  # passed VERBATIM. POST_BUILD belongs to add_custom_command(TARGET ...) and
  # is not an add_custom_target keyword, so it is not used here.
  add_custom_target(gvsoc_${name}
    DEPENDS ${name}
    WORKING_DIRECTORY ${GVSOC_WORKDIR}
    COMMAND bash -c "${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ || true"
    COMMAND ${GVSOC_EXECUTABLE}
      --target=${target}
      --binary ${GVSOC_BINARY}
      --work-dir=${GVSOC_WORKDIR}
      ${GVSOC_EXTRA_FLAGS}
      image flash run
    COMMENT "Simulating deeploytest ${name} with gvsoc for the target ${target}"
    USES_TERMINAL
    VERBATIM
  )
endmacro()