diff --git a/.gitignore b/.gitignore
index 8fb87927ce..3327583d32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ sdist/
 *.egg-info/
 vivado_prj
 .vscode
+.idea
 my-hls-test
 *.tar.gz
 docs/_build
diff --git a/docs/backend/xls.rst b/docs/backend/xls.rst
new file mode 100644
index 0000000000..5b6b2ae4ae
--- /dev/null
+++ b/docs/backend/xls.rst
@@ -0,0 +1,47 @@
+============
+XLS
+============
+
+The XLS backend converts hls4ml models into SystemVerilog via `Google XLS <https://google.github.io/xls/>`_; the generated SystemVerilog can then be packaged as an IP via **Vivado**.
+
+To enable XLS, install hls4ml with the ``xls`` extra::
+
+
+    pip install hls4ml[xls]
+
+hls4ml uses the `pyxls <https://calad0i.github.io/pyxls/>`_ package to access the XLS API.
+pyxls comes with batteries included, and a separate XLS installation is not required.
+
+Workflow
+=========================
+
+The XLS backend performs the following transformations::
+
+
+    hls4ml representation -> DSLX (<ProjectName>.x) -> XLS IR (<ProjectName>.ir) -> Optimized XLS IR (<ProjectName>.opt.ir) -> SystemVerilog (<ProjectName>.sv) -> IP
+
+`DSLX <https://google.github.io/xls/dslx_reference/>`_ is a domain-specific language (DSL) with Rust-like syntax.
+The DSLX project generated by hls4ml in ``<OutputDir>/firmware`` contains the main module ``<ProjectName>.x``, layer modules ``layer_<LayerName>.x``, and helper modules in ``ap_types/`` and ``nnet_utils/``.
+You may work with this project either through hls4ml or using your own XLS toolchain.
+
+hls4ml calls the XLS compiler to convert DSLX into the `XLS IR <https://google.github.io/xls/ir_overview/>`_ format (``<ProjectName>.ir``) and then runs the IR optimization passes (``<ProjectName>.opt.ir``).
+
+Then, hls4ml uses `XLS Codegen <https://google.github.io/xls/codegen_options/>`_ to generate SystemVerilog (``<ProjectName>.sv``) from the optimized IR, and runs ``<OutputDir>/build_prj.tcl`` with **Vivado** to generate the IP.
+
+You can override the default codegen options::
+
+
+    config = hls4ml.utils.config_from_keras_model(model)
+    # This sets hls_model.config['XLSCodegenFlags']
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, hls_config=config, backend='XLS',
+        xls_codegen_flags={'delay_model': 'asap7', 'generator': 'pipeline', 'use_system_verilog': False}
+    )
+
+I/O Types and Strategy
+=========================
+
+Currently, only ``io_parallel`` is supported. ``Strategy`` is ignored.
+All operations are fully unrolled.
+
+The XLS backend supports only the signed ``FixedPoint`` type (similar to ``ap_fixed``).
diff --git a/docs/index.rst b/docs/index.rst
index f170ca6858..2a4f60b733 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,7 @@
     backend/catapult
     backend/quartus
     backend/sr
+    backend/xls
 
 .. toctree::
     :hidden:
diff --git a/docs/intro/setup.rst b/docs/intro/setup.rst
index 4e3d192fcf..fd36c959b9 100644
--- a/docs/intro/setup.rst
+++ b/docs/intro/setup.rst
@@ -203,6 +203,9 @@ Optional Dependencies
    # For symbolic regression
    pip install hls4ml[sr]
 
+   # For XLS backend
+   pip install hls4ml[xls]
+
    # For documentation building (developers)
    pip install hls4ml[doc]
 
diff --git a/docs/intro/status.rst b/docs/intro/status.rst
index 7526c3bec4..3245b73540 100644
--- a/docs/intro/status.rst
+++ b/docs/intro/status.rst
@@ -47,6 +47,7 @@ HLS backends:
 * Vitis HLS
 * Catapult HLS
 * oneAPI (experimental)
+* XLS (experimental)
 
 A summary of the on-going status of the ``hls4ml`` tool is in the table below.
 
@@ -79,6 +80,8 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below.
 +-----------------------+-----+-----+--------------+--------+--------+-----+
 | oneAPI (experimental) | ✅  | ✅  | ✅           | ❌     | ✅     | ❌  |
 +-----------------------+-----+-----+--------------+--------+--------+-----+
+| XLS (experimental)    | ✅  | ✅  | ❌           | ❌     | ❌     | ❌  |
++-----------------------+-----+-----+--------------+--------+--------+-----+
 
 Other feature notes:
 
diff --git a/docs/ir/attributes.rst b/docs/ir/attributes.rst
index dfbec51b1c..8fecef73aa 100644
--- a/docs/ir/attributes.rst
+++ b/docs/ir/attributes.rst
@@ -87,19 +87,19 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 ParametrizedActivation
 ======================
@@ -143,19 +143,19 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 PReLU
 =====
@@ -203,19 +203,19 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Softmax
 =======
@@ -251,43 +251,59 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* n_outer: int (Default: 1)
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* n_inner: int (Default: 1)
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * implementation: list [latency,stable,argmax,legacy] (Default: stable)
 
   * Choice of implementation of softmax function. "latency" provides good latency at the expense of extra resources. performs well on small number of classes. "stable" may require extra clock cycles but has better accuracy. "legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. It is superseded by the "latency" implementation for most applications. "argmax" is a special implementation that can be used if only the output with the highest probability is important. Using this implementation will save resources and clock cycles.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * skip: bool (Default: False)
 
   * If enabled, skips the softmax node and returns the raw outputs.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * exp_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * inv_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* inv_inp_t: NamedType (Default: fixed<18,8,RND,SAT,0>)
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* accum_t: NamedType (Default: fixed<18,8,RND,SAT,0>)
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 TernaryTanh
 ===========
@@ -323,19 +339,19 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 HardActivation
 ==============
@@ -383,19 +399,19 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_size: int (Default: 1024)
 
   * The size of the lookup table used to approximate the function.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
 
   * The datatype (precision) used for the values of the lookup table.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Reshape
 =======
@@ -471,13 +487,17 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* strategy: list [latency,resource] (Default: latency)
+
+  * Available in: Libero
 
 Conv
 ====
@@ -509,13 +529,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Conv1D
 ======
@@ -577,13 +597,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -669,13 +689,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -761,13 +781,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -847,23 +867,23 @@ Backend-specific attributes
 ---------------------------
 * depthwise_accum_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * pointwise_accum_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * depthwise_result_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * depthwise_reuse_factor: int (Default: 1)
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * pointwise_reuse_factor: int (Default: 1)
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer)
 
@@ -965,13 +985,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -1063,23 +1083,23 @@ Backend-specific attributes
 ---------------------------
 * depthwise_accum_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * pointwise_accum_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * depthwise_result_t: NamedType
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * depthwise_reuse_factor: int (Default: 1)
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * pointwise_reuse_factor: int (Default: 1)
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer)
 
@@ -1205,13 +1225,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -1277,7 +1297,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Pooling1D
 =========
@@ -1327,13 +1347,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer)
 
@@ -1401,13 +1421,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer)
 
@@ -1451,13 +1471,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 GlobalPooling2D
 ===============
@@ -1497,13 +1517,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 ZeroPadding1D
 =============
@@ -1571,6 +1591,82 @@ Type attributes
 
 * pad_right: int
 
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Cropping1D
+==========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* in_width: int
+
+* out_width: int
+
+* n_chan: int
+
+* crop_left: int
+
+* crop_right: int
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Cropping2D
+==========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* in_height: int
+
+* in_width: int
+
+* out_height: int
+
+* out_width: int
+
+* n_chan: int
+
+* crop_top: int
+
+* crop_bottom: int
+
+* crop_left: int
+
+* crop_right: int
+
 Configurable attributes
 -----------------------
 * trace: int (Default: False)
@@ -1611,7 +1707,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 MatMul
 ======
@@ -1643,13 +1739,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Dot
 ===
@@ -1697,13 +1793,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Concatenate
 ===========
@@ -1751,7 +1847,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Resize
 ======
@@ -1859,7 +1955,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 SimpleRNN
 =========
@@ -1889,6 +1985,8 @@ Type attributes
 
 * return_state: bool (Default: False)
 
+* pass_initial_states: bool (Default: False)
+
 Weight attributes
 -----------------
 * weight: WeightVariable
@@ -1921,13 +2019,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * recurrent_reuse_factor: int (Default: 1)
 
@@ -1985,6 +2083,10 @@ Type attributes
 
 * return_state: bool (Default: False)
 
+* pass_initial_states: bool (Default: False)
+
+* direction: list [forward,backward] (Default: forward)
+
 * time_major: bool (Default: False)
 
 Weight attributes
@@ -2007,8 +2109,6 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-* direction: list [forward,backward] (Default: forward)
-
 * weight_t: NamedType
 
 * bias_t: NamedType
@@ -2023,13 +2123,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * recurrent_reuse_factor: int (Default: 1)
 
@@ -2087,8 +2187,14 @@ Type attributes
 
 * return_state: bool (Default: False)
 
+* pass_initial_states: bool (Default: False)
+
+* direction: list [forward,backward] (Default: forward)
+
 * time_major: bool (Default: False)
 
+* apply_reset_gate: list [before,after] (Default: after)
+
 Weight attributes
 -----------------
 * weight: WeightVariable
@@ -2109,10 +2215,6 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-* direction: list [forward,backward] (Default: forward)
-
-* apply_reset_gate: list [before,after] (Default: after)
-
 * weight_t: NamedType
 
 * bias_t: NamedType
@@ -2127,13 +2229,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * recurrent_reuse_factor: int (Default: 1)
 
@@ -2159,6 +2261,192 @@ Backend-specific attributes
 
   * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI
 
+Bidirectional
+=============
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* forward_weight_t: NamedType
+
+* forward_bias_t: NamedType
+
+* forward_recurrent_weight_t: NamedType
+
+* forward_recurrent_bias_t: NamedType
+
+* backward_weight_t: NamedType
+
+* backward_bias_t: NamedType
+
+* backward_recurrent_weight_t: NamedType
+
+* backward_recurrent_bias_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* n_out: int
+
+* return_sequences: bool (Default: False)
+
+* return_state: bool (Default: False)
+
+* pass_initial_states: bool (Default: False)
+
+* time_major: bool (Default: False)
+
+* forward_activation: str
+
+* forward_recurrent_activation: str
+
+* backward_activation: str
+
+* backward_recurrent_activation: str
+
+Weight attributes
+-----------------
+* forward_weight: WeightVariable
+
+* forward_bias: WeightVariable
+
+* forward_recurrent_weight: WeightVariable
+
+* forward_recurrent_bias: WeightVariable
+
+* backward_weight: WeightVariable
+
+* backward_bias: WeightVariable
+
+* backward_recurrent_weight: WeightVariable
+
+* backward_recurrent_bias: WeightVariable
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* forward_weight_t: NamedType
+
+* forward_bias_t: NamedType
+
+* forward_recurrent_weight_t: NamedType
+
+* forward_recurrent_bias_t: NamedType
+
+* backward_weight_t: NamedType
+
+* backward_bias_t: NamedType
+
+* backward_recurrent_weight_t: NamedType
+
+* backward_recurrent_bias_t: NamedType
+
+Backend-specific attributes
+---------------------------
+* accum_t: NamedType
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* forward_reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* backward_reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* forward_recurrent_reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* backward_recurrent_reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* static: bool (Default: True)
+
+  * If set to True, will reuse the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* table_size: int (Default: 1024)
+
+  * The size of the lookup table used to approximate the function.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
+
+  * The datatype (precision) used for the values of the lookup table.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+TimeDistributed
+===============
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* wrapped_layer: None
+
+* n_time_steps: int
+
+* output_shape: list
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* time_step_loop_parallelism: list [Off,Unroll,Pipeline] (Default: Off)
+
+  * Controls the amount and type of parallelism in the loop over time steps. If set to "off", no parallelism will be used. If set to "unroll", the loop will be unrolled. This may result in excessive resource use and cannot be used in "io_stream" mode. If set to "pipeline", the loop will be pipelined.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
 GarNet
 ======
 Base attributes
@@ -2189,7 +2477,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 GarNetStack
 ===========
@@ -2237,7 +2525,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 Quant
 =====
@@ -2275,7 +2563,31 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+BipolarQuant
+============
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
 
 ApplyAlpha
 ==========
@@ -2329,7 +2641,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 BatchNormOnnx
 =============
@@ -2361,7 +2673,7 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 LayerGroup
 ==========
@@ -2427,6 +2739,174 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
+LayerNormalization
+==================
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* scale_t: NamedType
+
+* bias_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* n_in: int
+
+* seq_len: int
+
+* axis: int (Default: 2)
+
+* epsilon_power_of_10: int (Default: 3)
+
+Weight attributes
+-----------------
+* scale: WeightVariable
+
+* bias: WeightVariable
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* scale_t: NamedType
+
+* bias_t: NamedType
+
+Backend-specific attributes
+---------------------------
+* accum_t: NamedType
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
+
+* table_range_power2: int (Default: 0)
+
+  * The negative power of 2 that represents the range of the lookup table, e.g. a value of 1 would represent a range of 0.5.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* table_size: int (Default: 4096)
+
+  * The size of the lookup table used to approximate the function.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* table_t: NamedType (Default: ufixed<8,5,RND_CONV,SAT,0>)
+
+  * The datatype (precision) used for the values of the lookup table.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+* accum_t: NamedType (Default: fixed<14,4,RND_CONV,SAT,0>)
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis
+
+EinsumDense
+===========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* weight_t: NamedType
+
+* bias_t: NamedType
+
+* accum_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* equation: str
+
+* inp_shape: tuple
+
+* out_shape: tuple
+
+Weight attributes
+-----------------
+* weight: WeightVariable
+
+* bias: WeightVariable
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* weight_t: NamedType
+
+* bias_t: NamedType
+
+* accum_t: NamedType
+
+Einsum
+======
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* accum_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* equation: str
+
+* inp0_shape: tuple
+
+* inp1_shape: tuple
+
+* out_shape: tuple
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* accum_t: NamedType
+
 BiasAdd
 =======
 Base attributes
@@ -2473,10 +2953,10 @@ Backend-specific attributes
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
-FixedPointQuantizer
-===================
+DACombinational
+===============
 Base attributes
 ---------------
 * result_t: NamedType
@@ -2499,8 +2979,8 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-UnaryLUT
-========
+FixedPointQuantizer
+===================
 Base attributes
 ---------------
 * result_t: NamedType
@@ -2523,20 +3003,28 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-Repack
-======
+UnaryLUT
+========
 Base attributes
 ---------------
 * result_t: NamedType
 
   * The datatype (precision) of the output tensor.
 
+* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
+
 Type attributes
 ---------------
 * index: int
 
   * Internal node counter used for bookkeeping and variable/tensor naming.
 
+* n_in: int
+
+Weight attributes
+-----------------
+* table: WeightVariable
+
 Configurable attributes
 -----------------------
 * trace: int (Default: False)
@@ -2547,8 +3035,10 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-Clone
-=====
+* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
+
+Repack
+======
 Base attributes
 ---------------
 * result_t: NamedType
@@ -2571,26 +3061,20 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-BatchNormalizationQuantizedTanh
-===============================
+Clone
+=====
 Base attributes
 ---------------
 * result_t: NamedType
 
   * The datatype (precision) of the output tensor.
 
-* accum_t: NamedType
-
 Type attributes
 ---------------
 * index: int
 
   * Internal node counter used for bookkeeping and variable/tensor naming.
 
-* n_in: int
-
-* n_filt: int (Default: 0)
-
 Configurable attributes
 -----------------------
 * trace: int (Default: False)
@@ -2601,10 +3085,6 @@ Configurable attributes
 
   * The datatype (precision) of the output tensor.
 
-* accum_t: NamedType
-
-* reuse_factor: int (Default: 1)
-
 PointwiseConv1D
 ===============
 Base attributes
@@ -2665,13 +3145,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -2757,13 +3237,13 @@ Backend-specific attributes
 
   * The datatype (precision) used to store intermediate results of the computation within the layer.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * reuse_factor: int (Default: 1)
 
   * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
 
-  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI, Libero, XLS
 
 * parallelization_factor: int (Default: 1)
 
@@ -2800,3 +3280,37 @@ Configurable attributes
 * result_t: NamedType
 
   * The datatype (precision) of the output tensor.
+
+BatchNormalizationQuantizedTanh
+===============================
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* accum_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* n_in: int
+
+* n_filt: int (Default: 0)
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* accum_t: NamedType
+
+* reuse_factor: int (Default: 1)
diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py
index 07a089cdf8..0ae8e3d20e 100644
--- a/hls4ml/backends/__init__.py
+++ b/hls4ml/backends/__init__.py
@@ -12,6 +12,7 @@
 from hls4ml.backends.catapult.catapult_backend import CatapultBackend  # isort: skip
 
 from hls4ml.backends.vitis.vitis_backend import VitisBackend  # isort: skip
+from hls4ml.backends.xls.xls_backend import XLSBackend
 
 
 def _register_builtin_backends():
@@ -23,6 +24,7 @@ def _register_builtin_backends():
     register_backend('SymbolicExpression', SymbolicExpressionBackend)
     register_backend('oneAPI', OneAPIBackend)
     register_backend('Libero', LiberoBackend)
+    register_backend('XLS', XLSBackend)
 
 
 _register_builtin_backends()
diff --git a/hls4ml/backends/xls/__init__.py b/hls4ml/backends/xls/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/xls/passes/build_attr.py b/hls4ml/backends/xls/passes/build_attr.py
new file mode 100644
index 0000000000..f24309b52e
--- /dev/null
+++ b/hls4ml/backends/xls/passes/build_attr.py
@@ -0,0 +1,430 @@
+# Typing imports
+from __future__ import annotations  # makes all annotations into strings
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Literal
+
+from hls4ml.backends.xls.xls_types import (
+    XLSArray,
+    XLSArrayType,
+    XLSConst,
+    XLSFixedPoint,
+    XLSFixedPointType,
+    XLSFunctionCall,
+    XLSQualifiedName,
+    XLSTensorVariable,
+    float_to_significand,
+)
+from hls4ml.model.types import PrecisionType
+
+if TYPE_CHECKING:
+    from hls4ml.model.graph import ModelGraph
+    from hls4ml.model.layers import Layer
+
+from functools import wraps
+
+import numpy as np
+
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class XLSAttrBuilder:
+    """A helper class that sets XLS specific attributes for the layers of the original ModelGraph.
+    In doing so, we simplify the process of creating new optimization passes
+    and constructing the writer class.
+    The new attributes must be accessed with .get_attr(...)
+
+    New attributes:
+        - xls_module_name (str):                  DSLX module name (e.g. layer_4_softmax) used for the layer
+        - xls_input_variables(list[XLSTensorVariable]):  XLS representation of input shape and precision
+        - xls_output_variables(list[XLSTensorVariable]): XLS representation of output shape and precision
+        - xls_weights(XLSArray):                  Weights converted to XLS array
+        - xls_bias(XLSArray):                     Bias converted to XLS array
+        - xls_extra_func_params(list[XLSConst]):  Extra parameters for function call, e.g. stride, padding, pool_op, etc.
+        - xls_extra_func_args(list[XLSConst]):    Extra arguments for function call, e.g. activation parameter.
+        - xls_func_call(XLSFunctionCall):         Function used for transformation, e.g. softmax_stable or conv2d
+
+    Args:
+        - node (Layer): A layer of the model graph
+    """
+
+    def __init__(self, node) -> None:
+        self.node = node
+
+    @staticmethod
+    def attach_to_node(attr_name: str | None = None):
+        """Decorator factory that stores a method's return value on the node via ``set_attr``.
+        The attribute is named ``attr_name`` or, if that is omitted, after the decorated method.
+        The wrapped method asserts the attribute is not set yet and returns ``self``, so calls can be chained.
+        """
+
+        def decorator(fn) -> Callable:
+            name = attr_name or fn.__name__
+
+            @wraps(fn)
+            def wrapped(self, *args, **kwargs):
+                val = fn(self, *args, **kwargs)
+                assert name not in self.node.attributes, f"Duplicate attribute: '{name}'"
+                self.node.set_attr(name, val)
+                return self
+
+            return wrapped
+
+        return decorator
+
+    @staticmethod
+    def _xls_const_array(name: str, data: np.ndarray, precision: PrecisionType) -> XLSConst:
+        # We allow unsigned types (e.g. XnorPrecisionType or uint<1>) for weights and biases.
+        # They will be converted to signed FixedPoint in DSLX.
+        allow_unsigned = True
+        xls_precision = XLSFixedPointType.from_precision(precision, allow_unsigned)
+        xls_raw_array = XLSArray(
+            array_type=XLSArrayType(element_type=xls_precision.significand_type, shape=data.shape),
+            array=float_to_significand(data, precision, allow_unsigned),
+        )
+        xls_fixed_point_array = XLSFunctionCall(
+            name=f'fixed_point_util::make_fixed_points_{len(data.shape)}d',
+            params=[xls_precision.binary_exponent],
+            args=[xls_raw_array],
+        )
+        return XLSConst(
+            name=name, value=xls_fixed_point_array, type=XLSArrayType(element_type=xls_precision, shape=data.shape)
+        )
+
+    @attach_to_node()
+    def xls_weights(self) -> XLSConst | None:
+        class_name = self.node.class_name
+        if class_name == 'ApplyAlpha':
+            class_name = 'BatchNormalization'
+
+        precision = None
+        xls_weights_name = None
+        if class_name == 'PReLU':
+            weights = self.node.weights.get('param_data')
+            xls_weights_name = 'PRELU_PARAM'
+            precision = self.node.get_attr('param_t').precision
+        elif class_name == 'BatchNormalization':
+            weights = self.node.weights.get('scale', None)
+        else:
+            weights = self.node.weights.get('weight', None)
+        if weights is None:
+            return None
+
+        xls_weights_name = xls_weights_name or f'WEIGHTS_{weights.name}'.upper()
+        precision: PrecisionType = precision or weights.type.precision
+
+        input_var = self.node.get_input_variable()
+        output_var = self.node.get_output_variable()
+
+        match class_name:
+            case 'BatchNormalization':
+                # NB: we need flattening because sometimes the weights can be e.g.
+                # (1,1,1,n_filt) instead of (n_filt,)
+                # We'll throw an error if there are several dimensions larger than 1.
+                data = np.asarray(weights.data).flatten()
+                n_filt = self.node.get_attr('n_filt')
+                if n_filt == -1:
+                    n_filt = input_var.shape[-1]
+                expected_shape = (n_filt,)
+            case 'Conv1D':
+                data = np.asarray(weights.data)
+                expected_shape = tuple(self.node.get_attr(x) for x in ['filt_width', 'n_chan', 'n_filt'])
+            case 'DepthwiseConv1D':
+                data = np.asarray(weights.data)
+                expected_shape = tuple(self.node.get_attr(x) for x in ['filt_width', 'n_chan', 'depth_multiplier'])
+            case 'Conv2D':
+                data = np.asarray(weights.data)
+                expected_shape = tuple(self.node.get_attr(x) for x in ['filt_height', 'filt_width', 'n_chan', 'n_filt'])
+            case 'DepthwiseConv2D':
+                data = np.asarray(weights.data)
+                expected_shape = tuple(
+                    self.node.get_attr(x) for x in ['filt_height', 'filt_width', 'n_chan', 'depth_multiplier']
+                )
+            case 'Dense':
+                # Transpose the weights so that we can call dot_prod(x, w[i]) in dense.x
+                data = np.asarray(weights.data).T
+                expected_shape = (output_var.shape[0], input_var.shape[0])
+            case 'PReLU':
+                data = weights
+                expected_shape = (input_var.shape[0],)
+            case _:
+                raise ValueError(f'Unsupported weights for layer {self.node.class_name}')
+
+        assert data.shape == expected_shape, f'Weights shape mismatch: expected {expected_shape}, got {data.shape}'
+
+        return XLSAttrBuilder._xls_const_array(name=xls_weights_name, data=data, precision=precision)
+
+    @attach_to_node()
+    def xls_bias(self) -> XLSConst | None:
+        bias = self.node.weights.get('bias', None)
+        if bias is None:  # explicit None test (as in xls_weights); do not rely on the object's truthiness
+            return None
+
+        return XLSAttrBuilder._xls_const_array(
+            name=f'BIAS_{bias.name}'.upper(), data=bias.data, precision=bias.type.precision
+        )
+
+    @attach_to_node()
+    def xls_module_name(self) -> str:
+        name = ''.join(c for c in self.node.name if c.isalnum() or c == '_').lower()
+        return f'layer_{self.node.index}_{name}'
+
+    @attach_to_node()
+    def xls_output_variables(self) -> list[XLSTensorVariable]:
+        return [
+            XLSTensorVariable.from_tensor_variable(self.node.get_output_variable(name))
+            for name in self.node.outputs  # one XLS variable per declared output, in declaration order
+        ]
+
+    @attach_to_node()
+    def xls_input_variables(self) -> list[XLSTensorVariable]:
+        if self.node.class_name == 'Input':
+            assert self.node.get_input_variable() is None, f'Input layer {self.node.name} should not have input variable'
+            out_var = self.node.get_output_variable()
+            return [XLSTensorVariable.from_tensor_variable(out_var, name=f'input_{out_var.name}')]
+        else:
+            return [
+                XLSTensorVariable.from_tensor_variable(var=self.node.get_input_variable(name)) for name in self.node.inputs
+            ]
+
+    @attach_to_node()
+    def xls_min_input_rank(self) -> int:
+        """Minimally required rank of the input tensor.
+        Input tensor can have a higher rank if it consists of multiple batches.
+        NB: in the case of multiple input variables, the rank is determined by the first input variable.
+        """
+        name = self.node.class_name
+        if name.endswith('2D'):
+            return 3
+        elif name.endswith('1D'):
+            return 2
+        elif name in ('Reshape', 'Concatenate'):
+            return len(self.node.get_input_variable().shape)
+        elif name == 'Transpose':
+            return len(self.node.get_attr('perm'))
+        else:
+            return 1
+
+    @attach_to_node()
+    def xls_extra_func_params(self) -> list[XLSConst]:
+        layer = self.node
+        class_name = layer.class_name
+        if class_name == 'Concatenate':
+            rank = len(layer.get_input_variable().shape)
+            if rank == 1:
+                return []
+            axis = layer.get_attr('axis')
+            if axis > 0:
+                # Convert axis to a 0-based index.
+                # This is the same adjustment as in hls4ml.model.layers.Concatenate.initialize()
+                # TODO: should it be done earlier, when converting from frontend?
+                axis -= 1
+            if axis < 0:  # any negative axis counts from the end (previously only -1 was normalized)
+                axis += rank
+            return [XLSConst(name='AXIS', value=axis, type='u32')]
+        elif class_name in ('Conv1D', 'DepthwiseConv1D'):
+            return [
+                XLSConst(name='STRIDE', value=layer.get_attr('stride_width'), type='u32'),
+                XLSConst(name='PAD_LEFT', value=layer.get_attr('pad_left'), type='u32'),
+                XLSConst(name='PAD_RIGHT', value=layer.get_attr('pad_right'), type='u32'),
+                XLSConst(name='DATA_FORMAT', value=f'data_format::DataFormat::{layer.get_attr("data_format").upper()}'),
+            ]
+        elif class_name in ('Conv2D', 'DepthwiseConv2D'):
+            return [
+                XLSConst(name='STRIDE_HEIGHT', value=layer.get_attr('stride_height'), type='u32'),
+                XLSConst(name='STRIDE_WIDTH', value=layer.get_attr('stride_width'), type='u32'),
+                XLSConst(name='PAD_TOP', value=layer.get_attr('pad_top'), type='u32'),
+                XLSConst(name='PAD_BOTTOM', value=layer.get_attr('pad_bottom'), type='u32'),
+                XLSConst(name='PAD_LEFT', value=layer.get_attr('pad_left'), type='u32'),
+                XLSConst(name='PAD_RIGHT', value=layer.get_attr('pad_right'), type='u32'),
+                XLSConst(name='DATA_FORMAT', value=f'data_format::DataFormat::{layer.get_attr("data_format").upper()}'),
+            ]
+        elif 'Pooling' in class_name:
+            pool_op = f'pooling::PoolingOperation::{layer.get_attr("pool_op").upper()}'
+            data_format = f'data_format::DataFormat::{layer.get_attr("data_format").upper()}'
+            if class_name.startswith('GlobalPooling'):
+                return [XLSConst(name='POOL_OP', value=pool_op), XLSConst(name='DATA_FORMAT', value=data_format)]
+            elif class_name.endswith('Pooling1D'):
+                count_pad = str(layer.get_attr('count_pad')).lower()
+                return [
+                    XLSConst(name='POOL_OP', value=pool_op),
+                    XLSConst(name='POOL_SIZE', value=layer.get_attr('pool_width'), type='u32'),
+                    XLSConst(name='STRIDE', value=layer.get_attr('stride_width'), type='u32'),
+                    XLSConst(name='PAD_LEFT', value=layer.get_attr('pad_left'), type='u32'),
+                    XLSConst(name='PAD_RIGHT', value=layer.get_attr('pad_right'), type='u32'),
+                    XLSConst(name='COUNT_PAD', value=count_pad, type='bool'),
+                    XLSConst(name='DATA_FORMAT', value=data_format),
+                ]
+            elif class_name.endswith('Pooling2D'):
+                count_pad = str(layer.get_attr('count_pad')).lower()
+                return [
+                    XLSConst(name='POOL_OP', value=pool_op),
+                    XLSConst(name='POOL_HEIGHT', value=layer.get_attr('pool_height'), type='u32'),
+                    XLSConst(name='POOL_WIDTH', value=layer.get_attr('pool_width'), type='u32'),
+                    XLSConst(name='STRIDE_HEIGHT', value=layer.get_attr('stride_height'), type='u32'),
+                    XLSConst(name='STRIDE_WIDTH', value=layer.get_attr('stride_width'), type='u32'),
+                    XLSConst(name='PAD_TOP', value=layer.get_attr('pad_top'), type='u32'),
+                    XLSConst(name='PAD_BOTTOM', value=layer.get_attr('pad_bottom'), type='u32'),
+                    XLSConst(name='PAD_LEFT', value=layer.get_attr('pad_left'), type='u32'),
+                    XLSConst(name='PAD_RIGHT', value=layer.get_attr('pad_right'), type='u32'),
+                    XLSConst(name='COUNT_PAD', value=count_pad, type='bool'),
+                    XLSConst(name='DATA_FORMAT', value=data_format),
+                ]
+            else:
+                raise ValueError(f'Unsupported pooling layer {class_name}')
+        elif class_name == 'Reshape':
+            out_vars = layer.get_attr('xls_output_variables')
+            assert len(out_vars) == 1, f'Reshape layer should have exactly one output variable, got {len(out_vars)}'
+            return list(out_vars[0].shape)
+        elif class_name == 'Transpose':
+            return [XLSConst(name=f'PERM_{i}', value=perm, type='u32') for i, perm in enumerate(layer.get_attr('perm'))]
+        else:
+            return []
+
+    @attach_to_node()
+    def xls_extra_func_args(self) -> list[XLSConst]:
+        layer = self.node
+        match layer.class_name:
+            case 'HardActivation':
+                return [
+                    XLSConst(
+                        name=arg_name.upper(),
+                        value=XLSFixedPoint.from_float(
+                            layer.get_attr(arg_name),
+                            precision=layer.get_attr(f'{arg_name}_t').precision,
+                            allow_unsigned=True,
+                        ),
+                    )
+                    for arg_name in ['slope', 'shift']
+                ]
+            case 'ParametrizedActivation':
+                precision = layer.get_attr('param_t').precision
+                value = layer.get_attr('activ_param')
+                if layer.get_attr('activation').lower() in ('leakyrelu', 'leaky_relu', 'thresholdedrelu'):
+                    return [
+                        XLSConst(
+                            name='ACTIVATION_PARAM', value=XLSFixedPoint.from_float(value, precision, allow_unsigned=True)
+                        )
+                    ]
+            case _:
+                pass
+        return []
+
+    @staticmethod
+    def func_name(layer: Layer) -> XLSQualifiedName:
+        match layer.class_name:
+            case 'Input':
+                # Identity transformation except for OverflowMode::SAT_SYM case.
+                return XLSQualifiedName(name='resize_1d', module_name='fixed_point_util')
+            case 'ApplyAlpha':
+                return XLSQualifiedName(name='normalize', module_name='batchnorm')
+            case 'BatchNormalization':
+                return XLSQualifiedName(name='normalize', module_name='batchnorm')
+            case 'Dense':
+                return XLSQualifiedName(name='dense', module_name='dense')
+            case 'Conv1D':
+                return XLSQualifiedName(name='conv1d_latency', module_name='conv1d')
+            case 'DepthwiseConv1D':
+                return XLSQualifiedName(name='depthwise_conv_1d', module_name='depthwise_conv')
+            case 'Conv2D':
+                return XLSQualifiedName(name='conv2d_latency', module_name='conv2d')
+            case 'DepthwiseConv2D':
+                return XLSQualifiedName(name='depthwise_conv_2d', module_name='depthwise_conv')
+            case 'Pooling1D':
+                return XLSQualifiedName(name='pooling_1d', module_name='pooling')
+            case 'Pooling2D':
+                return XLSQualifiedName(name='pooling_2d', module_name='pooling')
+            case 'GlobalPooling1D':
+                return XLSQualifiedName(name='global_pooling_1d', module_name='pooling')
+            case 'GlobalPooling2D':
+                return XLSQualifiedName(name='global_pooling_2d', module_name='pooling')
+            case 'Merge':
+                op = layer.get_attr('op').lower()
+                return XLSQualifiedName(name=op, module_name='merge')
+            case 'Concatenate':
+                rank = len(layer.get_input_variable().shape)
+                return XLSQualifiedName(name=f'concatenate{rank}d', module_name='merge')
+            case 'Dot':
+                return XLSQualifiedName(name='dot', module_name='merge')
+            case 'Activation':
+                return XLSQualifiedName(name=layer.get_attr('activation').lower(), module_name='activations')
+            case 'HardActivation':
+                return XLSQualifiedName(name=layer.get_attr('activation').lower(), module_name='activations')
+            case 'ParametrizedActivation':
+                return XLSQualifiedName(name=layer._get_act_function_name(), module_name='activations')
+            case 'PReLU':
+                return XLSQualifiedName(name='prelu', module_name='activations')
+            case 'Reshape':
+                in_shape = layer.get_input_variable().shape
+                out_shape = layer.get_output_variable().shape
+                name = f'reshape_{len(in_shape)}d_to_{len(out_shape)}d'
+                return XLSQualifiedName(name=name, module_name='reshape')
+            case 'Softmax':
+                implementation = layer.attributes.get('implementation', 'stable')
+                match implementation:
+                    case 'stable':
+                        name = 'softmax_stable'
+                    case 'latency':
+                        name = 'softmax_latency'
+                    case 'argmax':
+                        name = 'argmax'
+                    case _:
+                        # TODO: support implementation == 'legacy'
+                        raise ValueError(f'Unknown softmax implementation {implementation}')
+                return XLSQualifiedName(name=name, module_name='activations')
+            case 'Transpose':
+                rank = len(layer.get_input_variable().shape)
+                return XLSQualifiedName(name=f'transpose_{rank}d', module_name='transpose')
+            case 'TernaryTanh':
+                return XLSQualifiedName(name='ternary_tanh', module_name='activations')
+            case _:
+                raise ValueError(f'Unknown layer type: {layer.class_name}')
+
+    @attach_to_node()
+    def xls_func_call(self) -> XLSFunctionCall:
+        in_vars = self.node.get_attr('xls_input_variables')
+        out_vars = self.node.get_attr('xls_output_variables')
+        name = self.func_name(self.node)
+        params = [
+            x.name
+            for out_var in out_vars
+            for x in (
+                out_var.num_bits,
+                out_var.binary_exponent,
+                out_var.rounding_mode,
+                out_var.overflow_mode,
+            )
+        ] + [x.name for x in self.node.get_attr('xls_extra_func_params')]
+        args = [f'x_{i}' for i in range(len(in_vars))]
+        args += [self.node.get_attr(x).name for x in ('xls_weights', 'xls_bias') if self.node.get_attr(x) is not None]
+        args += [x.lookup_table.name for x in self.node.get_attr('lookup_tables', [])]
+        args += [x.name for x in self.node.get_attr('xls_extra_func_args')]
+        return XLSFunctionCall(name=name, params=params, args=args)
+
+
+class BuildAttr(OptimizerPass):
+    """Builds the XLS-specific attributes for all layers."""
+
+    def match(self, node: Layer) -> bool:
+        return True
+
+    def transform(self, model: ModelGraph, node: Layer) -> Literal[False]:
+        try:
+            # uses the builder to add all the attributes
+            (
+                XLSAttrBuilder(node)
+                .xls_module_name()
+                .xls_min_input_rank()
+                .xls_input_variables()
+                .xls_output_variables()
+                .xls_weights()
+                .xls_bias()
+                .xls_extra_func_params()
+                .xls_extra_func_args()
+                .xls_func_call()
+            )
+        except Exception as e:
+            raise ValueError(
+                f'Failed to build XLS attributes for layer (name={node.name}, class_name={node.class_name}): {e}'
+            ) from e
+        return False
diff --git a/hls4ml/backends/xls/passes/build_tables.py b/hls4ml/backends/xls/passes/build_tables.py
new file mode 100644
index 0000000000..851542b704
--- /dev/null
+++ b/hls4ml/backends/xls/passes/build_tables.py
@@ -0,0 +1,259 @@
+# Typing imports
+from __future__ import annotations  # makes all annotations into strings
+
+import warnings
+from collections.abc import Callable
+from copy import copy
+from enum import Enum
+from typing import TYPE_CHECKING, Literal
+
+from hls4ml.backends.xls.xls_types import XLSFixedPoint, XLSFixedPointType, XLSLookupTable, float_to_significand
+from hls4ml.model.types import FixedPrecisionType
+
+if TYPE_CHECKING:
+    from hls4ml.model.graph import ModelGraph
+    from hls4ml.model.layers import Layer
+
+import math
+
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class LookupTableRange(Enum):
+    FULL = 1
+    NON_NEGATIVE = 2
+    NEGATIVE = 3
+
+
+def build_table(
+    name: str,
+    func: Callable[[float], float],
+    table_size: int,
+    input_precision: FixedPrecisionType,
+    output_precision: FixedPrecisionType,
+    table_range: LookupTableRange,
+) -> XLSLookupTable:
+    """Tabulate ``func`` over ``table_range`` with at most ``table_size`` entries, trimming saturated edges.
+    'raw' below means significand values: raw_x == x.significand == int(x * 2**precision.fractional)."""
+
+    raw_to_float = 2 ** (-input_precision.fractional)
+
+    def raw_func(raw_x: int) -> int:
+        return float_to_significand(func(raw_x * raw_to_float), output_precision)
+
+    raw_minus_inf = XLSFixedPoint.min_value(XLSFixedPointType.from_precision(input_precision)).significand.value
+    raw_plus_inf = XLSFixedPoint.max_value(XLSFixedPointType.from_precision(input_precision)).significand.value
+    match table_range:
+        # x = -inf..+inf
+        case LookupTableRange.FULL:
+            raw_original_x_min = raw_minus_inf
+            raw_original_x_max = raw_plus_inf
+        # x = 0..+inf
+        case LookupTableRange.NON_NEGATIVE:
+            raw_original_x_min = 0
+            raw_original_x_max = raw_plus_inf
+        # x = -inf..0
+        case LookupTableRange.NEGATIVE:
+            raw_original_x_min = raw_minus_inf
+            raw_original_x_max = -1
+
+    raw_x_min = raw_original_x_min
+    raw_x_max = raw_original_x_max
+
+    # Build input range for lookup table.
+    # If the function saturates at the table edges,
+    # we adjust the range to account for that.
+    recompute_range = True
+    while recompute_range:
+        raw_span = raw_x_max - raw_x_min
+        # A zero-width range (function saturated everywhere) has no defined log2: fall back to a unit step.
+        raw_log2_step = max(0, math.ceil(math.log2(raw_span / (table_size - 1)))) if raw_span > 0 else 0
+        raw_step = 2**raw_log2_step
+        f_min = raw_func(raw_x_min)
+        f_max = raw_func(raw_x_max)
+        raw_range = list(range(raw_x_min, raw_x_max + 1, raw_step))
+
+        recompute_range = False
+        for x in raw_range[1:]:
+            if raw_func(x) == f_min:
+                raw_x_min = x
+                recompute_range = True
+            else:
+                break
+        for x in reversed(raw_range[:-1]):
+            # Mirror the upward scan: stop at the first sample that is not saturated at f_max.
+            if x < raw_x_min or raw_func(x) != f_max:
+                break
+            raw_x_max = x
+            recompute_range = True
+
+    if raw_x_min != raw_original_x_min or raw_x_max != raw_original_x_max:
+        warnings.warn(
+            f'Lookup table {name} range has been reduced to account for saturation at the table edges. '
+            f'The original significand range was {raw_original_x_min}..{raw_original_x_max}, '
+            f'and the adjusted range is {raw_x_min}..{raw_x_max}.',
+            stacklevel=1,
+        )
+    if len(raw_range) < table_size:
+        warnings.warn(f'Lookup table {name} size has been reduced from {table_size} to {len(raw_range)}.', stacklevel=1)
+
+    assert 0 < len(raw_range) <= table_size
+    assert raw_range[0] == raw_x_min >= raw_original_x_min
+    assert raw_range[-1] <= raw_x_max <= raw_original_x_max
+
+    return XLSLookupTable(
+        name=name,
+        input_precision=XLSFixedPointType.from_precision(input_precision),
+        output_precision=XLSFixedPointType.from_precision(output_precision),
+        x_min=XLSFixedPoint(type=input_precision, significand=raw_x_min),
+        log2_step=raw_log2_step - input_precision.fractional,
+        raw_table=[raw_func(x) for x in raw_range],
+    )
+
+
+def build_softmax_tables(node: Layer) -> list[XLSLookupTable]:
+    table_size = int(node.get_attr('table_size'))
+    exp_table_size = int(node.get_attr('exp_table_size', table_size))
+    inv_table_size = int(node.get_attr('inv_table_size', table_size))
+    implementation = node.get_attr('implementation', 'stable')
+    input_precision = node.get_input_variable().type.precision
+    exp_in = copy(input_precision)
+    exp_out = node.get_attr('exp_table_t').precision
+    match implementation:
+        case 'stable':
+            exp_in.width += 1
+            exp_in.integer += 1
+            exp_name = 'EXP_NEG_TABLE'
+
+            def exp_func(x):
+                return math.exp(-x)
+
+            # Arguments of exp_func are (x_max - x_i) > 0
+            exp_table_range = LookupTableRange.NON_NEGATIVE
+        case 'latency':
+            exp_name = 'EXP_TABLE'
+            exp_func = math.exp
+            # Arguments of exp_func are x_i, which can be both positive and negative
+            exp_table_range = LookupTableRange.FULL
+        case _:
+            raise ValueError(f'Unknown softmax implementation={implementation}')
+
+    inv_in = exp_out
+    inv_out = node.get_attr('inv_table_t').precision
+    inv_name = 'INV_TABLE'
+
+    def inv_func(x):
+        if x == 0:
+            return inv_out.max
+        return 1.0 / x
+
+    exp_table = build_table(
+        name=exp_name,
+        func=exp_func,
+        table_size=exp_table_size,
+        input_precision=exp_in,
+        output_precision=exp_out,
+        table_range=exp_table_range,
+    )
+    inv_table = build_table(
+        name=inv_name,
+        func=inv_func,
+        table_size=inv_table_size,
+        input_precision=inv_in,
+        output_precision=inv_out,
+        # We're inverting sum of exponents, which is always non-negative.
+        table_range=LookupTableRange.NON_NEGATIVE,
+    )
+    return [exp_table, inv_table]
+
+
+def build_activation_table(node: Layer) -> XLSLookupTable:
+    activation = node.get_attr('activation').lower()
+    table_name = f'{activation.upper()}_TABLE'
+    match activation:
+        case 'elu':
+            table_range = LookupTableRange.NEGATIVE
+            alpha = node.get_attr('activ_param')
+
+            def func(x):
+                assert x < 0, f'Building ELU table only for x < 0, got {x}'
+                return alpha * (math.exp(x) - 1)
+        case 'selu':
+            table_range = LookupTableRange.NEGATIVE
+            alpha = 1.6732632423543772848170429916717
+            scale = 1.0507009873554804934193349852946
+
+            def func(x):
+                assert x < 0, f'Building SELU table only for x < 0, got {x}'
+                return scale * alpha * (math.exp(x) - 1)
+        case 'softplus':
+            table_range = LookupTableRange.FULL
+
+            def func(x):
+                return math.log(1 + math.exp(x))
+        case 'softsign':
+            table_range = LookupTableRange.NON_NEGATIVE
+
+            def func(x):
+                return x / (1 + abs(x))
+        case 'tanh':
+            table_range = LookupTableRange.NON_NEGATIVE
+
+            def func(x):
+                return math.tanh(x)
+        case 'sigmoid':
+            table_range = LookupTableRange.FULL
+
+            def func(x):
+                return 1 / (1 + math.exp(-x))
+        case _:
+            raise ValueError(f'Unknown activation={activation}')
+
+    match table_range:  # the table name records which half of the input range is tabulated
+        case LookupTableRange.FULL:
+            pass
+        case LookupTableRange.NON_NEGATIVE:
+            table_name += '_NON_NEGATIVE'
+        case LookupTableRange.NEGATIVE:
+            table_name += '_NEGATIVE'
+
+    return build_table(
+        name=table_name,
+        func=func,
+        table_size=int(node.get_attr('table_size')),
+        input_precision=node.get_input_variable().type.precision,
+        output_precision=node.get_output_variable().type.precision,
+        table_range=table_range,
+    )
+
+
+class BuildTables(OptimizerPass):
+    """Builds the lookup tables used by Softmax (exp and inverse tables) and by table-based
+    activations, and stores them in the node's 'lookup_tables' attribute.
+    """
+
+    def match(self, node: Layer) -> bool:
+        match node.class_name:
+            case 'Softmax':
+                return node.get_attr('implementation', 'stable') != 'argmax'
+            case 'Activation':
+                return node.get_attr('activation').lower() in ['selu', 'softplus', 'softsign', 'tanh', 'sigmoid']
+            case 'ParametrizedActivation':  # NOTE(review): 'prelu' has no branch in build_activation_table; confirm
+                return node.get_attr('activation').lower() in ['elu', 'prelu']
+            case _:
+                return False
+
+    def transform(self, model: ModelGraph, node: Layer) -> Literal[False]:
+        lookup_tables = node.get_attr('lookup_tables', [])
+        match node.class_name:
+            case 'Softmax':
+                lookup_tables += build_softmax_tables(node)
+            case 'Activation':
+                lookup_tables.append(build_activation_table(node))
+            case 'ParametrizedActivation':
+                lookup_tables.append(build_activation_table(node))
+            case _:
+                raise ValueError(f'Unknown layer type: {node.class_name}')
+
+        node.set_attr('lookup_tables', lookup_tables)
+        return False
diff --git a/hls4ml/backends/xls/xls_backend.py b/hls4ml/backends/xls/xls_backend.py
new file mode 100644
index 0000000000..ba7fe42597
--- /dev/null
+++ b/hls4ml/backends/xls/xls_backend.py
@@ -0,0 +1,429 @@
+# Typing imports
+from __future__ import annotations  # makes all annotations into strings
+
+import functools
+import importlib
+import math
+from collections.abc import Callable, Iterable
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from numpy.typing import ArrayLike, NDArray
+
+from hls4ml.backends.xls.xls_types import float_to_significand
+from hls4ml.model.types import FixedPrecisionType
+
+if TYPE_CHECKING:
+    from hls4ml.model.graph import ModelGraph
+
+import subprocess
+from warnings import warn
+
+import numpy as np
+
+from hls4ml.backends import FPGABackend
+from hls4ml.model.flow import register_flow
+from hls4ml.model.optimizer import get_backend_passes
+from hls4ml.report import parse_xls_report
+
+
+@functools.lru_cache(maxsize=1)
+def import_xls():
+    try:
+        return importlib.import_module('xls')
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(
+            "XLS backend requires optional dependency 'xls'. "
+            "Please install hls4ml with XLS extras (or install package 'xls')."
+        ) from e
+
+
+class XLSBackend(FPGABackend):
+    def __init__(self) -> None:
+        super().__init__('XLS')
+        self._writer_flow = ''
+        self._default_flow = ''
+
+        self._register_layer_attributes()
+        self._register_flows()
+
+    def _register_layer_attributes(self) -> None:
+        pass
+
+    def _register_flows(self) -> None:
+        initializers: list = self._get_layer_initializers()
+        init_flow: str = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name)
+
+        quantization_passes = [
+            # 'xls:merge_batch_norm_quantized_tanh',
+            # 'xls:quantize_dense_output',
+            'fuse_consecutive_batch_normalization',
+            'xls:xnor_pooling',
+        ]
+        quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name)
+
+        optimization_passes = [
+            'xls:remove_final_reshape',
+            'xls:inplace_parallel_reshape',
+            'xls:skip_softmax',
+            'infer_precision_types',
+        ]
+        optimization_flow: str = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name)
+
+        xls_attributes = [
+            'xls:build_tables',
+            'xls:build_attr',
+        ]
+        xls_attributes_flow: str = register_flow('xls', xls_attributes, requires=[optimization_flow], backend=self.name)
+
+        # TODO: stamp is currently unused, shall we add it to myproject.x, myproject.ir, myproject.opt.ir, ...?
+        # In other backends, this is used to generate myproject-$STAMP.so.
+        # In XLS, .opt.ir file plays the same role as .so
+        # It is unclear whether we should copy or rename myproject.opt.ir to myproject-$STAMP.opt.ir.
+        writer_passes = ['make_stamp', 'xls:write_hls']
+        self._writer_flow = register_flow('write', writer_passes, requires=['xls:ip'], backend=self.name)
+
+        # Passes that are irrelevant for XLS
+        ignored_passes = [
+            f'xls:{opt_pass}'
+            for opt_pass in [
+                # io_stream only:
+                'reshape_stream',
+                'inplace_stream_flatten',
+                'repack_function_template',
+                'clone_output',
+                'clone_function_template',
+                # HGQ passes, not implemented:
+                'process_fixed_point_quantizer_layer',
+                'fixedpointquantizer_function_template',
+                'unarylut_function_template',
+                # Embedding
+                'embedding_config_template',
+                'embedding_function_template',
+                # we fix table sizes in xls:build_tables using a different method
+                'fix_softmax_table_size',
+                # BRAM not supported
+                'register_bram_weights',
+            ]
+        ]
+
+        all_passes: list = get_backend_passes(self.name)
+
+        extras = [
+            # Ideally, this should be empty
+            opt_pass
+            for opt_pass in all_passes
+            if opt_pass
+            not in initializers + quantization_passes + optimization_passes + xls_attributes + writer_passes + ignored_passes
+        ]
+
+        if len(extras) > 0:
+            for opt in extras:
+                warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.')
+
+        ip_flow_requirements = [
+            'optimize',
+            init_flow,
+            quantization_flow,
+            optimization_flow,
+            xls_attributes_flow,
+        ]
+
+        self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name)
+
+    def get_default_flow(self) -> str:
+        return self._default_flow
+
+    def get_writer_flow(self) -> str:
+        return self._writer_flow
+
+    @staticmethod
+    def _to_xls_clock_period_ps(clock_period) -> int:
+        """Convert nanoseconds to whole picoseconds, rounding to nearest so float error (4.02 * 1000) is absorbed."""
+        return round(float(clock_period) * 1000)
+
+    @staticmethod
+    def _to_xls_clock_margin_percent(clock_uncertainty: str) -> int:
+        """Convert ClockUncertainty string to integer XLS option clock_margin_percent"""
+        assert isinstance(clock_uncertainty, str) and clock_uncertainty.endswith('%'), (
+            f'Clock uncertainty must be in percentage format, got {clock_uncertainty}'
+        )
+        return math.ceil(float(clock_uncertainty.strip('%')))
+
+    @staticmethod
+    def _percent_to_float(percent: str) -> float:
+        """Convert a string representing a percentage to a float."""
+        assert isinstance(percent, str) and percent.endswith('%'), (
+            f'Clock uncertainty must be in percentage format, got {percent}'
+        )
+        return float(percent.strip('%')) / 100
+
+    def create_initial_config(
+        self,
+        part='xcu250-figd2104-2L-e',
+        clock_period=5,
+        clock_uncertainty='12.5%',
+        io_type='io_parallel',
+        write_tar=False,
+        xls_codegen_flags=None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Create an initial configuration of the XLS backend.
+
+        Args:
+            part (str, optional): The FPGA part. Defaults to 'xcu250-figd2104-2L-e'; None selects 'xcvu13p-flga2577-2-e'.
+            clock_period (int, optional): The clock period. Defaults to 5.
+            clock_uncertainty (str, optional): The clock uncertainty. Defaults to 12.5%.
+            io_type (str, optional): Type of implementation used. Only 'io_parallel' is currently supported.
+            write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False.
+            xls_codegen_flags (dict, optional): Flags to pass to the XLS codegen. Defaults to None.
+
+        Returns:
+            dict: initial configuration.
+        """
+        config = {}
+
+        config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e'
+        config['ClockPeriod'] = clock_period if clock_period is not None else 5
+        config['ClockUncertainty'] = clock_uncertainty if clock_uncertainty is not None else '12.5%'
+        config['IOType'] = io_type if io_type is not None else 'io_parallel'
+        config['HLSConfig'] = {}
+        config['WriterConfig'] = {
+            'WriteTar': write_tar,
+        }
+
+        # Set default flags to mimic codegen_main executable behavior
+        config['XLSCodegenFlags'] = (
+            xls_codegen_flags
+            if xls_codegen_flags is not None
+            else {
+                'delay_model': 'asap7',
+                'generator': 'pipeline',
+                'use_system_verilog': True,
+                'flop_inputs': True,
+                'flop_outputs': True,
+                'max_inline_depth': 5,
+                'flop_single_value_channels': True,
+                # convert nanoseconds to picoseconds
+                'clock_period_ps': self._to_xls_clock_period_ps(config['ClockPeriod']),
+                # NB: XLS needs integer percents
+                'clock_margin_percent': self._to_xls_clock_margin_percent(config['ClockUncertainty']),
+            }
+        )
+
+        for arg in kwargs:
+            warn(f'WARNING: Unknown argument {arg} for XLS backend will be ignored.')
+
+        return config
+
+    @staticmethod
+    def _ir_top_function_name(model: ModelGraph):
+        xls = import_xls()
+        name = model.config.get_project_name()
+        return xls.mangle_dslx_name(module_name=name, function_name=name)
+
+    def compile(self, model: ModelGraph) -> None:
+        xls = import_xls()
+        io_type = model.config.get_config_value('IOType')
+        if io_type != 'io_parallel':
+            raise NotImplementedError(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
+        kernel_name = model.config.get_project_name()
+        firmware_dir = Path(f'{model.config.get_output_dir()}') / 'firmware'
+        path_no_ext = firmware_dir / kernel_name
+
+        ir_text = xls.c_api.convert_dslx_path_to_ir(path=f'{path_no_ext}.x', additional_search_paths=[str(firmware_dir)])
+        with open(f'{path_no_ext}.ir', 'w') as ir_file:
+            ir_file.write(ir_text)
+
+        opt_ir_text = xls.optimize_ir(ir=ir_text, top=XLSBackend._ir_top_function_name(model))
+        with open(f'{path_no_ext}.opt.ir', 'w') as opt_ir_file:
+            opt_ir_file.write(opt_ir_text)
+
+        # This object can be heavy, so we don't want to cache it unless we call predict().
+        if hasattr(model, '_xls_top_function'):
+            del model._xls_top_function
+
+    @staticmethod
+    def _float_to_xls_ir(x: np.floating[Any] | NDArray[np.floating[Any]], precision: FixedPrecisionType):
+        xls = import_xls()
+        if np.isscalar(x):
+            significand = float_to_significand(x, precision)
+            bits = xls.Value.make_sbits(bit_count=precision.width, val=significand)
+            return bits
+        else:
+            return xls.Value.make_array([XLSBackend._float_to_xls_ir(item, precision) for item in x])
+
+    @staticmethod
+    def _bits_to_int(bits, signed: bool = True) -> int:
+        # bits: xls.Bits
+        n = bits.get_bit_count()
+        if n <= 64:
+            return bits.to_int64()
+        value = int.from_bytes(bits.to_bytes(), byteorder='little', signed=False)
+        value &= (1 << n) - 1
+        if signed and (bits.get_bit(n - 1) == 1):
+            value -= 1 << n
+        return value
+
+    @staticmethod
+    def _xls_ir_to_float(
+        x, precision: FixedPrecisionType | Iterable[FixedPrecisionType], dtype: np.typing.DTypeLike
+    ) -> ArrayLike | tuple[ArrayLike, ...]:
+        xls = import_xls()
+        # x: xls.Value
+        match x.get_kind():
+            case xls.c_api.ValueKind.BITS:
+                assert isinstance(precision, FixedPrecisionType), (
+                    f'Precision must be FixedPrecisionType, got {type(precision)}'
+                )
+                return XLSBackend._bits_to_int(x.get_bits()) / (2**precision.fractional)
+            case xls.c_api.ValueKind.ARRAY:
+                return np.asarray(
+                    [XLSBackend._xls_ir_to_float(x.get_element(i), precision, dtype) for i in range(x.get_element_count())],
+                    dtype=dtype,
+                )
+            case xls.c_api.ValueKind.TUPLE:
+                precision = tuple(precision)
+                assert len(precision) == x.get_element_count(), (
+                    f'Precision mismatch for tuple: {len(precision)} != {x.get_element_count()}'
+                )
+                return tuple(
+                    XLSBackend._xls_ir_to_float(x.get_element(i), precision[i], dtype) for i in range(x.get_element_count())
+                )
+            case _:
+                raise ValueError(f'Unexpected output type: {x.get_kind()}')
+
+    @staticmethod
+    def get_top_function(model: ModelGraph, x: np.floating | NDArray[np.floating[Any]]) -> tuple[Callable, np.dtype]:
+        # Cache JIT function to avoid reparsing IR file.
+        top_function = getattr(model, '_xls_top_function', None)
+        if top_function is None:
+            top_function = XLSBackend._make_top_function(model)
+            model._xls_top_function = top_function
+
+        # TODO: this duplicates ModelGraph._get_top_function().
+        # NB: ctype is not used in XLS, but it is required by ModelGraph._predict
+        x0 = x[0] if isinstance(x, (list, tuple)) else x
+        if np.asarray(x0).dtype in [np.single, np.float32]:
+            ctype = np.float32
+        elif np.asarray(x0).dtype in [np.double, np.float64]:
+            ctype = np.float64
+        else:
+            raise TypeError(
+                'Invalid type ({}) of numpy array. Supported types are: single, float32, double, float64, float_.'.format(
+                    np.asarray(x0).dtype
+                )
+            )
+
+        return top_function, ctype
+
+    @staticmethod
+    def _make_top_function(model: ModelGraph) -> Callable:
+        xls = import_xls()
+        project_dir = model.config.get_output_dir()
+        project_name = model.config.get_project_name()
+        ir_path = Path(project_dir) / 'firmware' / f'{project_name}.opt.ir'
+        if not ir_path.exists():
+            raise FileNotFoundError(f'Optimized IR file not found: {ir_path}. Please compile your model first.')
+        ir_text = ir_path.read_text()
+        pkg = xls.Package.parse_ir(ir_text)
+        fn = pkg.get_function(XLSBackend._ir_top_function_name(model))
+        jit = fn.to_jit()
+
+        input_vars = model.get_input_variables()
+        output_vars = model.get_output_variables()
+
+        def top_function(*args):
+            assert len(args) == len(input_vars) + len(output_vars), (
+                f'Expected {len(input_vars)} inputs and {len(output_vars)} outputs, got {len(args)}'
+            )
+            inputs = args[: len(input_vars)]
+            outputs = args[len(input_vars) :]
+            ir_input = [
+                XLSBackend._float_to_xls_ir(np.asarray(x).reshape(var.shape), var.type.precision)
+                for x, var in zip(inputs, input_vars)
+            ]
+            ir_output = jit.run(ir_input)
+
+            out_precision = [output_var.type.precision for output_var in output_vars]
+            if len(out_precision) == 1:
+                out_precision = out_precision[0]
+            dtype = np.asarray(inputs[0]).dtype
+            output = XLSBackend._xls_ir_to_float(ir_output, out_precision, dtype)
+            # This is the case when len(output_vars) == 1
+            if not isinstance(output, tuple):
+                output = (output,)
+            for i in range(len(output_vars)):
+                outputs[i][:] = np.reshape(output[i], -1)
+
+        return top_function
+
+    def build(
+        self,
+        model: ModelGraph,
+        reset: bool | None = None,
+        pr: bool = False,
+    ) -> dict:
+        """Builds the RTL (SystemVerilog) code and uses Vivado to return the resource utilization.
+
+        Args:
+            model (ModelGraph): the hls4ml model.
+            reset (bool): the reset synthesis option
+            pr (bool): place and route option
+        """
+        xls = import_xls()
+        project_name = model.config.get_project_name()
+        output_dir = Path(model.config.get_output_dir())
+
+        clock_period_ns = model.config.get_config_value('ClockPeriod')
+        clock_period_ps = self._to_xls_clock_period_ps(clock_period_ns)
+
+        clock_uncertainty_str = model.config.get_config_value('ClockUncertainty')
+        clock_uncertainty_float = self._percent_to_float(clock_uncertainty_str)
+        clock_margin_percent: int = self._to_xls_clock_margin_percent(clock_uncertainty_str)
+
+        def build_codegen_flags() -> dict[str, Any]:
+            flags = dict(model.config.get_config_value('XLSCodegenFlags'))
+            flags['clock_period_ps'] = clock_period_ps
+            flags['clock_margin_percent'] = clock_margin_percent
+            if reset is not None:
+                flags['reset'] = 'reset' if reset else None
+                flags['reset_data_path'] = reset
+            return flags
+
+        def build_vivado_flags() -> list[str]:
+            flags = [
+                '-mode',
+                'batch',
+                '-nolog',
+                '-nojournal',
+                '-source',
+                './build_prj.tcl',
+                '-tclargs',
+                project_name,
+                model.config.get_config_value('Part'),
+                clock_period_ps,
+                clock_uncertainty_float,
+            ]
+            if pr:
+                flags += ['--pr']
+            return [str(flag) for flag in flags]
+
+        # Generate RTL
+        firmware_dir = output_dir / 'firmware'
+
+        opt_ir_path = firmware_dir / f'{project_name}.opt.ir'
+        opt_ir_text = opt_ir_path.read_text()
+        codegen_flags = build_codegen_flags()
+
+        pkg = xls.parse_ir_package(ir=opt_ir_text, filename=str(opt_ir_path))
+        verilog_text = pkg.schedule_and_codegen(**codegen_flags).get_verilog_text()
+        sv_path = firmware_dir / f'{project_name}.sv'
+        sv_path.write_text(verilog_text)
+
+        # Run Vivado for resource report
+        vivado_command: list[str] = ['vivado'] + build_vivado_flags()
+        subprocess.run(vivado_command, cwd=output_dir, check=True)
+
+        return parse_xls_report(output_dir)
diff --git a/hls4ml/backends/xls/xls_types.py b/hls4ml/backends/xls/xls_types.py
new file mode 100644
index 0000000000..59c5b98117
--- /dev/null
+++ b/hls4ml/backends/xls/xls_types.py
@@ -0,0 +1,501 @@
+from __future__ import annotations
+
+import builtins
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+
+from hls4ml.model.types import (
+    ExponentPrecisionType,
+    FixedPrecisionType,
+    IntegerPrecisionType,
+    PrecisionType,
+    RoundingMode,
+    SaturationMode,
+    TensorVariable,
+    XnorPrecisionType,
+)
+
+
+def to_signed_fixed_precision(precision: PrecisionType, allow_unsigned: bool = False) -> FixedPrecisionType:
+    """Convert precision to a signed FixedPrecisionType used by XLS."""
+    rounding_mode = RoundingMode.TRN
+    saturation_mode = SaturationMode.WRAP
+    if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType):
+        integer = precision.integer
+        rounding_mode = precision.rounding_mode
+        saturation_mode = precision.saturation_mode
+    elif isinstance(precision, XnorPrecisionType):
+        integer = precision.integer
+    elif isinstance(precision, ExponentPrecisionType):
+        integer = 1
+    else:
+        raise ValueError(f'Unknown precision type: {type(precision)}')
+    fixed_precision = FixedPrecisionType(
+        width=precision.width,
+        integer=integer,
+        signed=precision.signed,
+        rounding_mode=rounding_mode,
+        saturation_mode=saturation_mode,
+    )
+    # Only signed types are supported in XLS
+    if not fixed_precision.signed:
+        if not allow_unsigned:
+            raise ValueError(f'Expected signed precision, got: {precision}')
+        fixed_precision.signed = True
+        fixed_precision.width += 1
+        fixed_precision.integer += 1
+
+    return fixed_precision
+
+
+def float_to_significand(
+    x: np.floating[Any] | NDArray[np.floating[Any]], precision: PrecisionType, allow_unsigned: bool = False
+) -> int:
+    """Convert floating point value(s) to fixed point significand(s).
+
+    Returns: int64 value(s) round(x * 2^fractional), wrapped into the signed range of the signed-converted width.
+    """
+    if not np.isscalar(x):
+        if not isinstance(x, np.ndarray) or x.dtype.kind != 'f':
+            x = np.asarray(x, dtype=np.float64)
+
+    if isinstance(precision, XnorPrecisionType):
+        # hls4ml stores XNOR weights as bits {0,1};
+        # We convert it to XLS FixedPoint {-1, 1}
+        x = np.where(x == 0, -1, x)
+
+    precision = to_signed_fixed_precision(precision, allow_unsigned)
+
+    width = precision.width
+    frac = precision.fractional
+    scale = 2**frac
+    # TODO support different saturation and rounding modes
+    significand = np.round(x * scale).astype(np.int64)
+    n = 2**width
+    shift = 2 ** (width - 1)
+    return (significand + shift) % n - shift
+
+
+# XLS types
+
+
+class XLSIntegerType:
+    def __init__(self, width, signed: bool):
+        self.width = width
+        self.signed = signed
+
+    def __str__(self):
+        prefix = 's' if self.signed else 'u'
+        if isinstance(self.width, int) and 1 <= self.width <= 64:
+            # u32
+            return f'{prefix}{self.width}'
+        # uN[NUM_BITS]
+        return f'{prefix}N[{self.width}]'
+
+    @staticmethod
+    def u32():
+        return XLSIntegerType(width=32, signed=False)
+
+    @staticmethod
+    def s32():
+        return XLSIntegerType(width=32, signed=True)
+
+
+class XLSFixedPointType:
+    def __init__(self, num_bits, binary_exponent):
+        self.num_bits = num_bits
+        self.binary_exponent = binary_exponent
+
+    @classmethod
+    def from_precision(cls, precision: PrecisionType, allow_unsigned: bool = False):
+        precision = to_signed_fixed_precision(precision, allow_unsigned)
+        assert precision.signed, 'XLS FixedPoint is always a signed type'
+        num_bits = precision.width
+        binary_exponent = -precision.fractional
+        return cls(num_bits=num_bits, binary_exponent=binary_exponent)
+
+    @property
+    def significand_type(self):
+        return XLSIntegerType(width=self.num_bits, signed=True)
+
+    @property
+    def precision(self):
+        return FixedPrecisionType(width=self.num_bits, integer=self.num_bits + self.binary_exponent, signed=True)
+
+    def __str__(self):
+        return f'FixedPoint<{self.num_bits}, {self.binary_exponent}>'
+
+
+def as_xls_fixed_point_type(type: XLSFixedPointType | PrecisionType, allow_unsigned: bool = False) -> XLSFixedPointType:
+    if isinstance(type, XLSFixedPointType):
+        return type
+    return XLSFixedPointType.from_precision(type, allow_unsigned)
+
+
+# 1d array type. TODO make it explicitly multidimensional?
+class XLSArrayType:
+    def __init__(
+        self, element_type, shape: int | str | tuple[int | str, ...] | list[int | str], allow_unsigned: bool = False
+    ):
+        if isinstance(element_type, PrecisionType):
+            element_type = XLSFixedPointType.from_precision(element_type, allow_unsigned)
+
+        if isinstance(shape, str) or isinstance(shape, int):
+            shape = (shape,)
+        else:
+            shape = tuple(shape)
+        assert len(shape) > 0, 'Zero-dimensional arrays are not supported'
+        if len(shape) == 1:
+            self.element_type = element_type
+        else:
+            self.element_type = XLSArrayType(element_type, shape[1:], allow_unsigned)
+        self.size = shape[0]
+
+    def as_multidimensional(self) -> tuple[Any, tuple[int | str, ...]]:
+        """Returns: (inner element type, shape)
+
+        >>> element_type = XLSFixedPointType(num_bits=16, binary_exponent=-10)
+        >>> array_2d = XLSArrayType(element_type=element_type, shape=(2, 3))
+        >>> elt, shape = array_2d.as_multidimensional()
+        >>> str(elt)
+        'FixedPoint<16, -10>'
+        >>> shape
+        (2, 3)
+
+        """
+        if isinstance(self.element_type, XLSArrayType):
+            elt, shape = self.element_type.as_multidimensional()
+            shape = (self.size,) + shape
+        else:
+            elt = self.element_type
+            shape = (self.size,)
+        return elt, shape
+
+    @property
+    def shape(self):
+        """Returns: shape of the multidimensional array type"""
+        _, shape = self.as_multidimensional()
+        return shape
+
+    @property
+    def rank(self):
+        """Returns: rank of the multidimensional array type"""
+        return len(self.shape)
+
+    @property
+    def innermost_element_type(self):
+        """Returns: inner element type, for example:
+
+        >>> element_type = XLSFixedPointType(num_bits=16, binary_exponent=-10)
+        >>> array_2d = XLSArrayType(element_type=element_type, shape=(2, 3))
+        >>> str(array_2d.innermost_element_type)
+        'FixedPoint<16, -10>'
+        >>> str(array_2d.element_type)
+        'FixedPoint<16, -10>[3]'
+        """
+        elt, shape = self.as_multidimensional()
+        return elt
+
+    def __str__(self):
+        return f'{self.element_type}[{self.size}]'
+
+
+# XLS values
+
+
+class XLSInteger:
+    def __init__(self, type: XLSIntegerType | str, value: int | str):
+        self.type = type
+        self.value = value
+
+    @classmethod
+    def u32(cls, value: int | str):
+        if isinstance(value, int):
+            assert value >= 0, f'value={value} is not an unsigned integer'
+        return cls(XLSIntegerType.u32(), value)
+
+    @classmethod
+    def s32(cls, value: int | str):
+        return cls(XLSIntegerType.s32(), value)
+
+    def __str__(self):
+        return f'{self.type}:{self.value}'
+
+
+class XLSFixedPoint:
+    def __init__(
+        self,
+        type: XLSFixedPointType | PrecisionType,
+        significand: XLSInteger | int | np.integer[Any] | str,
+        allow_unsigned: bool = False,
+    ):
+        type = as_xls_fixed_point_type(type, allow_unsigned)
+        if np.issubdtype(builtins.type(significand), np.integer):
+            significand = XLSInteger(type=type.significand_type, value=significand)
+        elif isinstance(significand, XLSInteger):
+            assert significand.type.width == type.num_bits
+            assert significand.type.signed, 'FixedPoint is always a signed type'
+
+        self.type = type
+        self.significand = significand
+
+    @classmethod
+    def from_float(cls, x: np.floating[Any], precision: PrecisionType, allow_unsigned: bool = False):
+        xls_type = XLSFixedPointType.from_precision(precision, allow_unsigned)
+        return cls(type=xls_type, significand=float_to_significand(x, precision, allow_unsigned))
+
+    @classmethod
+    def min_value(cls, type: XLSFixedPointType):
+        return cls(type=type, significand=-(2 ** (type.num_bits - 1)))
+
+    @classmethod
+    def max_value(cls, type: XLSFixedPointType):
+        return cls(type=type, significand=2 ** (type.num_bits - 1) - 1)
+
+    @classmethod
+    def zero(cls, type: XLSFixedPointType):
+        return cls(type=type, significand=0)
+
+    def __str__(self):
+        # return f'fp_util::make_fixed_point<{self.type.binary_exponent}>:<{self.significand}>'
+        return f'{self.type}{{ significand: {self.significand} }}'
+
+
+# 1d array. TODO make it explicitly multidimensional?
+class XLSArray:
+    def __init__(self, array_type: XLSArrayType, array):
+        self.array_type = array_type
+
+        if not isinstance(array, str):
+            if isinstance(array_type.element_type, XLSArrayType):
+                array = [XLSArray(array_type=array_type.element_type, array=inner_array) for inner_array in array]
+            if not isinstance(array_type.size, str):
+                assert len(array) == array_type.size, f'Array size mismatch: expected {array_type.size}, got {len(array)}'
+        self.array = array
+
+    def __str__(self):
+        # TODO make it less verbose, e.g. replace:
+        #   FixedPoint<16, -6>[2]:[FixedPoint<16, -6>{ significand: s16:-1 }, FixedPoint<16, -6>{ significand: s16:235 }]
+        # with
+        #   fp_util::make_fixed_points_1d<-6>(sN[16][2]:[-1, 235])
+        # NB: this works only when self.array contains explicit values, not string(s)!
+        if isinstance(self.array, str):
+            return f'{self.array_type}:[{self.array}]'
+        elements = ', '.join(map(str, self.array))
+        return f'{self.array_type}:[{elements}]'
+
+
+class XLSQualifiedName:
+    def __init__(self, name: str, module_name: str | None = None):
+        self.name = name
+        self.module_name = module_name
+
+    def __str__(self):
+        if self.module_name:
+            return f'{self.module_name}::{self.name}'
+        return self.name
+
+
+class XLSFunctionCall:
+    def __init__(self, name, params=None, args=None):
+        self.name = name
+        self.params = params or []
+        self.args = args or []
+        if isinstance(self.params, str):
+            self.params = [self.params]
+        if isinstance(self.args, str):
+            self.args = [self.args]
+
+    @property
+    def namespace(self):
+        parts = self.name.split('::')
+        match len(parts):
+            case 1:
+                return None
+            case 2:
+                return parts[0]
+            case _:
+                raise ValueError(f'Cannot extract namespace from function name: {self.name}')
+
+    def __str__(self):
+        params = ', '.join(map(str, self.params))
+        if params:
+            params = f'<{params}>'
+        args = ', '.join(map(str, self.args))
+        return f'{self.name}{params}({args})'
+
+
+class XLSConst:
+    def __init__(self, name, value, type=None):
+        self.name = name
+        self.value = value
+        self.type = type
+
+    def __str__(self):
+        type = f': {self.type}' if self.type else ''
+        return f'pub const {self.name}{type} = {self.value};'
+
+
+class XLSTypeAlias:
+    def __init__(self, name, type):
+        self.name = name
+        self.type = type
+
+    def __str__(self):
+        return f'pub type {self.name} = {self.type};'
+
+
+class XLSImport:
+    def __init__(self, name, alias=None):
+        self.name = name
+        self.alias = alias
+
+    def __str__(self):
+        as_alias = f' as {self.alias}' if self.alias else ''
+        return f'import {self.name}{as_alias};'
+
+
+class XLSVariableDefinition:
+    def __init__(self, name, value, type=None):
+        self.name = name
+        self.type = type
+        self.value = value
+
+    def __str__(self):
+        type = f': {self.type}' if self.type else ''
+        return f'let {self.name}{type} = {self.value};'
+
+
+class XLSFunctionDefinition:
+    def __init__(self, name, params, args, output_type, body):
+        self.name = name
+        self.params = params or []
+        self.args = args or []
+        self.output_type = output_type or '()'
+        self.body = body or ''
+
+    def __str__(self):
+        if isinstance(self.params, str):
+            params = self.params
+        else:
+            params = ', '.join(map(str, self.params))
+        if params:
+            params = f'<{params}>'
+        if isinstance(self.args, str):
+            args = self.args
+        else:
+            args = ', '.join(map(str, self.args))
+        return f"""pub fn {self.name}{params}({args})
+    -> {self.output_type} {{
+    {self.body}
+}}"""
+
+
+class XLSTensorVariable:
+    """Helper class to generate XLS constants for tensor variables."""
+
+    def __init__(self, name: str, num_bits, binary_exponent, rounding_mode, saturation_mode, shape) -> None:
+        # A single dimension (an int or a string) stands for a one-dimensional shape.
+        if isinstance(shape, (int, str)):
+            shape = (shape,)
+        # Drop every character that is neither alphanumeric nor an underscore.
+        name = ''.join(ch for ch in name if ch.isalnum() or ch == '_')
+        self.name = name
+        prefix = name.upper()
+        self.num_bits = XLSConst(f'{prefix}_NUM_BITS', num_bits, type='u32')
+        self.binary_exponent = XLSConst(f'{prefix}_BINARY_EXPONENT', binary_exponent, type='s32')
+        self.rounding_mode = XLSConst(f'{prefix}_ROUNDING_MODE', f'RoundingMode::{rounding_mode}', type='RoundingMode')
+        self.overflow_mode = XLSConst(f'{prefix}_OVERFLOW_MODE', f'OverflowMode::{saturation_mode}', type='OverflowMode')
+        self.shape = tuple(XLSConst(f'{prefix}_DIM_{i}', dim, type='u32') for i, dim in enumerate(shape))
+        alias_stem = prefix[0].upper() + prefix[1:].lower()
+        self.type_alias = XLSTypeAlias(name=f'{alias_stem}_Type', type=self.to_array_type())
+        self.type_alias_bits = XLSTypeAlias(name=f'{alias_stem}_Type_Bits', type=self.to_array_type_bits())
+
+    @classmethod
+    def from_tensor_variable(cls, var: TensorVariable, name: str | None = None) -> XLSTensorVariable:
+        """Build the helper from a tensor variable; asserts that its precision is signed."""
+        precision = var.type.precision
+        assert precision.signed, (
+            f'{var.__class__.__name__}: XLS supports only signed FixedPrecision, but got: {precision} ({type(precision)})'
+        )
+        element_type = XLSFixedPointType.from_precision(precision)
+        return cls(
+            name=name or var.name,
+            num_bits=element_type.num_bits,
+            binary_exponent=element_type.binary_exponent,
+            rounding_mode=precision.rounding_mode,
+            saturation_mode=precision.saturation_mode,
+            shape=var.shape,
+        )
+
+    def definitions(self) -> list[XLSConst | XLSTypeAlias]:
+        """Return the constants first, then the type aliases that refer to them by name."""
+        consts = [self.num_bits, self.binary_exponent, self.rounding_mode, self.overflow_mode, *self.shape]
+        return consts + [self.type_alias, self.type_alias_bits]
+
+    def to_array_type(self) -> XLSArrayType:
+        """Array type of fixed-point elements, parameterized by the names of this variable's constants."""
+        element_type = XLSFixedPointType(self.num_bits.name, binary_exponent=self.binary_exponent.name)
+        return XLSArrayType(element_type=element_type, shape=tuple(dim.name for dim in self.shape))
+
+    def to_array_type_bits(self) -> XLSArrayType:
+        """Array type of signed integers with the same width constant and shape constants."""
+        element_type = XLSIntegerType(width=self.num_bits.name, signed=True)
+        return XLSArrayType(element_type=element_type, shape=tuple(dim.name for dim in self.shape))
+
+
+class XLSLookupTable:
+    """Generates the XLS constants that describe a lookup table, plus the table constant itself."""
+
+    def __init__(
+        self,
+        name: str,
+        input_precision: XLSFixedPointType | FixedPrecisionType,
+        output_precision: XLSFixedPointType | FixedPrecisionType,
+        x_min,
+        log2_step,
+        raw_table,
+    ) -> None:
+        in_type = as_xls_fixed_point_type(input_precision)
+        out_type = as_xls_fixed_point_type(output_precision)
+        # Scalar constants; all of their names are derived from the table name.
+        self.input_num_bits = XLSConst(f'{name}_INPUT_NUM_BITS', in_type.num_bits, 'u32')
+        self.input_binary_exponent = XLSConst(f'{name}_INPUT_BINARY_EXPONENT', in_type.binary_exponent, 's32')
+        self.output_num_bits = XLSConst(f'{name}_OUTPUT_NUM_BITS', out_type.num_bits, 'u32')
+        self.output_binary_exponent = XLSConst(f'{name}_OUTPUT_BINARY_EXPONENT', out_type.binary_exponent, 's32')
+        self.size = XLSConst(f'{name}_SIZE', len(raw_table), 'u32')
+        self.log2_step = XLSConst(f'{name}_LOG2_STEP', log2_step, 's32')
+        # x_min is typed with the input fixed-point type, referring to the constants above by name.
+        x_min_type = XLSFixedPointType(num_bits=f'{name}_INPUT_NUM_BITS', binary_exponent=f'{name}_INPUT_BINARY_EXPONENT')
+        self.x_min = XLSConst(f'{name}_X_MIN', x_min, x_min_type)
+        # raw_table is emitted as an array of signed integers of the output width, then wrapped into fixed points.
+        significand_type = XLSIntegerType(width=f'{name}_OUTPUT_NUM_BITS', signed=True)
+        int_table = XLSArray(
+            array_type=XLSArrayType(element_type=significand_type, shape=f'{name}_SIZE'), array=raw_table
+        )
+        fixed_point_table = XLSFunctionCall(
+            name='fixed_point_util::make_fixed_points_1d', params=[self.output_binary_exponent.name], args=[int_table]
+        )
+        create_call = XLSFunctionCall(
+            name='lookup_table::create', params=[self.log2_step.name], args=[x_min, fixed_point_table]
+        )
+        self.lookup_table = XLSConst(name=name, value=create_call)
+
+    def definitions(self) -> list[XLSConst]:
+        """Return all constants; the table comes last because it refers to the others by name."""
+        return [
+            self.input_num_bits,
+            self.input_binary_exponent,
+            self.output_num_bits,
+            self.output_binary_exponent,
+            self.size,
+            self.log2_step,
+            self.x_min,
+            self.lookup_table,
+        ]
+
+    def __str__(self):
+        return '\n'.join(str(definition) for definition in self.definitions())
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 4351b78950..d9399b38bf 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -820,7 +820,12 @@ def _compile(self):
             dlclose_func(self._top_function_lib._handle)
         self._top_function_lib = ctypes.cdll.LoadLibrary(lib_name)
 
-    def _get_top_function(self, x):
+    def _get_top_function(self, x, *args, **kwargs):
+        backend = self.config.backend
+
+        if hasattr(backend, 'get_top_function') and callable(backend.get_top_function):
+            return backend.get_top_function(self, x, *args, **kwargs)
+
         if self._top_function_lib is None:
             raise Exception('Model not compiled')
         if len(self.get_input_variables()) == 1:
diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py
index 4d3641a5ac..4692741d61 100644
--- a/hls4ml/report/__init__.py
+++ b/hls4ml/report/__init__.py
@@ -18,3 +18,4 @@
     print_vivado_report,  # noqa: F401
     read_vivado_report,  # noqa: F401
 )
+from hls4ml.report.xls_report import parse_xls_report  # noqa: F401
diff --git a/hls4ml/report/xls_report.py b/hls4ml/report/xls_report.py
new file mode 100644
index 0000000000..b2cd17dec0
--- /dev/null
+++ b/hls4ml/report/xls_report.py
@@ -0,0 +1,47 @@
+import os
+import re
+from pathlib import Path
+
+
+def _get_project_name(path) -> str:
+    # The stem of a firmware/*.sv file names the project; sorting makes the pick deterministic.
+    sv_files = sorted((Path(path) / 'firmware').glob('*.sv'))
+    return sv_files[0].stem
+
+
+def parse_xls_report(hls_dir) -> dict:
+    """Return {'VivadoSynthReport': {...}} read from the post-synthesis utilization report, or {} if unavailable."""
+    if not os.path.exists(hls_dir):
+        print(f'Path {hls_dir} does not exist. Exiting.')
+        return {}
+    try:
+        project_name = _get_project_name(hls_dir)
+    except IndexError:  # firmware/ holds no .sv file, so the project name cannot be derived
+        print(f'No SystemVerilog file found in {Path(hls_dir) / "firmware"}. Exiting.')
+        return {}
+
+    vivado_syn_file = Path(hls_dir) / f'output_{project_name}' / 'reports' / f'{project_name}_post_synth_util.rpt'
+    if not os.path.isfile(vivado_syn_file):
+        print(f'Vivado synthesis report not found at {vivado_syn_file}.')
+        return {}
+    # (accepted row labels, report section, result key); CLB (2019.X) vs. Slice (2020.X)
+    fields = (
+        (('CLB LUTs', 'Slice LUTs'), 1, 'LUT'),
+        (('CLB Registers', 'Slice Registers'), 1, 'FF'),
+        (('Block RAM Tile',), 2, 'BRAM_18K'),
+        (('URAM',), 2, 'URAM'),
+        (('DSPs',), 3, 'DSP48E'),
+    )
+    vivado_synth_rpt, section = {}, 0
+    with open(vivado_syn_file) as f:
+        for line in f:
+            match = re.match(r'^(\d)\.', line)
+            if match:
+                section = int(match.group(1))
+            if '|' not in line:  # phrases such as 'CLB Registers' can also show up in non-tabular text
+                continue
+            for labels, wanted_section, key in fields:
+                if section == wanted_section and any(label in line for label in labels):
+                    vivado_synth_rpt[key] = line.split('|')[2].strip()
+                    break  # first matching entry wins, like the original elif chain
+    return {'VivadoSynthReport': vivado_synth_rpt}
diff --git a/hls4ml/templates/xls/build_prj.tcl b/hls4ml/templates/xls/build_prj.tcl
new file mode 100644
index 0000000000..bb383a1776
--- /dev/null
+++ b/hls4ml/templates/xls/build_prj.tcl
@@ -0,0 +1,56 @@
+# build_prj.tcl
+# Usage:
+#   vivado -mode batch -nolog -nojournal -source build_prj.tcl -tclargs <project_name> <board_part> <clock_period> <clock_uncertainty> [--pr]
+
+if {[llength $argv] < 4} {
+    puts stderr "ERROR: missing arguments\nUsage: vivado -mode batch -nolog -nojournal  -source build_prj.tcl -tclargs <project_name> <board_part> <clock_period> <clock_uncertainty> \[--pr\]"
+    exit 1 ;# the brackets in the message are escaped so that Tcl does not run "--pr" as a command
+}
+
+# Positional arguments; an optional fifth argument "--pr" sets do_pr to 1.
+set project_name      [lindex $argv 0]
+set board             [lindex $argv 1] ;# passed to create_project as -part
+set clock_period      [lindex $argv 2]
+set clock_uncertainty [lindex $argv 3] ;# copied into the *_r factors for the xdc
+set do_pr             0 ;# NOTE(review): do_pr is not read again in this script
+if {[llength $argv] > 4 && [lindex $argv 4] eq "--pr"} {
+    set do_pr 1
+}
+
+set prj_root [file normalize [file dirname [info script]]] ;# directory that contains this script
+set prj_files [glob -nocomplain "${prj_root}/firmware/*.sv"] ;# -nocomplain: empty list instead of an error
+set output_dir "${prj_root}/output_${project_name}"
+set top_module "__${project_name}__${project_name}" ;# presumably the module name emitted by XLS codegen - verify
+
+# Parameters used in xdc: the *_r values are factors that constraints.xdc multiplies by the clock period
+set xdc_path "${prj_root}/constraints.xdc"
+set uncertainty_hold_r $clock_uncertainty
+set uncertainty_setup_r $clock_uncertainty
+set delay_max_r 0.4
+set delay_min_r 0.2
+
+# NOTE(review): source_type is not read again in this script.
+set source_type "verilog"
+
+create_project $project_name "${output_dir}/$project_name" -force -part $board
+
+set_property DEFAULT_LIB work [current_project]
+set_property TARGET_LANGUAGE Verilog [current_project]
+
+read_verilog $prj_files
+read_xdc "${xdc_path}" -mode out_of_context
+
+set_property top $top_module [current_fileset]
+
+file mkdir $output_dir
+file mkdir "${output_dir}/reports"
+
+# Out-of-context synthesis, followed by a checkpoint and timing/power/utilization reports
+synth_design -top $top_module -mode out_of_context -global_retiming on \
+    -flatten_hierarchy full -resource_sharing auto -directive AreaOptimized_High
+
+write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"
+
+report_timing_summary -file "${output_dir}/reports/${project_name}_post_synth_timing.rpt"
+report_power -file "${output_dir}/reports/${project_name}_post_synth_power.rpt"
+report_utilization -file "${output_dir}/reports/${project_name}_post_synth_util.rpt"
diff --git a/hls4ml/templates/xls/constraints.xdc b/hls4ml/templates/xls/constraints.xdc
new file mode 100644
index 0000000000..4b4a9ab80a
--- /dev/null
+++ b/hls4ml/templates/xls/constraints.xdc
@@ -0,0 +1,21 @@
+# Timing constraints. The variables clock_period, uncertainty_setup_r, uncertainty_hold_r,
+# delay_max_r and delay_min_r are not set here and must exist before this file is read.
+# Scale the relative (*_r) factors by the clock period
+set uncertainty_setup [expr {$clock_period * $uncertainty_setup_r}]
+set uncertainty_hold [expr {$clock_period * $uncertainty_hold_r}]
+set delay_max [expr {$clock_period * $delay_max_r}]
+set delay_min [expr {$clock_period * $delay_min_r}]
+
+# Create clock with variable period
+create_clock -period $clock_period -name sys_clk [get_ports {clk}]
+
+# Input/Output constraints on the x[*] and out[*] ports, relative to sys_clk
+set_input_delay -clock sys_clk -max $delay_max [get_ports {x[*]}]
+set_input_delay -clock sys_clk -min $delay_min [get_ports {x[*]}]
+
+set_output_delay -clock sys_clk -max $delay_max [get_ports {out[*]}]
+set_output_delay -clock sys_clk -min $delay_min [get_ports {out[*]}]
+
+# Apply calculated uncertainty values
+set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
+set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]
diff --git a/hls4ml/templates/xls/firmware/ap_types/fixed_point_util.x b/hls4ml/templates/xls/firmware/ap_types/fixed_point_util.x
new file mode 100644
index 0000000000..488c53e787
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/ap_types/fixed_point_util.x
@@ -0,0 +1,1053 @@
+// Collection of utility functions for fixed_point::FixedPoint<NUM_BITS, BINARY_EXPONENT>.
+// Here we use abbreviations NB -> NUM_BITS, BE -> BINARY_EXPONENT.
+// fixed_point::FixedPoint<NB, BE>{significand: sN[NB]} represents a real number (significand * 2^BE)
+
+import std;
+import fixed_point;
+import round;
+
+type FixedPoint = fixed_point::FixedPoint;
+type Sign = round::Sign;
+
+// All modes from hls4ml.model.types.RoundingMode
+// NB: do not confuse with round::RoundingMode (the imported `round` module)!
+// TODO: not all modes are currently supported, see convert_rounding_mode()
+type RoundingModeIntegerType = u3;
+pub enum RoundingMode: RoundingModeIntegerType {
+    // Truncate towards -inf
+    TRN         = 1,
+    // Truncate towards 0
+    TRN_ZERO    = 2,
+    // Round towards +inf
+    RND         = 3,
+    // Round towards 0
+    RND_ZERO    = 4,
+    // Round towards +-inf
+    RND_INF     = 5,
+    // Round towards -inf
+    RND_MIN_INF = 6,
+    // Round towards nearest even
+    RND_CONV    = 7
+}
+
+// Same overflow modes as in the ac_fixed type and in hls4ml
+type OverflowModeIntegerType = u2;
+pub enum OverflowMode: OverflowModeIntegerType {
+    // Drop bits to the left of MSB
+    WRAP      = 0,
+    // Saturate to [MIN, MAX]
+    SAT       = 1,
+    // Set to 0 on overflow
+    SAT_ZERO  = 2,
+    // Saturate to [-MAX, MAX]
+    SAT_SYM   = 3
+}
+
+// === Non-public functions copied from stdlib/fixed_point.x ===
+
+// Returns the position of the most significant bit, where 0 is the bit just left of the binary
+// point.
+//
+// E.g. consider a value like x.xxxb, which corresponds to NB=4 BE=-3.
+// most_significant_bit_position(4,-3) is 0 (computed as NB + BE - 1 after widening to s33)
+fn most_significant_bit_position(NB: u32, BE: s32) -> s33 { NB as s33 + BE as s33 - s33:1 }
+
+// Returns the position of the least significant bit, where 0 is the bit just left of the binary
+// point.
+//
+// E.g. consider a value like xxxx.b, which corresponds to NB=4 BE=0.
+// least_significant_bit_position(4,0) is 0 (the result is always BE; NB is not used)
+fn least_significant_bit_position(NB: u32, BE: s32) -> s32 { BE }
+
+// Returns the number of representable bits where two fixed point numbers overlap.
+//
+// These examples use x to indicate a representable bit:
+// num_bits_overlapping(2,-1, 2,-1) -> x.x and x.x overlap = 2
+// num_bits_overlapping(2, -1, 3, -2) -> x.x and x.xx overlap = 2
+// num_bits_overlapping(4, 0, 2, -1)  -> xxxx and x.x overlap = 1
+// num_bits_overlapping(4, 1, 1, 0)  -> xxxx0 and x overlap = 0
+// num_bits_overlapping(4, 0, 2, -2)  -> xxxx and .xx overlap = 0
+// num_bits_overlapping(4, 0, 2, -3)  -> xxxx and .0xx overlap = 0
+pub fn num_bits_overlapping(NB_A: u32, BE_A: s32, NB_B: u32, BE_B: s32) -> u32 {
+    let msb_a = most_significant_bit_position(NB_A, BE_A);
+    let msb_b = most_significant_bit_position(NB_B, BE_B);
+    let lsb_a = least_significant_bit_position(NB_A, BE_A) as s33;
+    let lsb_b = least_significant_bit_position(NB_B, BE_B) as s33;
+    let overlap = std::min(msb_a, msb_b) - std::max(lsb_a, lsb_b) + s33:1;  // lowest MSB down to highest LSB
+    std::max(overlap, s33:0) as u32  // a negative count means no overlap, so clamp to 0
+}
+
+// Returns the total width of two fixed point numbers when their binary points are aligned and the
+// representable bits are unioned. Includes the bits that would always be zero if these values were
+// aligned and then ANDed or ORed.
+pub fn aligned_width(NB_A: u32, BE_A: s32, NB_B: u32, BE_B: s32) -> u32 {
+    assert!(NB_A > u32:0, "0_width_will_yield_nonsensical_results");
+    assert!(NB_B > u32:0, "0_width_will_yield_nonsensical_results");
+
+    let msb_a = most_significant_bit_position(NB_A, BE_A);
+    let msb_b = most_significant_bit_position(NB_B, BE_B);
+    let lsb_a = least_significant_bit_position(NB_A, BE_A);
+    let lsb_b = least_significant_bit_position(NB_B, BE_B);
+    let msb = std::max(msb_a, msb_b);  // highest bit position of either operand
+    let lsb = std::min(lsb_a, lsb_b) as s33;  // lowest bit position of either operand
+    let NB = msb - lsb + s33:1;
+    NB as u32
+}
+
+// === Create FixedPoint constants ===
+
+pub fn one<NB: u32, BE: s32>() -> FixedPoint<NB, BE> {  // the value 1: significand 1 << -BE
+    // If BE > 0, 1 is below quantization limit
+    const_assert!(BE <= s32:0);
+    let SHIFT = std::abs(BE) as u32;
+    const_assert!(SHIFT <= NB);  // NOTE(review): 1 << SHIFT is positive in sN[NB] only for SHIFT <= NB - 2; confirm bound
+    let x = sN[NB]:1 << SHIFT;
+    fixed_point::make_fixed_point<BE>(x)
+}
+
+pub fn max_value<NB: u32, BE: s32>() -> FixedPoint<NB, BE> {  // significand is the signed maximum of NB bits
+    fixed_point::make_fixed_point<BE>(std::signed_max_value<NB>())
+}
+
+pub fn min_value<NB: u32, BE: s32>() -> FixedPoint<NB, BE> {  // significand is the signed minimum of NB bits
+    fixed_point::make_fixed_point<BE>(std::signed_min_value<NB>())
+}
+
+// === Create FixedPoint arrays from arrays of significands sN[NB] ===
+
+
+pub fn make_fixed_points_1d
+    <BE: s32, NB: u32, DIM: u32>  // BE comes first so that callers can pass it explicitly
+    (significands: sN[NB][DIM])
+    -> FixedPoint<NB, BE>[DIM] {
+    map(significands, fixed_point::make_fixed_point<BE>)  // wrap every significand with exponent BE
+}
+
+pub fn make_fixed_points_2d
+    <BE: s32, NB: u32, DIM_0: u32, DIM_1: u32>
+    (significands: sN[NB][DIM_1][DIM_0])  // DIM_0 rows of DIM_1 significands
+    -> FixedPoint<NB, BE>[DIM_1][DIM_0] {
+    map(significands, make_fixed_points_1d<BE>)  // convert row by row
+}
+
+pub fn make_fixed_points_3d
+    <BE: s32, NB: u32, DIM_0: u32, DIM_1: u32, DIM_2: u32>
+    (significands: sN[NB][DIM_2][DIM_1][DIM_0])  // DIM_0 is the outermost dimension
+    -> FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0] {
+    map(significands, make_fixed_points_2d<BE>)  // convert each 2D slice
+}
+
+pub fn make_fixed_points_4d
+    <BE: s32, NB: u32, DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32>
+    (significands: sN[NB][DIM_3][DIM_2][DIM_1][DIM_0])  // DIM_0 is the outermost dimension
+    -> FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0] {
+    map(significands, make_fixed_points_3d<BE>)  // convert each 3D slice
+}
+
+pub fn const_array_1d
+    <DIM: u32, NB: u32, BE: s32>  // DIM comes first so that callers can pass it explicitly
+    (value: FixedPoint<NB, BE>)
+    -> FixedPoint<NB, BE>[DIM] {
+    FixedPoint<NB, BE>[DIM]:[value, ...]  // DIM copies of value
+}
+
+pub fn const_array_2d
+    <DIM_0: u32, DIM_1: u32, NB: u32, BE: s32>
+    (value: FixedPoint<NB, BE>)
+    -> FixedPoint<NB, BE>[DIM_1][DIM_0] {
+    FixedPoint<NB, BE>[DIM_1][DIM_0]:[const_array_1d<DIM_1>(value), ...]  // DIM_0 rows of length DIM_1
+}
+
+pub fn const_array_3d
+    <DIM_0: u32, DIM_1: u32, DIM_2: u32, NB: u32, BE: s32>
+    (value: FixedPoint<NB, BE>)
+    -> FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0] {
+    FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0]:[const_array_2d<DIM_1, DIM_2>(value), ...]  // DIM_0 slices of [DIM_2][DIM_1]
+}
+
+pub fn const_array_4d
+    <DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32, NB: u32, BE: s32>
+    (value: FixedPoint<NB, BE>)
+    -> FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0] {
+    // Each of the DIM_0 elements is a [DIM_3][DIM_2][DIM_1] block, so the inner dims are DIM_1..DIM_3.
+    FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0]:[const_array_3d<DIM_1, DIM_2, DIM_3>(value), ...] }
+
+
+// === Compare ===
+
+pub enum Compare: s2 {  // how a relates to b in compare(a, b)
+    LESS = -1,
+    EQUAL = 0,
+    GREATER = 1
+}
+
+pub fn compare<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> Compare {
+    let diff = fixed_point::sub(a, b).significand;  // operands may differ in width and exponent
+    if (diff == 0)
+        { Compare::EQUAL }
+    else if (std::msb(diff) == u1:1)  // sign bit set: a - b is negative
+        { Compare::LESS }
+    else
+        { Compare::GREATER }
+}
+
+pub fn greater<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> bool {
+    compare(a, b) as s2 == Compare::GREATER as s2  // true iff a > b
+}
+
+pub fn greater_or_equal<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> bool {
+    compare(a, b) as s2 >= Compare::EQUAL as s2  // EQUAL (0) or GREATER (1), i.e. a >= b
+}
+
+pub fn less<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> bool {
+    compare(a, b) as s2 == Compare::LESS as s2  // true iff a < b
+}
+
+pub fn less_or_equal<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> bool {
+    compare(a, b) as s2 <= Compare::EQUAL as s2  // LESS (-1) or EQUAL (0), i.e. a <= b
+}
+
+pub fn equal<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>
+) -> bool {
+    compare(a, b) as s2 == Compare::EQUAL as s2  // true iff a - b is zero
+}
+
+fn check_compare_impl<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>,
+    expected_compare_result: Compare
+) {
+    let compare_result = compare(a, b);
+    assert_eq(compare_result as s2, expected_compare_result as s2);
+    // Every boolean predicate must agree with the expected ordering as well.
+    match expected_compare_result {
+        Compare::LESS => {
+            assert_eq(less(a,b), true);
+            assert_eq(less_or_equal(a,b), true);
+            assert_eq(equal(a,b), false);
+            assert_eq(greater_or_equal(a,b), false);
+            assert_eq(greater(a,b), false);
+        },
+        Compare::EQUAL => {
+            assert_eq(less(a,b), false);
+            assert_eq(less_or_equal(a,b), true);
+            assert_eq(equal(a,b), true);
+            assert_eq(greater_or_equal(a,b), true);
+            assert_eq(greater(a,b), false);
+        },
+        Compare::GREATER => {
+                assert_eq(less(a,b), false);
+                assert_eq(less_or_equal(a,b), false);
+                assert_eq(equal(a,b), false);
+                assert_eq(greater_or_equal(a,b), true);
+                assert_eq(greater(a,b), true);
+        }
+    };
+}
+
+fn check_compare<
+    NB_A: u32, BE_A: s32,
+    NB_B: u32, BE_B: s32
+>(
+    a: FixedPoint<NB_A, BE_A>,
+    b: FixedPoint<NB_B, BE_B>,
+    expected_compare_result: Compare
+) {
+    check_compare_impl(a, b, expected_compare_result);
+    check_compare_impl(b, a, match expected_compare_result {  // swapped operands expect the mirrored result
+        Compare::LESS => Compare::GREATER,
+        Compare::EQUAL => Compare::EQUAL,
+        Compare::GREATER => Compare::LESS
+    });
+}
+
+#[test]
+fn test_compare() {
+    let minus_one = fixed_point::from_integer(s3:-1);
+    let zero = fixed_point::from_integer(s3:0);
+    let one = fixed_point::from_integer(s3:1);
+    let two = fixed_point::from_integer(s3:2);
+    // The same four values again, in wider formats with different exponents.
+    let minus_one_big = fixed_point::make_fixed_point<-8>(s16:-256);
+    let zero_big = fixed_point::make_fixed_point<-4>(s8:0);
+    let one_big = fixed_point::make_fixed_point<-5>(s12:32);
+    let two_big = fixed_point::make_fixed_point<-1>(s12:4);
+
+    let values = [minus_one, zero, one, two];
+    // Cannot make it an array because of different types
+    let values_big = (minus_one_big, zero_big, one_big, two_big);
+
+    check_compare(minus_one, minus_one_big, Compare::EQUAL);
+    check_compare(one, one_big, Compare::EQUAL);
+    // Both lists are in ascending order, so the expected result follows from the indices.
+    for (i, _) in u32:0..4 {
+        for (j, _) in u32:0..4 {
+            let expected_result = if (i < j) {
+                Compare::LESS
+            } else if (i == j) {
+                Compare::EQUAL
+            } else {
+                Compare::GREATER
+            };
+            let a = values[i];
+            // values_big[i] or values_big.i does not compile,
+            // so we iterate manually
+            match j {
+                u32:0 => check_compare(a, values_big.0, expected_result),
+                u32:1 => check_compare(a, values_big.1, expected_result),
+                u32:2 => check_compare(a, values_big.2, expected_result),
+                u32:3 => check_compare(a, values_big.3, expected_result),
+                _     => fail!("index_out_of_bounds", ())
+            }
+        }(())
+    }(())
+}
+
+
+// === Transpose ===
+
+pub fn transpose
+<NB: u32, BE: s32, DIM_0: u32, DIM_1: u32>
+(x: FixedPoint<NB, BE>[DIM_1][DIM_0])  // DIM_0 rows of DIM_1 elements
+-> FixedPoint<NB, BE>[DIM_0][DIM_1] {
+    let res = zero!<FixedPoint<NB, BE>[DIM_0][DIM_1]>();
+    for (i, res) in 0..DIM_0 {
+        for (j, res) in 0..DIM_1 {
+            update(res, (j,i), x[i][j])  // res[j][i] = x[i][j]
+        }(res)
+    }(res)
+}
+
+#[test]
+fn test_transpose() {
+    let x = make_fixed_points_2d<0>([[s16:1, 2, 3], [s16:4, 5, 6]]);  // 2 rows of 3
+    let x_t = make_fixed_points_2d<0>([[s16:1, 4], [s16:2, 5], [s16:3, 6]]);  // 3 rows of 2
+    assert_eq(x_t, transpose(x));
+    assert_eq(x, transpose(x_t));  // transposing twice gives the input back
+}
+
+// Reshape to and from 1D arrays with C-style (row-major) ordering.
+
+pub fn flatten_2d<
+    NB: u32, BE: s32,
+    DIM_0: u32, DIM_1: u32,
+    DIM: u32 = {DIM_0 * DIM_1}  // derived length of the flat result
+>
+(x: FixedPoint<NB, BE>[DIM_1][DIM_0])
+-> FixedPoint<NB, BE>[DIM] {
+    let res = zero!<FixedPoint<NB, BE>[DIM]>();
+    for (i, res) in 0..DIM_0 {
+        for (j, res) in 0..DIM_1 {
+            update(res, i * DIM_1 + j, x[i][j])  // x[i][j] lands at flat index i * DIM_1 + j
+        }(res)
+    }(res)
+}
+
+pub fn flatten_3d<
+    NB: u32, BE: s32,
+    DIM_0: u32, DIM_1: u32, DIM_2: u32,
+    DIM: u32 = {DIM_0 * DIM_1 * DIM_2}  // derived length of the flat result
+>(x: FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0])
+-> FixedPoint<NB, BE>[DIM] {
+    flatten_2d(map(x, flatten_2d))  // flatten each 2D slice, then the resulting 2D array
+}
+
+pub fn flatten_4d<
+    NB: u32, BE: s32,
+    DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32,
+    DIM: u32 = {DIM_0 * DIM_1 * DIM_2 * DIM_3}  // derived length of the flat result
+>(x: FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0])
+-> FixedPoint<NB, BE>[DIM] {
+    flatten_2d(map(x, flatten_3d))  // flatten each 3D slice, then the resulting 2D array
+}
+
+pub fn reshape_to_2d<
+    DIM_0: u32, DIM_1: u32,  // target dimensions come first so that callers can pass them explicitly
+    NB: u32, BE: s32,
+    DIM: u32 = {DIM_0 * DIM_1}>
+(x: FixedPoint<NB, BE>[DIM])
+-> FixedPoint<NB, BE>[DIM_1][DIM_0] {
+    let res = zero!<FixedPoint<NB, BE>[DIM_1][DIM_0]>();
+    for (i, res) in 0..DIM_0 {
+        for (j, res) in 0..DIM_1 {
+            update(res, (i, j), x[i * DIM_1 + j])  // res[i][j] = x[i * DIM_1 + j], the inverse of flatten_2d
+        }(res)
+    }(res)
+}
+
+pub fn reshape_to_3d<
+    DIM_0: u32, DIM_1: u32, DIM_2: u32,
+    NB: u32, BE: s32,
+    DIM: u32 = {DIM_0 * DIM_1 * DIM_2}>
+(x: FixedPoint<NB, BE>[DIM])
+-> FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0] {
+    let x_2d = reshape_to_2d<DIM_0, {DIM_1 * DIM_2}>(x);  // split off the outermost dimension first
+    map(x_2d, reshape_to_2d<DIM_1, DIM_2>)  // then reshape every row into [DIM_2][DIM_1]
+}
+
+pub fn reshape_to_4d<
+    DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32,
+    NB: u32, BE: s32,
+    DIM: u32 = {DIM_0 * DIM_1 * DIM_2 * DIM_3}>
+(x: FixedPoint<NB, BE>[DIM])
+-> FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0] {
+    let x_2d = reshape_to_2d<DIM_0, {DIM_1 * DIM_2 * DIM_3}>(x);  // split off the outermost dimension first
+    map(x_2d, reshape_to_3d<DIM_1, DIM_2, DIM_3>)  // then reshape every row into [DIM_3][DIM_2][DIM_1]
+}
+
+#[test]
+fn test_reshape_2d() {
+    let x_flat = make_fixed_points_1d<0>([s16:1, 2, 3, 4, 5, 6]);
+    let x = make_fixed_points_2d<0>([[s16:1, 2, 3], [s16:4, 5, 6]]);  // 2 rows of 3
+    assert_eq(x, reshape_to_2d<2,3>(x_flat));
+    assert_eq(x_flat, flatten_2d(x));  // flatten_2d undoes reshape_to_2d
+}
+
+#[test]
+fn test_reshape_3d() {
+    let x_flat = make_fixed_points_1d<0>([s16:1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
+    let x = make_fixed_points_3d<0>([[[s16:1, 2], [s16:3, 4], [s16:5, 6]], [[s16:7, 8], [s16:9, 10], [s16:11, 12]]]);
+    assert_eq(x, reshape_to_3d<2,3,2>(x_flat));  // 2 slices, each 3 rows of 2
+    assert_eq(x_flat, flatten_3d(x));  // flatten_3d undoes reshape_to_3d
+}
+
+#[test]
+fn test_reshape_4d() {
+    let x = make_fixed_points_4d<0>([[[[s16:1, 2], [s16:3, 4], [s16:5, 6]]], [[[s16:7, 8], [s16:9, 10], [s16:11, 12]]]]);
+    let x_flat = make_fixed_points_1d<0>([s16:1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
+    assert_eq(x, reshape_to_4d<2,1,3,2>(x_flat));  // dimensions from outermost to innermost: 2, 1, 3, 2
+    assert_eq(x_flat, flatten_4d(x));  // flatten_4d undoes reshape_to_4d
+}
+
+// === Convert FixedPoint array to array of significands sN[NB] ===
+
+pub fn to_significand
+    <NB: u32, BE: s32>
+    (x: FixedPoint<NB, BE>)
+    -> sN[NB] {
+    x.significand  // the raw integer; the exponent BE is dropped
+}
+
+pub fn to_significand_1d
+    <NB: u32, BE: s32, DIM_0: u32>
+    (x: FixedPoint<NB, BE>[DIM_0])
+    -> sN[NB][DIM_0] {
+    map(x, to_significand)  // element-wise
+}
+pub fn to_significand_2d
+    <NB: u32, BE: s32, DIM_0: u32, DIM_1: u32>
+    (x: FixedPoint<NB, BE>[DIM_1][DIM_0])
+    -> sN[NB][DIM_1][DIM_0] {
+    map(x, to_significand_1d)  // row by row
+}
+pub fn to_significand_3d
+    <NB: u32, BE: s32, DIM_0: u32, DIM_1: u32, DIM_2: u32>
+    (x: FixedPoint<NB, BE>[DIM_2][DIM_1][DIM_0])
+    -> sN[NB][DIM_2][DIM_1][DIM_0] {
+    map(x, to_significand_2d)  // one 2D slice at a time
+}
+pub fn to_significand_4d
+    <NB: u32, BE: s32, DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32>
+    (x: FixedPoint<NB, BE>[DIM_3][DIM_2][DIM_1][DIM_0])
+    -> sN[NB][DIM_3][DIM_2][DIM_1][DIM_0] {
+    map(x, to_significand_3d)  // one 3D slice at a time
+}
+
+// === Change width and exponent ===
+
+fn overflow_truncated<OVERFLOW: OverflowMode, N: u32>(
+    // result of truncate_msbs(x) or truncate_lsbs(x)
+    truncated: sN[N],
+    // Sign of the result (need to pass it because it could be lost during truncation)
+    sign: Sign,
+    // Did overflow happen during truncation?
+    had_overflow: bool
+    ) -> sN[N] {
+    // Returns `truncated` unchanged unless an overflow has to be resolved according to OVERFLOW.
+    assert!(N != 0, "illegal_zero_width");
+    // TODO: this fails due to eager instantiation for N=0
+    // let MAX = std::signed_max_value<N>();
+    // let MIN = std::signed_min_value<N>();
+    let MAX = (std::signed_max_value<{N+2}>() >> 2) as sN[N];
+    let MIN = (std::signed_min_value<{N+2}>() >> 2) as sN[N];
+
+    let has_overflow = match OVERFLOW {
+        OverflowMode::SAT_SYM => had_overflow || (truncated == MIN),  // MIN lies outside [-MAX, MAX]
+        _ => had_overflow
+    };
+
+    if has_overflow {
+        match OVERFLOW {
+            OverflowMode::WRAP => {
+                truncated
+            },
+            OverflowMode::SAT => {
+                match sign {
+                    Sign::NonNegative => MAX,
+                    Sign::Negative    => MIN
+                }
+            },
+            OverflowMode::SAT_ZERO => {
+                sN[N]:0
+            },
+            OverflowMode::SAT_SYM => {
+                match sign {
+                    Sign::NonNegative => MAX,
+                    Sign::Negative    => -MAX
+                }
+            }
+        }
+    }
+    else {
+        truncated
+    }
+}
+
+// Drop (NB_IN - NB_OUT) MSBs and handle overflow
+fn truncate_msbs<NB_OUT: u32, OVERFLOW: OverflowMode, NB_IN: u32>
+    (x: sN[NB_IN]) -> sN[NB_OUT] {
+
+    // TODO const_assert! fails due to eager instantiation.
+    // const_assert!(NB_IN > NB_OUT);
+    // let NB_OVERFLOW = NB_IN - NB_OUT;
+    assert!(NB_IN > NB_OUT, "truncate_msbs_nothing_to_truncate");
+    let NB_OVERFLOW = std::usub_or_zero(NB_IN, NB_OUT);
+
+    // TODO: this causes const_assert! in split_lsbs.
+    // So we have to introduce NB_SPLIT
+    // let (msbs, lsbs) = std::split_lsbs<NB_OUT>(std::to_unsigned(x));
+    let NB_SPLIT = std::min(NB_IN, NB_OUT);
+    let (_, lsbs) = std::split_lsbs<NB_SPLIT>(std::to_unsigned(x));
+    let truncated = std::to_signed(lsbs) as sN[NB_OUT];
+
+    // TODO this fails due to eager instantiation for NB_IN = 0
+    // let sign:Sign = std::msb(x) as Sign;
+    let sign:Sign = std::msb((x as sN[NB_IN + 1]) << 1) as Sign;
+
+    // TODO this fails due to eager instantiation for NB_IN = 0
+    // let NB_SIGN_EXT = NB_OVERFLOW + 1;
+    let NB_SIGN_EXT = std::min(NB_OVERFLOW + 1, NB_IN);
+    // If there is no overflow, the dropped bits and the sign bit are either 000..0 or 111..1
+    let sign_ext = match sign {
+        Sign::NonNegative => zero!<uN[NB_SIGN_EXT]>(),
+        Sign::Negative => all_ones!<uN[NB_SIGN_EXT]>()
+    };
+    // Take all truncated bits and the sign bit
+    let (msbs, _) = std::split_msbs<NB_SIGN_EXT>(std::to_unsigned(x));
+
+    // NB: overflow also happens when truncated == MIN for OverflowMode::SAT_SYM
+    // We handle this inside overflow_truncated()
+    let had_overflow = (msbs != sign_ext);
+    overflow_truncated<OVERFLOW>(truncated as sN[NB_OUT], sign, had_overflow)
+}
+
+fn convert_rounding_mode<rm: RoundingMode>() -> round::RoundingMode {  // maps our modes onto round::RoundingMode
+    match rm {
+        RoundingMode::TRN         => round::RoundingMode::RTN,
+        RoundingMode::TRN_ZERO    => round::RoundingMode::RTZ,
+        // RoundingMode::RND         => TODO,
+        // RoundingMode::RND_ZERO    => TODO,
+        RoundingMode::RND_INF     => round::RoundingMode::RNA,
+        // RoundingMode::RND_MIN_INF => TODO,
+        RoundingMode::RND_CONV    => round::RoundingMode::RNE,
+        _ => {  // unsupported modes trip the assertion; RTN below only satisfies the return type
+            assert_fmt!(false, "unsupported_RoundingMode_{}", (rm as RoundingModeIntegerType));
+            round::RoundingMode::RTN
+        }
+    }
+}
+
+// round::round_trunc_s, but with our RoundingMode passed as a parametric and converted on the way
+fn round_trunc_s<NUM_BITS_ROUNDED: u32, ROUNDING: RoundingMode, N: u32, R: u32 = {N - NUM_BITS_ROUNDED}>
+    (unrounded: sN[N]) -> (u1, sN[R]) {
+    round::round_trunc_s<NUM_BITS_ROUNDED>(convert_rounding_mode<ROUNDING>(), unrounded)
+}
+
+// Drop (NB_IN - NB_OUT) LSBs using RoundingMode,
+// and handle possible overflow (e.g. rounding MAX up) according to OverflowMode.
+fn truncate_lsbs<NB_OUT: u32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode, NB_IN: u32>
+    (x: sN[NB_IN]) -> sN[NB_OUT] {
+
+    // TODO const_assert! fails due to eager instantiation
+    // const_assert!(NB_IN > NB_OUT);
+    // let NUM_BITS_ROUNDED = NB_IN - NB_OUT;
+    assert!(NB_IN > NB_OUT, "truncate_lsbs_nothing_to_truncate");
+    let NUM_BITS_ROUNDED = std::usub_or_zero(NB_IN, NB_OUT);
+    // The first tuple element is used as the overflow flag of the rounding step.
+    let (had_overflow, truncated) = round_trunc_s<NUM_BITS_ROUNDED, ROUNDING>(x);
+    let sign = std::msb(x) as Sign;  // sign of the input, taken before rounding
+    overflow_truncated<OVERFLOW>(truncated as sN[NB_OUT], sign, had_overflow)
+}
+
+// FixedPoint<NB, BE> ~ ac_fixed<NB, NB + BE>
+// ~ significand * 2^BE
+// 0b00111.001 ~ FixedPoint<8,-3>
+pub fn resize<
+    NB_OUT: u32, BE_OUT: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32>
+    (x: FixedPoint<NB_IN, BE_IN>)
+    -> FixedPoint<NB_OUT, BE_OUT>{
+    // Two steps: move the exponent to BE_OUT (may drop LSBs), then fit the width to NB_OUT (may drop MSBs).
+    let SHIFT: s32 = BE_IN - BE_OUT;
+
+    let NB_ALIGNED = if (SHIFT >= s32:0) {
+        NB_IN + std::to_unsigned(SHIFT)
+    }
+    else {
+        std::usub_or_zero(NB_IN, std::to_unsigned(-SHIFT))
+    };
+
+    // Align exponent
+    let aligned : sN[NB_ALIGNED] =
+        if (SHIFT >= s32:0) {
+            (x.significand as sN[NB_ALIGNED]) << std::to_unsigned(SHIFT)
+        } else if (NB_ALIGNED == 0) {
+            // TODO: move this case inside truncate_lsbs?
+            zero!<sN[NB_ALIGNED]>()
+        } else {
+            truncate_lsbs<NB_ALIGNED, ROUNDING, OVERFLOW>(x.significand)  // NOTE(review): rounding overflow is resolved at NB_ALIGNED bits, before any widening to NB_OUT - confirm
+        };
+
+    // Resize width
+    let resized = if (NB_OUT < NB_ALIGNED) {
+        truncate_msbs<NB_OUT, OVERFLOW>(aligned)
+    } else if (NB_OUT == NB_ALIGNED){
+        // Here overflow_truncated() will change the result only in SAT_SYM mode, if aligned == MIN.
+        let sign = std::msb(aligned as sN[NB_OUT]) as Sign;
+        let had_overflow = false;
+        overflow_truncated<OVERFLOW>(aligned as sN[NB_OUT], sign, had_overflow)
+    } else {
+        aligned as sN[NB_OUT]
+    };
+
+    FixedPoint<NB_OUT, BE_OUT>{ significand: resized }
+}
+
+// Element-wise resize() over a 1-D array of FixedPoint values.
+// Every element is converted from the shared input type to FixedPoint<NB_OUT, BE_OUT>.
+pub fn resize_1d<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32>
+    (x: FixedPoint<NB_IN, BE_IN>[DIM]) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, resized) in 0..DIM {
+        let element = resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]);
+        update(resized, i, element)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// Element-wise resize() over a 2-D array, applied one row (resize_1d) at a time.
+// x holds DIM_0 rows, each of type FixedPoint<NB_IN, BE_IN>[DIM_1].
+pub fn resize_2d<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM_0: u32, DIM_1: u32>
+    (x: FixedPoint<NB_IN, BE_IN>[DIM_1][DIM_0]) -> FixedPoint<NB_OUT, BE_OUT>[DIM_1][DIM_0] {
+    for (i, resized) in 0..DIM_0 {
+        let row = resize_1d<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]);
+        update(resized, i, row)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM_1][DIM_0]>())
+}
+
+// Element-wise resize() over a 3-D array, applied one 2-D slice (resize_2d) at a time.
+// x holds DIM_0 slices, each of type FixedPoint<NB_IN, BE_IN>[DIM_2][DIM_1].
+pub fn resize_3d<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM_0: u32, DIM_1: u32, DIM_2: u32>
+    (x: FixedPoint<NB_IN, BE_IN>[DIM_2][DIM_1][DIM_0]) -> FixedPoint<NB_OUT, BE_OUT>[DIM_2][DIM_1][DIM_0] {
+    for (i, resized) in 0..DIM_0 {
+        let slice = resize_2d<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]);
+        update(resized, i, slice)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM_2][DIM_1][DIM_0]>())
+}
+
+// Element-wise resize() over a 4-D array, applied one 3-D block (resize_3d) at a time.
+// x holds DIM_0 blocks, each of type FixedPoint<NB_IN, BE_IN>[DIM_3][DIM_2][DIM_1].
+pub fn resize_4d<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM_0: u32, DIM_1: u32, DIM_2: u32, DIM_3: u32>
+    (x: FixedPoint<NB_IN, BE_IN>[DIM_3][DIM_2][DIM_1][DIM_0]) -> FixedPoint<NB_OUT, BE_OUT>[DIM_3][DIM_2][DIM_1][DIM_0] {
+    for (i, resized) in 0..DIM_0 {
+        let block = resize_3d<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]);
+        update(resized, i, block)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM_3][DIM_2][DIM_1][DIM_0]>())
+}
+
+
+// Test helper: resizes `input` with the given modes and checks the result against `expected_output`.
+fn resize_test_case<
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32>
+    (input: FixedPoint<NB_IN, BE_IN>, expected_output: FixedPoint<NB_OUT, BE_OUT>) {
+    // The target type (NB_OUT, BE_OUT) is taken from the type of expected_output.
+    assert_eq(resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(input), expected_output);
+}
+
+#[test]
+fn test_resize() {
+    // Exponent 0 -> -2 with two extra bits of width: the significand is shifted left by 2.
+    // Checked for a positive and a negative input; no bits are dropped in either case.
+    resize_test_case<RoundingMode::TRN, OverflowMode::WRAP>(
+        fixed_point::make_fixed_point<0>(s2:1),
+        fixed_point::make_fixed_point<-2>(s4:1 << 2)
+    );
+    resize_test_case<RoundingMode::TRN, OverflowMode::WRAP>(
+        fixed_point::make_fixed_point<0>(s2:-1),
+        fixed_point::make_fixed_point<-2>(s4:-1 << 2)
+    );
+}
+
+#[test]
+fn test_resize_more() {
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+
+    // widen width only (sign extension): exponent unchanged, s2 -> s4
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s2:1),
+        fixed_point::make_fixed_point<0>(s4:1)
+    );
+
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s2:-1),
+        fixed_point::make_fixed_point<0>(s4:-1)
+    );
+
+    // exponent decrease (SHIFT > 0) → left shift: exponent 0 -> -2, s3 -> s5
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s3:1),
+        fixed_point::make_fixed_point<-2>(s5:1 << 2)
+    );
+
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s3:-2),
+        fixed_point::make_fixed_point<-2>(s5:-2 << 2)
+    );
+
+    // exponent increase (SHIFT < 0) → truncate LSBs; with TRN: 1.5 -> 1 and -1.5 -> -2
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<-2>(s4:0b0110), // 1.5
+        fixed_point::make_fixed_point<0>(s2:1)
+    );
+
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<-2>(s4:0b1010), // -1.5
+        fixed_point::make_fixed_point<0>(s2:-2)
+    );
+
+    // full LSB truncation (NB_ALIGNED = 0): the expected result is 0 for both signs
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<-1>(s3:3),
+        fixed_point::make_fixed_point<3>(s4:0)
+    );
+
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<-1>(s3:-3),
+        fixed_point::make_fixed_point<3>(s4:0)
+    );
+
+    // MSB truncation (wrap): only the low 3 bits of the s5 input are kept
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s5:0b10110), // -10
+        fixed_point::make_fixed_point<0>(s3:0b110) // -2
+    );
+
+    resize_test_case<R,O>(
+        fixed_point::make_fixed_point<0>(s5:-7), // 0b11001
+        fixed_point::make_fixed_point<0>(s3:1) // 0b001
+    );
+}
+
+// Test helper: resizes integer x (exponent 0, TRN rounding) from NB_IN to NB_OUT bits and checks it against `expected`.
+fn resize_overflow_test_case<
+    OVERFLOW: OverflowMode,
+    NB_IN: u32,
+    NB_OUT: u32
+>(
+    x: sN[NB_IN],
+    expected: sN[NB_OUT]
+) {
+    let input = fixed_point::make_fixed_point<0>(x);
+    let expected_output = fixed_point::make_fixed_point<0>(expected);
+    resize_test_case<RoundingMode::TRN, OVERFLOW>(input, expected_output);
+}
+
+#[test]
+fn test_resize_overflow_modes() {
+    // WRAP: the low NB_OUT bits are kept (0b01111 -> 0b111, 0b01000 -> 0b000)
+    resize_overflow_test_case<OverflowMode::WRAP>(s5:15, s3:-1);
+    resize_overflow_test_case<OverflowMode::WRAP>(s5:8, s3:0);
+    // SAT: out-of-range values are expected to clamp to MIN / MAX of the output width
+    resize_overflow_test_case<OverflowMode::SAT>(s5:15, s4:7);
+    resize_overflow_test_case<OverflowMode::SAT>(s5:15, s3:3);
+    resize_overflow_test_case<OverflowMode::SAT>(s5:-16, s4:-8);
+    resize_overflow_test_case<OverflowMode::SAT>(s5:-16, s3:-4);
+    // SAT_ZERO: out-of-range values are expected to become 0
+    resize_overflow_test_case<OverflowMode::SAT_ZERO>(s5:15,s3:0);
+    resize_overflow_test_case<OverflowMode::SAT_ZERO>(s5:-15,s3:0);
+    resize_overflow_test_case<OverflowMode::SAT_ZERO>(s5:-9,s3:0);
+    // SAT_SYM: the negative bound is expected to be -MAX, even when the width is unchanged (s5 -> s5)
+    resize_overflow_test_case<OverflowMode::SAT_SYM>(s5:-16, s3:-3);
+    resize_overflow_test_case<OverflowMode::SAT_SYM>(s5:-16, s5:-15);
+    resize_overflow_test_case<OverflowMode::SAT_SYM>(s5:15, s5:15);
+}
+
+
+// === Queries ===
+
+
+pub fn max<NB: u32, BE: s32>  // Larger of two values of the same type, decided by their significands.
+    (x: FixedPoint<NB, BE>, y: FixedPoint<NB, BE>) -> FixedPoint<NB, BE> {
+    if (x.significand >= y.significand) { x } else { y }
+}
+
+// Largest element of a 1-D array whose elements all share one FixedPoint type.
+pub fn max_1d
+    <NB: u32, BE: s32, DIM: u32>
+    (xs: FixedPoint<NB, BE>[DIM])
+    -> FixedPoint<NB, BE> {
+    // Seeded with xs[0] yet iterating from 0 rather than 1:
+    // we could do 1..DIM, but compilation fails for an empty range.
+    let largest = for (i, best) in 0..DIM { std::max(best, xs[i].significand) }(xs[0].significand);
+    fixed_point::make_fixed_point<BE>(largest)
+}
+
+
+// === Clip ===
+
+// Clamps x to [min_value, max_value]; all three operands share one FixedPoint type.
+pub fn clip<NB: u32, BE: s32>(
+    x: FixedPoint<NB, BE>,
+    min_value: FixedPoint<NB, BE>,
+    max_value: FixedPoint<NB, BE>
+    ) -> FixedPoint<NB, BE> {
+    // The lower bound is tested first, so it takes precedence when both tests hold.
+    let below_min = fixed_point::sub(x, min_value).significand < 0;
+    let above_max = fixed_point::sub(x, max_value).significand > 0;
+    if (below_min) { min_value }
+    else if (above_max) { max_value }
+    else { x }
+}
+
+// Clamps x to [min_value, max_value] and converts the selected value to FixedPoint<NB_OUT, BE_OUT>.
+// The three operands may have different types; comparisons are made on fixed_point::sub results.
+pub fn clip_resize<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    NB_MIN: u32, BE_MIN: s32,
+    NB_MAX: u32, BE_MAX: s32>(
+        x: FixedPoint<NB_IN, BE_IN>,
+        min_value: FixedPoint<NB_MIN, BE_MIN>,
+        max_value: FixedPoint<NB_MAX, BE_MAX>
+    ) -> FixedPoint<NB_OUT, BE_OUT> {
+    let below_min = fixed_point::sub(x, min_value).significand < 0;
+    let above_max = fixed_point::sub(x, max_value).significand > 0;
+    if (below_min) { resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(min_value) }
+    else if (above_max) { resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(max_value) }
+    else { resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x) }
+}
+
+// === Arithmetic operations ===
+
+// Returns -x.
+// The result is one bit wider than the input so that x = -2^(NB_IN-1) does not overflow.
+pub fn negate<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32 = {NB_IN + 1}, BE_OUT: s32 = {BE_IN}
+>
+(x: FixedPoint<NB_IN, BE_IN>)
+-> FixedPoint<NB_OUT, BE_OUT> {
+    let widened = x.significand as sN[NB_OUT];
+    fixed_point::make_fixed_point<BE_OUT>(-widened)
+}
+
+// Returns -x in the type of x (no extra bit).
+// The one-bit-wider result of negate() is narrowed back to NB bits according to OVERFLOW.
+pub fn negate_with_overflow<
+    OVERFLOW: OverflowMode,
+    NB: u32, BE: s32
+>
+(x: FixedPoint<NB, BE>)
+-> FixedPoint<NB, BE> {
+    let exact = negate(x).significand;
+    fixed_point::make_fixed_point<BE>(truncate_msbs<NB, OVERFLOW>(exact))
+}
+
+// Test helper: checks negate() and negate_with_overflow() for the NB-bit values in MIN..MAX.
+// NOTE(review): the range MIN..MAX does not include MAX itself - confirm whether that is intended.
+fn negate_test_case<NB: u32, BE: s32, OVERFLOW: OverflowMode>() {
+    let NB_OUT = NB + 1;
+    let MIN = std::signed_min_value<NB>();
+    let MAX = std::signed_max_value<NB>();
+    for (i, _) in MIN..MAX {
+        let x = fixed_point::make_fixed_point<BE>(i);
+        // Reference values: negation in NB + 1 bits, then resized back to NB bits for the overflow variant.
+        let exact = fixed_point::make_fixed_point<BE>(-(i as sN[NB_OUT]));
+        let narrowed = resize<NB, BE, RoundingMode::TRN, OVERFLOW>(exact);
+        assert_eq(exact, negate(x));
+        assert_eq(narrowed, negate_with_overflow<OVERFLOW>(x));
+    }(());
+}
+
+#[test]
+fn test_negate() {  // 3-bit values with exponent 0, one run per overflow mode
+    negate_test_case<3, 0, OverflowMode::WRAP>();
+    negate_test_case<3, 0, OverflowMode::SAT>();
+    negate_test_case<3, 0, OverflowMode::SAT_ZERO>();
+    negate_test_case<3, 0, OverflowMode::SAT_SYM>();
+}
+
+
+// Adds fxd_a to fxd_b and returns the sum in the type of fxd_b (the sum is not widened).
+// WARNING: rhs must be wide enough to avoid any overflow
+// NOTE(review): the shift amount is (BE_A - BE_B) as u32, so this presumably requires BE_A >= BE_B - confirm.
+pub fn add_already_widened
+    <NB_A: u32, BE_A: s32, NB_B: u32, BE_B: s32>
+    (fxd_a: FixedPoint<NB_A, BE_A>, fxd_b: FixedPoint<NB_B, BE_B>)
+    -> FixedPoint<NB_B, BE_B> {
+    // Bring lhs to the exponent of rhs; widen before left shifting to avoid overflow
+    let lhs_widened = fxd_a.significand as sN[NB_B];
+    let lhs_aligned = lhs_widened << (BE_A - BE_B) as u32;
+    // TODO: I think this is also always the same in the dot product use case. Fraction bits stay
+    // the same
+    fixed_point::make_fixed_point<BE_B>(lhs_aligned + fxd_b.significand)
+}
+
+// Performs a subtraction (fxd_a - fxd_b) assuming that the rhs is already wide enough to not overflow.
+// WARNING: rhs must be wide enough to avoid any overflow
+pub fn sub_already_widened
+    <NB_A: u32, BE_A: s32, NB_B: u32, BE_B: s32>  // exponents are s32, as in FixedPoint and add_already_widened
+    (fxd_a: FixedPoint<NB_A, BE_A>, fxd_b: FixedPoint<NB_B, BE_B>)
+    -> FixedPoint<NB_B, BE_B> {
+    // Widen before left shifting to avoid overflow
+    let aligned_lhs = (fxd_a.significand as sN[NB_B]) << (BE_A - BE_B) as u32;
+    let aligned_rhs = fxd_b.significand;
+
+    fixed_point::make_fixed_point<BE_B>(aligned_lhs - aligned_rhs)
+}
+
+// Fused multiply-add: returns fxd_a * fxd_b + fxd_c in the type of fxd_c.
+// WARNING: the add rhs must be wide enough to avoid any overflow
+pub fn fmadd_already_widened
+    <NB_A: u32, BE_A: s32, NB_B: u32, BE_B: s32, NB_C: u32, BE_C: s32,
+     NB_MUL: u32 = {NB_A + NB_B}, BE_MUL: s32 = {BE_A + BE_B}>
+    (fxd_a: FixedPoint<NB_A, BE_A>,
+    fxd_b: FixedPoint<NB_B, BE_B>,
+    fxd_c: FixedPoint<NB_C, BE_C>)
+    -> FixedPoint<NB_C, BE_C> {
+    // The product has NB_MUL bits and exponent BE_MUL; it is then added into fxd_c without widening.
+    add_already_widened<NB_MUL, BE_MUL, NB_C, BE_C>(fixed_point::mul<NB_A, BE_A, NB_B, BE_B>(fxd_a, fxd_b), fxd_c)
+}
+
+// Performs a dot product on 2 vectors. To implement this, the final widened result type is
+// computed first. An accumulator is instantiated with this final size and the fmadd operation
+// is reimplemented in such a way as to not widen the output when summing in the accumulator.
+//
+// TYPE EXPLANATIONS:
+// number bits: a multiplication is assumed to produce NB_X + NB_Y bits (NB_MUL).
+//      Since our vectors must be of the same type
+//      (each elem. within each vector follows the same fixed point representation)
+//      we know the size of all elem. wise multiplications.
+//      We can also guarantee that all elements will have overlapping positions
+//      (again because elems. within vectors have the same type). The accumulator is therefore
+//      widened by std::clog2(VEC_SZ) bits on top of NB_MUL to make room for summing VEC_SZ products.
+// binary exponent: The binary exponent will never change with additions since
+//      all elem-wise multiplications will result in the same exponent.
+// exp is negative: inferred from 'binary exponent'
+// unsigned exp:    inferred from 'binary exponent'
+// WARNINGS:
+// 1. made aligned_width() and num_bits_overlapping() public in a copy of the fixed_point_lib module.
+// to write the type inference
+// 2. We use ''already_widened'' functions.
+pub fn dot_prod
+    <NB_X: u32, BE_X: s32,
+    NB_Y: u32, BE_Y: s32,
+    VEC_SZ: u32,
+    // Precision inference MUL
+    NB_MUL: u32 = {NB_X + NB_Y},
+    BE_MUL: s32 = {BE_X + BE_Y},
+    // Precision Inference DOT PROD
+    NB_DOT_PROD: u32 = {NB_MUL + std::clog2(VEC_SZ)},
+    BE_DOT_PROD: s32 = {BE_MUL}>
+    (x: FixedPoint<NB_X, BE_X>[VEC_SZ],
+     y: FixedPoint<NB_Y, BE_Y>[VEC_SZ])
+    -> FixedPoint<NB_DOT_PROD, BE_DOT_PROD> {
+
+    for (i, acc) in 0..VEC_SZ {
+        fmadd_already_widened(x[i], y[i], acc)
+    }(zero!<FixedPoint<NB_DOT_PROD, BE_DOT_PROD>>())
+}
+
+// TODO
+// #[test]
+// fn fadd_test() {
+//     let a = sN[u32:16]:1024; // 1.0
+//     let b = sN[u32:16]:1024; // 1.0
+//     let c = sN[u32:16]:1024; // 1.0
+
+//     let result = fmadd<u32:16, u32:1, u32:10, u32:16, u32:1, u32:10, u32:16, u32:1, u32:10>(a, b, c);
+//     // Solve: x * 2^(-20) = 2 (x must fit in 33 bits)
+//     let expected = sN[u32:33]:2097152; // 2.0
+//     assert_eq(expected, result);
+// }
+
+
+type FP = FixedPoint<16, -10>;
+
+#[test]
+fn dot_prod_test() {
+    // [1.5, 1.5] (1536 * 2^-10)
+    let x = make_fixed_points_1d<-10>(sN[16][2]:[1536, ...]);
+    // [2.25, 2.25] (2304 * 2^-10)
+    let y = make_fixed_points_1d<-10>(sN[16][2]:[2304, ...]);
+    // 6.75 = 7077888 * 2^-20, in 16 + 16 + clog2(2) = 33 bits
+    let expected = fixed_point::make_fixed_point<-20>(sN[33]:7077888);
+    assert_eq(expected, dot_prod(x, y));
+
+    // [1.0, 1.0, 1.0]
+    let x = make_fixed_points_1d<-10>(sN[16][3]:[1024, ...]);
+    // [1.0, 1.0, 1.0]
+    let y = make_fixed_points_1d<-10>(sN[16][3]:[1024, ...]);
+    // 3.0 = 3145728 * 2^-20, in 16 + 16 + clog2(3) = 34 bits
+    let expected = fixed_point::make_fixed_point<-20>(sN[34]:3145728);
+    assert_eq(expected, dot_prod(x, y));
+}
diff --git a/hls4ml/templates/xls/firmware/layer.x b/hls4ml/templates/xls/firmware/layer.x
new file mode 100644
index 0000000000..0aebb004fc
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/layer.x
@@ -0,0 +1,30 @@
+import std;
+import fixed_point;
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// hls-fpga-machine-learning insert imports
+
+// hls-fpga-machine-learning insert types
+
+// hls-fpga-machine-learning insert weights
+
+// hls-fpga-machine-learning insert lookup tables
+
+// hls-fpga-machine-learning insert other constants
+
+
+// hls-fpga-machine-learning insert helpers for different input ranks
+
+
+// Top-level function of the layer module; its parameters, return type and body are placeholders filled in at the markers below.
+pub fn transform(
+    // hls-fpga-machine-learning insert layer input
+) ->
+// hls-fpga-machine-learning insert layer output
+{
+    // hls-fpga-machine-learning insert top-level function call
+}
diff --git a/hls4ml/templates/xls/firmware/myproject.x b/hls4ml/templates/xls/firmware/myproject.x
new file mode 100644
index 0000000000..0c807ab9ae
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/myproject.x
@@ -0,0 +1,33 @@
+import fixed_point;
+import ap_types.fixed_point_util;
+
+// hls-fpga-machine-learning insert imports
+
+// Input and output types: arrays of FixedPoint. The signature and body are placeholders filled in at the markers below.
+pub fn myproject_fixed_point(
+    // hls-fpga-machine-learning insert architecture input
+) ->
+// hls-fpga-machine-learning insert architecture output
+{
+    // hls-fpga-machine-learning insert layers
+}
+
+// Input and output types: arrays of sN[N]. The signature and body are placeholders filled in at the markers below.
+pub fn myproject_bits(
+    // hls-fpga-machine-learning insert bits input
+) ->
+// hls-fpga-machine-learning insert bits output
+{
+    // hls-fpga-machine-learning insert convert from bits
+}
+
+// Top-level function; it takes the same bits input/output markers as myproject_bits, and its body is filled in at the marker below.
+pub fn myproject(
+    // hls-fpga-machine-learning insert bits input
+) ->
+// hls-fpga-machine-learning insert bits output
+{
+    // hls-fpga-machine-learning insert top-level function call
+}
+
+// hls-fpga-machine-learning insert debugging
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/activations.x b/hls4ml/templates/xls/firmware/nnet_utils/activations.x
new file mode 100644
index 0000000000..9adfa1c11d
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/activations.x
@@ -0,0 +1,556 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+import nnet_utils.lookup_table;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+type LookupTable = lookup_table::LookupTable;
+
+
+// =========================================================================
+// --------------------------------- ReLU ----------------------------------
+
+// Thresholded ReLU: out[i] is x[i] (resized) when fixed_point_util::greater(x[i], threshold) holds, otherwise zero.
+pub fn thresholded_relu
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32,
+    NB_THRESHOLD: u32, BE_THRESHOLD: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        threshold: FixedPoint<NB_THRESHOLD, BE_THRESHOLD>)
+    -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let passes = fixed_point_util::greater(x[i], threshold);
+        let activated = if (passes)
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]) }
+        else
+            { zero!<FixedPoint<NB_OUT, BE_OUT>>() };
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+pub fn relu  // ReLU: thresholded_relu with a zero threshold of the input type.
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32>
+    (x: FixedPoint<NB_IN, BE_IN>[DIM]) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    let zero_threshold = zero!<FixedPoint<NB_IN, BE_IN>>();
+    thresholded_relu<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x, zero_threshold)
+}
+
+#[test]
+fn relu_test() {
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][2]:[
+        1536, 1024  // 1.5 and 1.0 at exponent -10: positive inputs are returned unchanged
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[16][2]:[
+        1536, 1024
+    ]);
+    assert_eq(expected, relu<16, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        -1536, -1024, 0, -1024  // negative inputs and zero all give zero
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        0,...
+    ]);
+    assert_eq(expected, relu<16, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        -1536, -1024, 1024, -1024  // mixed signs: only the positive element survives
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        0, 0, 1024, 0
+    ]);
+    assert_eq(expected, relu<16, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+
+    // Different width and precision: 32-bit input, 16-bit output with exponent -11 (1.0 becomes 2048)
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[32][4]:[
+        -1536, -1024, 1024, -1024
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-11>(sN[16][4]:[
+        0, 0, 2048, 0
+    ]);
+    assert_eq(expected, relu<16, -11, RoundingMode::TRN, OverflowMode::WRAP>(x));
+}
+
+// Leaky ReLU: out[i] = x[i] if x[i] >= 0, otherwise x[i] * alpha; each result is resized to the output type.
+pub fn leaky_relu
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    NB_ALPHA: u32, BE_ALPHA: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        alpha: FixedPoint<NB_ALPHA, BE_ALPHA>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let activated = if (x[i].significand >= 0)
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]) }
+        else
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::mul(x[i], alpha)) };
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// ELU: out[i] = x[i] if x[i] >= 0, otherwise the value elu_lut gives for x[i]; each result is resized to the output type.
+pub fn elu
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        elu_lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let activated = if (x[i].significand >= 0)
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]) }
+        else
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(lookup_table::eval(elu_lut, x[i])) };
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// SELU: out[i] = SELU_SCALE * x[i] if x[i] >= 0, otherwise the value selu_lut gives for x[i]; results are resized to the output type.
+pub fn selu
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32,
+    // Precision required for SELU_SCALE so that it doesn't introduce rounding errors
+    BE_SCALE: s32 = {std::min(s32:-2, BE_OUT - BE_IN - (NB_IN as s32))},
+    NB_SCALE: u32 = {(2 - BE_SCALE) as u32}>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        selu_lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // SELU_SCALE = 1.0507009873554804934193349852946, stored here in 32 bits with exponent -30
+    // TODO: specify up to 64 bit?
+    let SCALE_FULL: FixedPoint<32,-30> = fixed_point::make_fixed_point<-30>(s32: 1128181595);
+    const_assert!(NB_SCALE <= 32);
+
+    // Downscale to required precision
+    let SELU_SCALE = fixed_point_util::resize<NB_SCALE, BE_SCALE, ROUNDING, OVERFLOW>(SCALE_FULL);
+
+    // Non-negative inputs are scaled; negative inputs go through the lookup table.
+    for (i, out) in 0..DIM {
+        let activated = if (x[i].significand >= 0)
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::mul(SELU_SCALE, x[i])) }
+        else
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(lookup_table::eval(selu_lut, x[i])) };
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// PReLU: out[i] = x[i] if x[i] >= 0, otherwise alpha[i] * x[i] (one alpha per element); results are resized to the output type.
+pub fn prelu
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    NB_ALPHA: u32, BE_ALPHA: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        alpha: FixedPoint<NB_ALPHA, BE_ALPHA>[DIM]
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let activated = if (x[i].significand >= 0)
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(x[i]) }
+        else
+            { fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::mul(alpha[i], x[i])) };
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+
+// Softplus through a lookup table: out[i] is lookup_table::eval(lut, x[i]) resized to the output type.
+pub fn softplus
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let table_value = lookup_table::eval(lut, x[i]);
+        update(out, i, fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(table_value))
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// Softsign through a lookup table: out[i] is the table value for x[i], resized to the output type.
+// lut_asym is evaluated with lookup_table::eval_antisymmetric, which also receives OVERFLOW.
+pub fn softsign
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        lut_asym: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let table_value = lookup_table::eval_antisymmetric<OVERFLOW>(lut_asym, x[i]);
+        let activated = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(table_value);
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// Sigmoid through a lookup table: out[i] is lookup_table::eval(lut, x[i]) resized to the output type.
+pub fn sigmoid
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let table_value = lookup_table::eval(lut, x[i]);
+        update(out, i, fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(table_value))
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// Tanh through a lookup table: out[i] is the table value for x[i], resized to the output type.
+// lut_asym is evaluated with lookup_table::eval_antisymmetric, which also receives OVERFLOW.
+pub fn tanh
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32,
+    TABLE_SIZE: u32, TABLE_LOG2_STEP: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        lut_asym: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, TABLE_SIZE, TABLE_LOG2_STEP>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    for (i, out) in 0..DIM {
+        let table_value = lookup_table::eval_antisymmetric<OVERFLOW>(lut_asym, x[i]);
+        let activated = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(table_value);
+        update(out, i, activated)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// clip(slope * x + shift, 0, 1)
+// The affine result is first resized to the output type and clipped afterwards.
+pub fn hard_sigmoid
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32,
+    NB_SLOPE: u32, BE_SLOPE: s32,
+    NB_SHIFT: u32, BE_SHIFT: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        slope: FixedPoint<NB_SLOPE, BE_SLOPE>,
+        shift: FixedPoint<NB_SHIFT, BE_SHIFT>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    let LOWER = fixed_point::from_integer(s1:0);
+    let UPPER = fixed_point::from_integer(s2:1);
+
+    // NOTE(review): because the value is resized before it is clipped, an out-of-range intermediate
+    // is handled by OVERFLOW before the clip sees it - confirm that this ordering is intended.
+    for (i, out) in 0..DIM {
+        let affine = fixed_point::add(fixed_point::mul(x[i], slope), shift);
+        let resized = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(affine);
+        let clipped = fixed_point_util::clip_resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(resized, LOWER, UPPER);
+        update(out, i, clipped)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// 2 * hard_sigmoid(x) - 1
+// = clip(2 * slope * x + 2 * shift - 1, -1, 1)
+pub fn hard_tanh
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32,
+    NB_SLOPE: u32, BE_SLOPE: s32,
+    NB_SHIFT: u32, BE_SHIFT: s32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        slope: FixedPoint<NB_SLOPE, BE_SLOPE>,
+        shift: FixedPoint<NB_SHIFT, BE_SHIFT>
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // slope and shift are the hard_sigmoid parameters; they are rescaled below.
+    // Integer constants: the clip bounds (-1, 1) and the factor 2.
+    let MINUS_ONE = fixed_point::from_integer(s1:-1);
+    let ONE = fixed_point::from_integer(s2:1);
+    let TWO = fixed_point::from_integer(s3:2);
+    // 2 * slope
+    let slope_2 = fixed_point::mul(slope, TWO);
+    // 2 * shift - 1
+    let shift_2 = fixed_point::sub(fixed_point::mul(shift, TWO), ONE);
+
+    for (i, acc) in 0..DIM {
+        let y = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(
+            fixed_point::add(
+                fixed_point::mul(x[i], slope_2),
+                shift_2)
+        );
+        let y = fixed_point_util::clip_resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(y, MINUS_ONE, ONE);
+        update(acc, i, y)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// binary_tanh(x) =
+//   -1 | x < 0
+//   1  | x >= 0
+pub fn binary_tanh<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM]
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // Both possible outputs are converted to the output type once, outside the loop.
+    let POSITIVE = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::from_integer(s2:1));
+    let NEGATIVE = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::from_integer(s2:-1));
+
+    // The sign test is done directly on the significand of each input.
+    for (i, out) in 0..DIM {
+        let sign_value = if (x[i].significand >= 0)
+            { POSITIVE }
+        else
+            { NEGATIVE };
+        update(out, i, sign_value)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// ternary_tanh(x) =
+//   -1 | x <= -1
+//   0  | -1 < x <= 1
+//   1  | x > 1
+pub fn ternary_tanh<
+    NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32, DIM: u32>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM]
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // The three output levels, all as 2-bit integers so that the branches below share one type.
+    let ZERO = fixed_point::from_integer(s2:0);
+    let ONE = fixed_point::from_integer(s2:1);
+    let MINUS_ONE = fixed_point::from_integer(s2:-1);
+
+    // Each input is compared against both thresholds; the chosen level is then resized to the output type.
+    for (i, out) in 0..DIM {
+        let above_one = fixed_point_util::greater(x[i], ONE);
+        let above_minus_one = fixed_point_util::greater(x[i], MINUS_ONE);
+        let level = if (above_one) { ONE }
+            else if (above_minus_one) { ZERO }
+            else { MINUS_ONE };
+        let resized = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(level);
+        update(out, i, resized)
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// =========================================================================
+// ------------------------------- Argmax ---------------------------------
+
+// One-hot argmax: the output is 1 (in the output type) at the first maximal element of y and 0 elsewhere.
+pub fn argmax
+    <NB_OUT: u32, BE_OUT: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    DIM: u32>
+    (y: FixedPoint<NB_IN, BE_IN>[DIM])
+    -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+        let y_max = fixed_point_util::max_1d(y);
+        let one = fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(fixed_point::from_integer(s2:1));
+        // `found` ensures that only the first maximum is marked when several elements tie.
+        let (_, one_hot) = for (i, (found, z)) in 0..DIM {
+            if (!found && y[i] == y_max) {
+                (true, update(z, i, one))
+            } else {
+                (found, z)
+            }
+        }((false, zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>()));
+        one_hot
+}
+
+#[test]
+fn argmax_test() {
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][2]:[
+        1536,  // maximum (1.5)
+        1024
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[18][2]:[
+        1024,  // 1.0 at exponent -10
+        0
+    ]);
+    assert_eq(expected, argmax<18, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        -1536,
+        -1024,
+        0,  // maximum
+        -1024
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[18][4]:[
+        0,
+        0,
+        1024,
+        0,
+    ]);
+    assert_eq(expected, argmax<18, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+
+    let x = fixed_point_util::make_fixed_points_1d<-10>(sN[16][4]:[
+        -1536,
+        -1024,
+        -512,  // maximum, with all inputs negative
+        -1024
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<-10>(sN[18][4]:[
+        0,
+        0,
+        1024,
+        0,
+    ]);
+    assert_eq(expected, argmax<18, -10, RoundingMode::TRN, OverflowMode::WRAP>(x));
+}
+
+// =========================================================================
+// ------------------------------ Softmax ----------------------------------
+
+// Softmax through lookup tables: out[i] = exp_lut(y[i]) * inv_lut(sum_k exp_lut(y[k])), resized to the output type.
+// inv_lut is presumably a table of 1/x - confirm against the table generator.
+pub fn softmax_latency
+    <NB_OUT: u32, BE_OUT: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    NB_EXP: u32, BE_EXP: s32, SIZE_EXP: u32, LOG2_STEP_EXP: s32,
+    NB_INV: u32, BE_INV: s32, SIZE_INV: u32, LOG2_STEP_INV: s32,
+    DIM: u32,
+    NB_SUM_EXP: u32 = {NB_EXP + std::clog2(DIM)},
+    BE_SUM_EXP: s32 = {BE_EXP}>(
+        y: FixedPoint<NB_IN, BE_IN>[DIM],
+        exp_lut: LookupTable<NB_IN, BE_IN, NB_EXP, BE_EXP, SIZE_EXP, LOG2_STEP_EXP>,
+        inv_lut: LookupTable<NB_INV, BE_INV, NB_INV, BE_INV, SIZE_INV, LOG2_STEP_INV>,
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // Compute exp() with Lookup Tables
+    let exp_values = lookup_table::eval_1d(exp_lut, y);
+
+    // Sum all exponents; the accumulator has NB_EXP + clog2(DIM) bits (NB_SUM_EXP)
+    let exp_total = for (i, partial_sum) in 0..DIM {
+        fixed_point_util::add_already_widened(exp_values[i], partial_sum)
+    }(zero!<FixedPoint<NB_SUM_EXP, BE_SUM_EXP>>());
+    // Convert the sum to the input type of inv_lut, then look it up
+    let inv_lut_input = fixed_point_util::resize<NB_INV, BE_INV, ROUNDING, OVERFLOW>(exp_total);
+    let inv_sum = lookup_table::eval(inv_lut, inv_lut_input);
+
+    // Compute softmax
+    for (i, out) in 0..DIM {
+        let scaled = fixed_point::mul(exp_values[i], inv_sum);
+        update(out, i, fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(scaled))
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// softmax(x)[i] = exp(x[i]) / sum_k(exp(x[k]))
+// Stable implementation:
+// softmax(x)[i] = exp(-(x_max-x[i])) / sum_k(exp(-(x_max-x[k])))
+// Parameters:
+//   x           - input vector of DIM fixed-point values.
+//   exp_neg_lut - table for f(d) = exp(-d), indexed by the difference x_max - x[i].
+//   inv_lut     - table for f(s) = 1/s, indexed by the resized sum of exponentials.
+// Returns the DIM products exp_neg_lut(x_max - x[i]) * inv_lut(sum), each resized
+// to FixedPoint<NB_OUT, BE_OUT> with ROUNDING/OVERFLOW.
+pub fn softmax_stable
+    <NB_OUT: u32, BE_OUT: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    NB_EXP: u32, BE_EXP: s32, SIZE_EXP: u32, LOG2_STEP_EXP: s32,
+    NB_INV: u32, BE_INV: s32, SIZE_INV: u32, LOG2_STEP_INV: s32,
+    DIM: u32,
+    // Format of the difference x_max - x[i]
+    NB_DIFF: u32 = {NB_IN + 1}, BE_DIFF: s32 = {BE_IN},
+    // Format of sum_k(exp(-(x_max - x[k])))
+    NB_SUM_EXP: u32 = {NB_EXP + std::clog2(DIM)},
+    BE_SUM_EXP: s32 = {BE_EXP}>(
+        x: FixedPoint<NB_IN, BE_IN>[DIM],
+        // f(x) = exp(-x)
+        exp_neg_lut: LookupTable<NB_DIFF, BE_DIFF, NB_EXP, BE_EXP, SIZE_EXP, LOG2_STEP_EXP>,
+        // f(x) = 1/x
+        inv_lut: LookupTable<NB_INV, BE_INV, NB_INV, BE_INV, SIZE_INV, LOG2_STEP_INV>,
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+
+    // Largest input element; every difference below is taken against it.
+    let peak = fixed_point_util::max_1d(x);
+
+    // exps[i] = exp(-(x_max - x[i]))
+    let exps = for (idx, filled) in 0..DIM {
+        let dist = fixed_point::sub(peak, x[idx]);
+        update(filled, idx, lookup_table::eval(exp_neg_lut, dist))
+    }(zero!<FixedPoint<NB_EXP, BE_EXP>[DIM]>());
+
+    // Accumulate the exponentials in the widened sum format.
+    let exps_total = for (idx, running) in 0..DIM {
+        fixed_point_util::add_already_widened(exps[idx], running)
+    }(zero!<FixedPoint<NB_SUM_EXP, BE_SUM_EXP>>());
+
+    // Resize the sum to the inv_lut input format, then evaluate 1 / sum(exp).
+    let inv_input = fixed_point_util::resize<NB_INV, BE_INV, ROUNDING, OVERFLOW>(exps_total);
+    let inv_total = lookup_table::eval(inv_lut, inv_input);
+
+    // softmax[i] = exps[i] * (1 / sum), resized to the output format.
+    for (idx, out) in 0..DIM {
+        let product = fixed_point::mul(exps[idx], inv_total);
+        update(out, idx, fixed_point_util::resize<NB_OUT, BE_OUT, ROUNDING, OVERFLOW>(product))
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// ------------- TODO Tests should be generated depending on the table precision/size
+
+// #[test]
+// fn softmax_latency_test() {
+//     let x = sN[16][4]:[
+//         sN[16]:1024,
+//         sN[16]:1024,
+//         sN[16]:1024,
+//         sN[16]:1024
+//     ];
+//     let expected = sN[16][4]:[
+//         sN[16]:258,  // Ideal 256
+//         sN[16]:258,
+//         sN[16]:258,
+//         sN[16]:258
+//     ];
+//     assert_eq(expected, softmax_latency
+//         <u32:16, u32:1, u32:10,
+//         u32:16, u32:1, u32:10,
+//         u32:18, u32:1, u32:10,
+//         u32:18, u32:1, u32:10,
+//         u32:1024>(x));
+
+//     let x = sN[16][4]:[
+//         sN[16]:2048,
+//         sN[16]:2048,
+//         sN[16]:2048,
+//         sN[16]:2048
+//     ];
+//     let expected = sN[16][4]:[
+//         sN[16]:258,  // Ideal 256
+//         sN[16]:258,
+//         sN[16]:258,
+//         sN[16]:258
+//     ];
+//     assert_eq(expected, softmax_latency
+//         <u32:16, u32:1, u32:10,
+//         u32:16, u32:1, u32:10,
+//         u32:18, u32:1, u32:10,
+//         u32:18, u32:1, u32:10,
+//         u32:1024>(x));
+// }
+
+// #[test]
+// fn softmax_stable_test() {
+//     let x = fixed_point_util::make_fixed_points<-10>(sN[16][4]:[
+//         1024,
+//         1024,
+//         1024,
+//         1024
+//     ]);
+//     let expected = fixed_point_util::make_fixed_points<-10>(sN[16][4]:[
+//         256,  // Ideal 256
+//         256,
+//         256,
+//         256
+//     ]);
+//     assert_eq(expected, softmax_stable<16,1,10>(x, EXP_TABLE, INV_TABLE));
+
+//     let x = fixed_point_util::make_fixed_points<-10>(sN[16][4]:[
+//         4096,
+//         4096,
+//         4096,
+//         4096
+//     ]);
+//     let expected = fixed_point_util::make_fixed_points<-10>(sN[16][4]:[
+//         256,  // Ideal 256
+//         256,
+//         256,
+//         256
+//     ]);
+//     assert_eq(expected, softmax_stable<16,1,10>(x, EXP_TABLE, INV_TABLE));
+// }
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/batchnorm.x b/hls4ml/templates/xls/firmware/nnet_utils/batchnorm.x
new file mode 100644
index 0000000000..c317e7a16d
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/batchnorm.x
@@ -0,0 +1,52 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// Element-wise affine transform: out[i] = scale[i % SCALE_DIM] * x[i] + bias[i % BIAS_DIM].
+//
+// scale and bias may be shorter than x; their indices wrap around via the modulo.
+// Every result is resized to FixedPoint<OUT_NB, OUT_BE> with ROUNDING/OVERFLOW.
+pub fn normalize<
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    IN_NB: u32, IN_BE: s32, DIM: u32,
+    SCALE_NB: u32, SCALE_BE: s32, SCALE_DIM: u32,
+    BIAS_NB: u32, BIAS_BE: s32, BIAS_DIM: u32 = {SCALE_DIM}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[DIM],
+    scale: FixedPoint<SCALE_NB, SCALE_BE>[SCALE_DIM],
+    bias: FixedPoint<BIAS_NB, BIAS_BE>[BIAS_DIM],
+)
+-> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    for (idx, out) in 0..DIM {
+        let scaled = fixed_point::mul(scale[idx % SCALE_DIM], x[idx]);
+        let shifted = fixed_point::add(bias[idx % BIAS_DIM], scaled);
+        update(out, idx, fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(shifted))
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>())
+}
+
+// Input, scale, bias and output all use different widths/exponents, and the
+// 2-element scale/bias are reused cyclically over the 4-element input.
+#[test]
+fn normalize_mixed_precision_test() {
+    // x = [1, 2, 3, 4]
+    let input = fixed_point_util::make_fixed_points_1d<0>(s4[4]:[1, 2, 3, 4]);
+    // scale = [1.0, 1.5] (units of 2^-1)
+    let gain = fixed_point_util::make_fixed_points_1d<-1>(s3[2]:[2, 3]);
+    // bias = [0.25, 0.5] (units of 2^-2)
+    let offset = fixed_point_util::make_fixed_points_1d<-2>(s3[2]:[1, 2]);
+
+    // [1.25, 3.5, 3.25, 6.5] in units of 2^-2
+    let want = fixed_point_util::make_fixed_points_1d<-2>(s6[4]:[5, 14, 13, 26]);
+
+    let got = normalize<6, -2, RoundingMode::TRN, OverflowMode::WRAP>(input, gain, offset);
+    assert_eq(want, got);
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/conv1d.x b/hls4ml/templates/xls/firmware/nnet_utils/conv1d.x
new file mode 100644
index 0000000000..d5b76a3b2d
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/conv1d.x
@@ -0,0 +1,235 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+import nnet_utils.activations;
+import nnet_utils.data_format;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+type DataFormat = data_format::DataFormat;
+
+// 1-D convolution with zero padding, computed with nested counted loops.
+//
+//   out[pos, f] = bias[f] + sum_{ch, k} x[pos * STRIDE - PAD_LEFT + k, ch] * kernel[k][ch][f]
+//
+// Input positions outside [0, IN_SIZE) contribute zero. DATA_FORMAT selects whether
+// x and the result are indexed [position][channel] (CHANNELS_LAST) or
+// [channel][position] (CHANNELS_FIRST); the kernel is always indexed
+// [tap][in_channel][filter]. The accumulated sum plus bias is resized to
+// FixedPoint<OUT_NB, OUT_BE> using ROUNDING/OVERFLOW.
+//
+// Only the first eight parametrics (output format, rounding, overflow, stride,
+// padding, data format) are passed explicitly by the callers in this file; the
+// rest are deduced from the argument types or derived below.
+pub fn conv1d_latency
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    STRIDE: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,
+    DATA_FORMAT: DataFormat,
+    // Input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32,
+    // Kernel
+    KERN_NB: u32, KERN_BE: s32,
+    KERN_SIZE: u32, OUT_FILTERS: u32,
+    // Bias
+    BIAS_NB: u32, BIAS_BE: s32,
+    // Derived input dims
+    IN_SIZE: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[0]},
+    IN_CHANNELS: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[1]},
+    // Output size
+    OUT_SIZE: u32 = {((IN_SIZE + PAD_LEFT + PAD_RIGHT - KERN_SIZE) / STRIDE) + 1},
+    // Output dims
+    OUT_DIM_0: u32 = {data_format::from_size_chans(OUT_SIZE, OUT_FILTERS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_size_chans(OUT_SIZE, OUT_FILTERS, DATA_FORMAT)[1]},
+    // Precision of one input * kernel product: the widths add and the binary
+    // exponents add, so the kernel format must be taken into account (it may
+    // differ from the input format).
+    MUL_BE: s32 = {IN_BE + KERN_BE},
+    MUL_NB: u32 = {IN_NB + KERN_NB},
+    // Accumulator: one extra bit per doubling of the number of summed products.
+    CONV_NB: u32 = {MUL_NB + std::clog2(KERN_SIZE * IN_CHANNELS)},
+    CONV_BE: s32 = {MUL_BE}
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0],
+    kernel: FixedPoint<KERN_NB, KERN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+
+    for (out_i_0, out_2d) in 0..OUT_DIM_0 {
+        let out_1d = for (out_i_1, out_1d) in 0..OUT_DIM_1 {
+
+            // Translate the layout-dependent output indices into (position, filter).
+            let ij = data_format::to_size_chans(out_i_0, out_i_1, DATA_FORMAT);
+            let out_pos = ij[0];
+            let filter_idx = ij[1];
+
+            // Leftmost input position of the window; negative inside the left padding.
+            let in_pos: s32 = ((out_pos as s32) * (STRIDE as s32)) - (PAD_LEFT as s32);
+
+            let conv_pixel = for (ch_idx, pixel_chans) in 0..IN_CHANNELS {
+
+                for (k, acc) in 0..KERN_SIZE {
+                    let ii = in_pos + (k as s32);
+
+                    // Pad with zeros outside the input.
+                    let val = if ii < s32:0
+                            || ii >= IN_SIZE as s32 {
+                        zero!<FixedPoint<IN_NB, IN_BE>>()
+                    } else {
+                        let ii = ii as u32;
+                        match DATA_FORMAT {
+                            DataFormat::CHANNELS_LAST  => x[ii][ch_idx],
+                            DataFormat::CHANNELS_FIRST => x[ch_idx][ii]
+                        }
+                    };
+
+                    let w = kernel[k][ch_idx][filter_idx];
+                    fixed_point_util::fmadd_already_widened(val, w, acc)
+                }(pixel_chans)
+
+            }(zero!<FixedPoint<CONV_NB, CONV_BE>>());
+
+            let conv_pixel_with_bias = fixed_point::add(conv_pixel, bias[filter_idx]);
+            let conv_pixel_with_bias = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(conv_pixel_with_bias);
+
+            update(out_1d, out_i_1, conv_pixel_with_bias)
+
+        }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1]>());
+
+        update(out_2d, out_i_0, out_1d)
+
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0]>())
+}
+
+// Testing
+
+// Test helper: conv1d_latency in CHANNELS_LAST layout with TRN rounding and WRAP
+// overflow. Unless overridden, stride is 1, padding is 0 and the output uses the
+// input format. Input, weights and bias all share the format <IN_NB, IN_BE>.
+fn conv1d_latency_default<
+    IN_NB: u32, IN_BE: s32,
+    // Input
+    IN_SIZE: u32, IN_CHANNELS: u32,
+    // Kernel
+    KERN_SIZE: u32, OUT_FILTERS: u32,
+    // Output (matches conv1d_latency's OUT_SIZE only for stride 1 and zero padding)
+    OUT_SIZE: u32 = {IN_SIZE + u32:1 - KERN_SIZE},
+    // Defaults
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE: u32 = {u32:1},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_CHANNELS][IN_SIZE],
+    weights: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_FILTERS][OUT_SIZE] {
+
+    conv1d_latency<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_LAST
+    >(x, weights, bias)
+}
+
+// Test helper: same defaults as conv1d_latency_default, but in CHANNELS_FIRST
+// layout, i.e. x and the result are indexed [channel][position].
+fn conv1d_latency_default_first<
+    IN_NB: u32, IN_BE: s32,
+
+    // Input
+    IN_SIZE: u32, IN_CHANNELS: u32,
+
+    // Kernel
+    KERN_SIZE: u32, OUT_FILTERS: u32,
+
+    // Output (matches conv1d_latency's OUT_SIZE only for stride 1 and zero padding)
+    OUT_SIZE: u32 = {IN_SIZE + u32:1 - KERN_SIZE},
+
+    // Defaults
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE: u32 = {u32:1},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_SIZE][IN_CHANNELS],
+    weights: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_SIZE][OUT_FILTERS] {
+
+    conv1d_latency<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_FIRST
+    >(x, weights, bias)
+}
+
+// Shape-only check: with all-zero input, kernel and bias the convolution must
+// return an all-zero array of the expected output shape, in both data layouts.
+fn test_zero_1d<
+    IN_SIZE: u32,
+    IN_CHANNELS: u32,
+    KERN_SIZE: u32,
+    OUT_FILTERS: u32,
+    OUT_SIZE: u32 = {IN_SIZE + u32:1 - KERN_SIZE}
+>() {
+    // Kernel and bias are shared by both layouts.
+    let kernel = zero!<FixedPoint<16, -10>[OUT_FILTERS][IN_CHANNELS][KERN_SIZE]>();
+    let offsets = zero!<FixedPoint<16, -10>[OUT_FILTERS]>();
+
+    // CHANNELS_LAST
+    let x_last = zero!<FixedPoint<16, -10>[IN_CHANNELS][IN_SIZE]>();
+    let want_last = zero!<FixedPoint<16, -10>[OUT_FILTERS][OUT_SIZE]>();
+    assert_eq(want_last, conv1d_latency_default(x_last, kernel, offsets));
+
+    // CHANNELS_FIRST
+    let x_first = zero!<FixedPoint<16, -10>[IN_SIZE][IN_CHANNELS]>();
+    let want_first = zero!<FixedPoint<16, -10>[OUT_SIZE][OUT_FILTERS]>();
+    assert_eq(want_first, conv1d_latency_default_first(x_first, kernel, offsets));
+}
+
+// Smallest case: IN_SIZE = 1, IN_CHANNELS = 1, KERN_SIZE = 1, OUT_FILTERS = 1.
+#[test]
+fn test_zero_1d_1() {
+    test_zero_1d<u32:1, u32:1, u32:1, u32:1>();
+}
+
+// Non-trivial shape: IN_SIZE = 5, IN_CHANNELS = 2, KERN_SIZE = 3, OUT_FILTERS = 4.
+#[test]
+fn test_zero_1d_2() {
+    test_zero_1d<u32:5, u32:2, u32:3, u32:4>();
+}
+
+// Constant input of ones convolved with the kernel [1, 2, 3]: every output is 6.
+// All values are s16 significands with exponent -10, so 1.0 is written as 1024.
+#[test]
+fn conv1d_latency_test_uniform_io() {
+    // kernel = [1, 2, 3], one input channel, one filter
+    let kernel = fixed_point_util::make_fixed_points_3d<-10>(
+        s16[1][1][3]:[
+            s16[1][1]:[[s16:1024]],
+            s16[1][1]:[[s16:2048]],
+            s16[1][1]:[[s16:3072]]
+        ]
+    );
+    // bias = [0]
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:0]);
+
+    // CHANNELS_LAST: x = [1, 1, 1, 1, 1]
+    let x_last = fixed_point_util::make_fixed_points_2d<-10>(
+        s16[1][5]:[[s16:1024], ...]
+    );
+    // expected = [6, 6, 6] (6 * 1024 = 6144)
+    let want_last = fixed_point_util::make_fixed_points_2d<-10>(
+        s16[1][3]:[[s16:6144], ...]
+    );
+    assert_eq(want_last, conv1d_latency_default(x_last, kernel, offsets));
+
+    // CHANNELS_FIRST: same data, indexed [channel][position]
+    let x_first = fixed_point_util::make_fixed_points_2d<-10>(
+        s16[5][1]:[s16[5]:[s16:1024, ...]]
+    );
+    let want_first = fixed_point_util::make_fixed_points_2d<-10>(
+        s16[3][1]:[s16[3]:[s16:6144, ...]]
+    );
+    assert_eq(want_first, conv1d_latency_default_first(x_first, kernel, offsets));
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/conv2d.x b/hls4ml/templates/xls/firmware/nnet_utils/conv2d.x
new file mode 100644
index 0000000000..01d57aa0cc
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/conv2d.x
@@ -0,0 +1,578 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+import nnet_utils.activations;
+import nnet_utils.data_format;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+type DataFormat = data_format::DataFormat;
+
+// 2-D convolution with zero padding, computed with nested counted loops.
+//
+//   out[i, j, f] = bias[f]
+//       + sum_{ch, di, dj} x[i * STRIDE_HEIGHT - PAD_TOP + di,
+//                            j * STRIDE_WIDTH - PAD_LEFT + dj, ch] * kernel[di][dj][ch][f]
+//
+// Input coordinates outside the image contribute zero. DATA_FORMAT selects whether
+// x and the result are indexed [row][col][channel] (CHANNELS_LAST) or
+// [channel][row][col] (CHANNELS_FIRST); the kernel is always indexed
+// [di][dj][in_channel][filter]. The accumulated sum plus bias is resized to
+// FixedPoint<OUT_NB, OUT_BE> using ROUNDING/OVERFLOW.
+pub fn conv2d_latency
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    STRIDE_HEIGHT: u32, STRIDE_WIDTH: u32,
+    PAD_TOP: u32, PAD_BOTTOM: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,
+    DATA_FORMAT: DataFormat,
+    // All parameters below can be deduced automatically
+    IN_NB: u32, IN_BE: s32,
+    // Input Image
+    // Dimensions: (IN_HEIGHT, IN_WIDTH, IN_CHANNELS) or (IN_CHANNELS, IN_HEIGHT, IN_WIDTH),
+    // depending on DATA_FORMAT
+    IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,
+    // Kernel
+    KERN_NB: u32, KERN_BE: s32,
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, OUT_FILTERS: u32,
+    // Bias
+    BIAS_NB: u32, BIAS_BE: s32,
+    // Input image
+    IN_HEIGHT: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[0]},
+    IN_WIDTH: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[1]},
+    IN_CHANNELS: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[2]},
+    // Output Image
+    OUT_HEIGHT: u32 = {((IN_HEIGHT + PAD_TOP + PAD_BOTTOM - KERN_HEIGHT) / STRIDE_HEIGHT) + 1},
+    OUT_WIDTH: u32 = {((IN_WIDTH  + PAD_LEFT + PAD_RIGHT  - KERN_WIDTH) / STRIDE_WIDTH) + 1},
+    // Output dimension
+    OUT_DIM_0: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[1]},
+    OUT_DIM_2: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[2]},
+    // Precision inference MUL: for one input * kernel product the widths add and
+    // the binary exponents add, so the kernel format must be taken into account
+    // (it may differ from the input format).
+    MUL_BE: s32 = {IN_BE + KERN_BE},
+    MUL_NB: u32 = {IN_NB + KERN_NB},
+    // Precision Inference CONV: one extra bit per doubling of the number of summed products
+    // TODO support custom accum_t precision
+    CONV_NB: u32 = {MUL_NB + std::clog2(KERN_HEIGHT * KERN_WIDTH * IN_CHANNELS)},
+    CONV_BE: s32 = {MUL_BE}
+    >
+    (x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0],
+    kernel: FixedPoint<KERN_NB, KERN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_FILTERS])
+    -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+
+    for (out_i_0, out_3d) in 0..OUT_DIM_0 {
+        let out_2d = for (out_i_1, out_2d) in 0..OUT_DIM_1 {
+            let out_1d = for (out_i_2, out_1d) in 0..OUT_DIM_2 {
+                // Translate the layout-dependent output indices into (row, col, filter).
+                let ijf = data_format::to_height_width_chans(out_i_0, out_i_1, out_i_2, DATA_FORMAT);
+                let out_i = ijf[0];
+                let out_j = ijf[1];
+                let filter_idx = ijf[2];
+
+                // Top-left input coordinate of the window; negative inside the padding.
+                let in_i: s32 = ((out_i as s32) * (STRIDE_HEIGHT as s32)) - (PAD_TOP as s32);
+                let in_j: s32 = ((out_j as s32) * (STRIDE_WIDTH as s32)) - (PAD_LEFT as s32);
+                // Compute convolution across channels:
+                // res[out_i, out_j, filt] = sum(x[in_i+di, in_j+dj, ch_idx] * w[di, dj, ch_idx, filt])
+                let conv_pixel = for (ch_idx, pixel_chans) in 0..IN_CHANNELS {
+                    // Compute convolution for a single channel:
+                    // acc = sum(x[i+di, j+dj] * w[di, dj])
+                    for (di, pixel_ch) in 0..KERN_HEIGHT {
+                        for (dj, acc) in 0..KERN_WIDTH {
+                            let ii = in_i + (di as s32);
+                            let jj = in_j + (dj as s32);
+                            // Pad with zeros
+                            let val = if ii < s32:0
+                                    || ii >= IN_HEIGHT as s32
+                                    || jj < s32:0
+                                    || jj >= IN_WIDTH as s32 {
+                                zero!<FixedPoint<IN_NB, IN_BE>>()
+                            } else {
+                                let ii = ii as u32;
+                                let jj = jj as u32;
+                                match DATA_FORMAT{
+                                    DataFormat::CHANNELS_LAST  => x[ii][jj][ch_idx],
+                                    DataFormat::CHANNELS_FIRST => x[ch_idx][ii][jj]
+                                }
+                            };
+                            let w = kernel[di][dj][ch_idx][filter_idx];
+                            fixed_point_util::fmadd_already_widened(val, w, acc)
+                        }(pixel_ch)
+                    }(pixel_chans)
+                }(zero!<FixedPoint<CONV_NB, CONV_BE>>());
+                let conv_pixel_with_bias = fixed_point::add(conv_pixel, bias[filter_idx]);
+                let conv_pixel_with_bias = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(conv_pixel_with_bias);
+
+                update(out_1d, out_i_2, conv_pixel_with_bias)
+
+            }(zero!<FixedPoint<OUT_NB,OUT_BE>[OUT_DIM_2]>());
+
+            update(out_2d, out_i_1, out_1d)
+
+        }(zero!<FixedPoint<OUT_NB,OUT_BE>[OUT_DIM_2][OUT_DIM_1]>());
+
+        update(out_3d, out_i_0, out_2d)
+
+    }(zero!<FixedPoint<OUT_NB,OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>())
+}
+
+// Test helper: conv2d_latency in CHANNELS_LAST layout with TRN rounding and WRAP
+// overflow. Unless overridden, strides are 1, padding is 0 and the output uses
+// the input format. Input, weights and bias all share the format <IN_NB, IN_BE>.
+// TODO: test other parameters
+fn conv2d_latency_default<
+    IN_NB: u32, IN_BE: s32,
+    // Input Image
+    IN_HEIGHT: u32, IN_WIDTH: u32, IN_CHANNELS: u32,
+    // Kernel Dims
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, OUT_FILTERS: u32,
+    // Output Image (matches conv2d_latency only for stride 1 and zero padding)
+    OUT_HEIGHT: u32 = {IN_HEIGHT + u32:1 - KERN_HEIGHT},
+    OUT_WIDTH: u32 = {IN_WIDTH + u32:1 - KERN_WIDTH},
+    // Default parameters:
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE_HEIGHT: u32 = {u32:1},
+    STRIDE_WIDTH: u32 = {u32:1},
+    PAD_TOP: u32 = {u32:0},
+    PAD_BOTTOM: u32 = {u32:0},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+    >
+    (x: FixedPoint<IN_NB, IN_BE>[IN_CHANNELS][IN_WIDTH][IN_HEIGHT],
+    weights: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS])
+    -> FixedPoint<OUT_NB, OUT_BE>[OUT_FILTERS][OUT_WIDTH][OUT_HEIGHT] {
+
+    conv2d_latency<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE_HEIGHT, STRIDE_WIDTH,
+        PAD_TOP, PAD_BOTTOM,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_LAST
+    >(x, weights, bias)
+}
+
+// Test helper: conv2d_latency with TRN rounding, WRAP overflow and, unless
+// overridden, stride 1 and zero padding, in CHANNELS_FIRST layout: x and the
+// result are indexed [channel][row][col].
+fn conv2d_latency_default_first<
+    IN_NB: u32, IN_BE: s32,
+    // Input Image
+    IN_HEIGHT: u32, IN_WIDTH: u32, IN_CHANNELS: u32,
+    // Kernel Dims
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, OUT_FILTERS: u32,
+    // Output Image (matches conv2d_latency only for stride 1 and zero padding)
+    OUT_HEIGHT: u32 = {IN_HEIGHT + u32:1 - KERN_HEIGHT},
+    OUT_WIDTH: u32 = {IN_WIDTH + u32:1 - KERN_WIDTH},
+    // Default parameters:
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE_HEIGHT: u32 = {u32:1},
+    STRIDE_WIDTH: u32 = {u32:1},
+    PAD_TOP: u32 = {u32:0},
+    PAD_BOTTOM: u32 = {u32:0},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+    >
+    (x: FixedPoint<IN_NB, IN_BE>[IN_WIDTH][IN_HEIGHT][IN_CHANNELS],
+    weights: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS])
+    -> FixedPoint<OUT_NB, OUT_BE>[OUT_WIDTH][OUT_HEIGHT][OUT_FILTERS] {
+
+    conv2d_latency<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE_HEIGHT, STRIDE_WIDTH,
+        PAD_TOP, PAD_BOTTOM,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_FIRST
+    >(x, weights, bias)
+}
+
+// Shape-only check: all inputs are zero, so only the output dimensions are
+// exercised, in both data layouts.
+// TODO: test also padding and stride
+fn test_zero<
+    IN_HEIGHT: u32, IN_WIDTH: u32,
+    IN_CHANNELS: u32,
+    KERN_HEIGHT: u32, KERN_WIDTH: u32,
+    OUT_FILTERS: u32,
+    OUT_HEIGHT: u32 = {IN_HEIGHT + u32:1 - KERN_HEIGHT},
+    OUT_WIDTH: u32 = {IN_WIDTH + u32:1 - KERN_WIDTH},
+    >() {
+    // Kernel and bias are shared by both layouts.
+    let kernel = zero!<FixedPoint<16, -10>[OUT_FILTERS][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT]>();
+    let offsets = zero!<FixedPoint<16, -10>[OUT_FILTERS]>();
+
+    // CHANNELS_LAST
+    let x_last = zero!<FixedPoint<16, -10>[IN_CHANNELS][IN_WIDTH][IN_HEIGHT]>();
+    let want_last = zero!<FixedPoint<16, -10>[OUT_FILTERS][OUT_WIDTH][OUT_HEIGHT]>();
+    assert_eq(want_last, conv2d_latency_default(x_last, kernel, offsets));
+
+    // CHANNELS_FIRST
+    let x_first = zero!<FixedPoint<16, -10>[IN_WIDTH][IN_HEIGHT][IN_CHANNELS]>();
+    let want_first = zero!<FixedPoint<16, -10>[OUT_WIDTH][OUT_HEIGHT][OUT_FILTERS]>();
+    assert_eq(want_first, conv2d_latency_default_first(x_first, kernel, offsets));
+}
+
+// 1x1 image, 1 channel, 1x1 kernel, 1 filter.
+// Order: IN_HEIGHT, IN_WIDTH, IN_CHANNELS, KERN_HEIGHT, KERN_WIDTH, OUT_FILTERS.
+#[test]
+fn test_zero_1() {
+    test_zero<u32:1, u32:1, u32:1, u32:1, u32:1, u32:1>();
+}
+
+// 2x2 image, 1 channel, 1x1 kernel, 1 filter.
+// Order: IN_HEIGHT, IN_WIDTH, IN_CHANNELS, KERN_HEIGHT, KERN_WIDTH, OUT_FILTERS.
+#[test]
+fn test_zero_2() {
+    test_zero<u32:2, u32:2, u32:1, u32:1, u32:1, u32:1>();
+}
+
+// 9x10 image, 4 channels, 3x2 kernel, 5 filters: all dimensions differ.
+// Order: IN_HEIGHT, IN_WIDTH, IN_CHANNELS, KERN_HEIGHT, KERN_WIDTH, OUT_FILTERS.
+#[test]
+fn test_zero_multi() {
+    test_zero<u32:9, u32:10, u32:4, u32:3, u32:2, u32:5>();
+}
+
+
+// 5x5 image of ones convolved with a 3x3 kernel whose rows are 1, 2, 3:
+// every output is 3 * (1 + 2 + 3) = 18. Values use exponent -10 (1.0 == 1024).
+#[test]
+fn conv2d_latency_test_uniform_io() {
+    // w =
+    //  | 1, 1, 1|
+    //  | 2, 2, 2|
+    //  | 3, 3, 3|
+    let kernel = fixed_point_util::make_fixed_points_4d<-10>(s16[1][1][3][3]:[
+        s16[1][1][3]:[[[s16:1024]], ...],
+        s16[1][1][3]:[[[s16:2048]], ...],
+        s16[1][1][3]:[[[s16:3072]], ...]
+    ]);
+    // b = | 0 |
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:0]);
+
+    // CHANNELS_LAST: x is a 5x5 image of ones with a single channel.
+    let x_last = fixed_point_util::make_fixed_points_3d<-10>(s16[1][5][5]:[s16[1][5]:[s16[1]:[s16:1024], ...], ...]);
+
+    // expected =
+    //  | 18, 18, 18|
+    //  | 18, 18, 18|
+    //  | 18, 18, 18|
+    // TODO: hereafter we have to specify the integer type inside each 1d array because of
+    // a type inference bug in DSLX: it loses types in make_fixed_points_2d, _3d etc.,
+    // and assert_eq fails with a message like:
+    // lhs and rhs were not equal: [ [ FixedPoint {
+    // < significand: s16:0
+    // > significand: u0:0
+    // } ] ]
+    let want_last = fixed_point_util::make_fixed_points_3d<-10>(s16[1][3][3]:[
+        s16[1][3]:[[s16:18432], ...], ...]);
+    assert_eq(want_last, conv2d_latency_default(x_last, kernel, offsets));
+
+    // CHANNELS_FIRST: same data, indexed [channel][row][col].
+    let x_first = fixed_point_util::make_fixed_points_3d<-10>(s16[5][5][1]:[
+        s16[5][5]:[s16[5]:[s16:1024, ...], ...]]);
+    let want_first = fixed_point_util::make_fixed_points_3d<-10>(s16[3][3][1]:[
+            s16[3][3]:[s16[3]:[s16:18432, ...], ...]]);
+    assert_eq(want_first, conv2d_latency_default_first(x_first, kernel, offsets));
+}
+
+// Same data as the uniform test, but with bias 1: every output is 18 + 1 = 19.
+#[test]
+fn conv2d_latency_test_bias() {
+    // w =
+    //  | 1, 1, 1|
+    //  | 2, 2, 2|
+    //  | 3, 3, 3|
+    let kernel = fixed_point_util::make_fixed_points_4d<-10>(s16[1][1][3][3]:[
+        s16[1][1][3]:[[[s16:1024]], ...],
+        s16[1][1][3]:[[[s16:2048]], ...],
+        s16[1][1][3]:[[[s16:3072]], ...]
+    ]);
+    // b = | 1 |
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:1024]);
+
+    // CHANNELS_LAST: x is a 5x5 image of ones with a single channel.
+    let x_last = fixed_point_util::make_fixed_points_3d<-10>(s16[1][5][5]:[s16[1][5]:[s16[1]:[s16:1024], ...], ...]);
+
+    // expected =
+    //  | 19, 19, 19|
+    //  | 19, 19, 19|
+    //  | 19, 19, 19|
+    let want_last = fixed_point_util::make_fixed_points_3d<-10>(s16[1][3][3]:[
+        s16[1][3]:[[s16:19456], ...], ...]);
+    assert_eq(want_last, conv2d_latency_default(x_last, kernel, offsets));
+
+    // CHANNELS_FIRST: same data, indexed [channel][row][col].
+    let x_first = fixed_point_util::make_fixed_points_3d<-10>(s16[5][5][1]:[s16[5][5]:[s16[5]:[s16:1024, ...], ...]]);
+    let want_first = fixed_point_util::make_fixed_points_3d<-10>(s16[3][3][1]:[
+            s16[3][3]:[s16[3]:[s16:19456, ...], ...]]);
+    assert_eq(want_first, conv2d_latency_default_first(x_first, kernel, offsets));
+}
+
+// Row-varying image, so each output row gets a different value:
+//   row 0: 3 * (1*1 + 0*2 + 2*3) = 21
+//   row 1: 3 * (0*1 + 2*2 + 0*3) = 12
+//   row 2: 3 * (2*1 + 0*2 + 1*3) = 15
+#[test]
+fn conv2d_latency_test_pattern() {
+    // x =
+    //  | 1, 1, 1, 1, 1|
+    //  | 0, 0, 0, 0, 0|
+    //  | 2, 2, 2, 2, 2|
+    //  | 0, 0, 0, 0, 0|
+    //  | 1, 1, 1, 1, 1|
+    let image = fixed_point_util::make_fixed_points_3d<-10>(s16[1][5][5]:[
+        s16[1][5]:[[s16:1024], ...],
+        s16[1][5]:[[s16:0], ...],
+        s16[1][5]:[[s16:2048], ...],
+        s16[1][5]:[[s16:0], ...],
+        s16[1][5]:[[s16:1024], ...]
+    ]);
+
+    // w =
+    //  | 1, 1, 1|
+    //  | 2, 2, 2|
+    //  | 3, 3, 3|
+    let kernel = fixed_point_util::make_fixed_points_4d<-10>(s16[1][1][3][3]:[
+        s16[1][1][3]:[[[s16:1024]], ...],
+        s16[1][1][3]:[[[s16:2048]], ...],
+        s16[1][1][3]:[[[s16:3072]], ...]
+    ]);
+    // b = | 0 |
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:0]);
+
+    // expected =
+    //  | 21, 21, 21|
+    //  | 12, 12, 12|
+    //  | 15, 15, 15|
+    let want = fixed_point_util::make_fixed_points_3d<-10>(s16[1][3][3]:[
+        s16[1][3]:[[s16:21504], ...],
+        s16[1][3]:[[s16:12288], ...],
+        s16[1][3]:[[s16:15360], ...]
+    ]);
+    assert_eq(want, conv2d_latency_default(image, kernel, offsets));
+}
+
+// One input channel, three filters (rows 1/2/3, all ones, all zeros) and a
+// per-filter bias; checks that filters and biases are paired correctly.
+#[test]
+fn conv2d_latency_test_mutiple_filters() {
+    // x =
+    //  | 1, 1, 1, 1, 1|
+    //  | 0, 0, 0, 0, 0|
+    //  | 2, 2, 2, 2, 2|
+    //  | 0, 0, 0, 0, 0|
+    //  | 1, 1, 1, 1, 1|
+    let image = fixed_point_util::make_fixed_points_3d<-10>(s16[1][5][5]:[
+        s16[1][5]:[[s16:1024], ...],
+        s16[1][5]:[[s16:0], ...],
+        s16[1][5]:[[s16:2048], ...],
+        s16[1][5]:[[s16:0], ...],
+        s16[1][5]:[[s16:1024], ...]
+    ]);
+
+    // w (one 3x3 kernel per filter) =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 2, 2, 2|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 3, 3, 3|  | 1, 1, 1|  | 0, 0, 0|
+    let kernel = fixed_point_util::make_fixed_points_4d<-10>(s16[3][1][3][3]:[
+        s16[3][1][3]:[[[s16:1024, 1024, 0]], ...],
+        s16[3][1][3]:[[[s16:2048, 1024, 0]], ...],
+        s16[3][1][3]:[[[s16:3072, 1024, 0]], ...],
+    ]);
+
+    // b = | 0, 0, -2|
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[3]:[s16:0, 0, -2048]);
+
+    // expected (one 3x3 image per filter) =
+    //  | 21, 21, 21|  | 9, 9, 9|  | -2, -2, -2|
+    //  | 12, 12, 12|  | 6, 6, 6|  | -2, -2, -2|
+    //  | 15, 15, 15|  | 9, 9, 9|  | -2, -2, -2|
+    let want = fixed_point_util::make_fixed_points_3d<-10>(s16[3][3][3]:[
+        s16[3][3]:[[s16:21504, 9216, -2048], ...],
+        s16[3][3]:[[s16:12288, 6144, -2048], ...],
+        s16[3][3]:[[s16:15360, 9216, -2048], ...]
+    ]);
+    assert_eq(want, conv2d_latency_default(image, kernel, offsets));
+}
+
+// Three input channels reduced by a single filter; checks that contributions
+// from all channels are summed into one output pixel:
+//   row 0: 3 * (1*1 + 0*2 + 2*1) + 3 * (1 + 1 + 1) + 0 = 18
+//   row 1: 3 * (0*1 + 2*2 + 0*1) + 9 + 0 = 21
+//   row 2: 3 * (2*1 + 0*2 + 1*1) + 9 + 0 = 18
+#[test]
+fn conv2d_latency_test_mutiple_channels() {
+    // x (one 5x5 image per channel) =
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 2, 2, 2, 2, 2|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    let x = fixed_point_util::make_fixed_points_3d<-10>(s16[3][5][5]:[
+        s16[3][5]:[[s16:1024, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:2048, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:1024, 1024, 0], ...]
+    ]);
+
+    // w (one 3x3 kernel per input channel) =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 2, 2, 2|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+    // Laid out as [kernel row][kernel col][in_channel][filter].
+    let w = fixed_point_util::make_fixed_points_4d<-10>(s16[1][3][3][3]:[
+        s16[1][3][3]:[[[s16:1024], [s16:1024], [s16:0]], ...],
+        s16[1][3][3]:[[[s16:2048], [s16:1024], [s16:0]], ...],
+        s16[1][3][3]:[[[s16:1024], [s16:1024], [s16:0]], ...]
+    ]);
+    // b = | 0 |
+    let b = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:0]);
+
+    // expected =
+    //  | 18, 18, 18|
+    //  | 21, 21, 21|
+    //  | 18, 18, 18|
+    let expected = fixed_point_util::make_fixed_points_3d<-10>(s16[1][3][3]:[
+        s16[1][3]:[[s16:18432], ...],
+        s16[1][3]:[[s16:21504], ...],
+        s16[1][3]:[[s16:18432], ...]
+    ]);
+    assert_eq(expected, conv2d_latency_default(x, w, b));
+}
+
+// Three input channels and three filters at once.
+#[test]
+fn conv2d_latency_test_mutiple_channels_and_filters() {
+    // x (one 5x5 image per channel) =
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 2, 2, 2, 2, 2|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    let image = fixed_point_util::make_fixed_points_3d<-10>(s16[3][5][5]:[
+        s16[3][5]:[[s16:1024, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:2048, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:1024, 1024, 0], ...]
+    ]);
+
+    // w, filter 0 (one 3x3 kernel per input channel) =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 2, 2, 2|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+
+    // w, filter 1 =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+
+    // w, filter 2 =
+    //  | 0, 0, 0|  | 0, 0, 0|  | 0, 0, 0|
+    //  | 0, 0, 0|  | 0, 0, 0|  | 0, 0, 0|
+    //  | 0, 0, 0|  | 0, 0, 0|  | 0, 0, 0|
+    let kernel = fixed_point_util::make_fixed_points_4d<-10>(s16[3][3][3][3]:[
+        s16[3][3][3]:[[
+            [s16:1024, 1024, 0],
+            [s16:1024, 1024, 0],
+            [s16:0, 1024, 0]
+        ], ...],
+        s16[3][3][3]:[[
+            [s16:2048, 1024, 0],
+            [s16:1024, 1024, 0],
+            [s16:0, 1024, 0]
+        ], ...],
+        s16[3][3][3]:[[
+            [s16:1024, 1024, 0],
+            [s16:1024, 1024, 0],
+            [s16:0, 1024, 0]
+        ], ...]
+    ]);
+    // b = | 0, 0, 0|
+    let offsets = fixed_point_util::make_fixed_points_1d<-10>(s16[3]:[s16:0, 0, 0]);
+
+    // expected (one 3x3 image per filter) =
+    //  | 18, 18, 18|  | 18, 18, 18|  | 0, 0, 0|
+    //  | 21, 21, 21|  | 15, 15, 15|  | 0, 0, 0|
+    //  | 18, 18, 18|  | 18, 18, 18|  | 0, 0, 0|
+    let want = fixed_point_util::make_fixed_points_3d<-10>(s16[3][3][3]:[
+        s16[3][3]:[[s16:18432, 18432, 0], ...],
+        s16[3][3]:[[s16:21504, 15360, 0], ...],
+        s16[3][3]:[[s16:18432, 18432, 0], ...]
+    ]);
+    assert_eq(want, conv2d_latency_default(image, kernel, offsets));
+}
+
+// Two convolutions chained: the 3x3x2 output of the first layer (including a
+// negative bias) is fed to a second layer with an all-ones 3x3x2 kernel, which
+// reduces it to a single value.
+#[test]
+fn conv2d_latency_test_two_layers() {
+    // x (one 5x5 image per channel) =
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 2, 2, 2, 2, 2|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 0, 0, 0, 0, 0|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    //  | 1, 1, 1, 1, 1|  | 1, 1, 1, 1, 1|  | 0, 0, 0, 0, 0|
+    let image = fixed_point_util::make_fixed_points_3d<-10>(s16[3][5][5]:[
+        s16[3][5]:[[s16:1024, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:2048, 1024, 0], ...],
+        s16[3][5]:[[s16:0, 1024, 0], ...],
+        s16[3][5]:[[s16:1024, 1024, 0], ...]
+    ]);
+
+    // w0, filter 0 (one 3x3 kernel per input channel) =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 2, 2, 2|  | 1, 1, 1|  | 0, 0, 0|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 0, 0, 0|
+
+    // w0, filter 1 =
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|  | 1, 1, 1|
+    let kernel_0 = fixed_point_util::make_fixed_points_4d<-10>(s16[2][3][3][3]:[
+        s16[2][3][3]:[[
+            [s16:1024, 1024],
+            [s16:1024, 1024],
+            [s16:0, 1024]
+        ], ...],
+        s16[2][3][3]:[[
+            [s16:2048, 1024],
+            [s16:1024, 1024],
+            [s16:0, 1024]
+        ], ...],
+        s16[2][3][3]:[[
+            [s16:1024, 1024],
+            [s16:1024, 1024],
+            [s16:0, 1024]
+        ], ...]
+    ]);
+    // b0 = | -17, -17|
+    let offsets_0 = fixed_point_util::make_fixed_points_1d<-10>(s16[2]:[-17408, -17408]);
+
+    // w1 (one 3x3 kernel per input channel) =
+    //  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|
+    //  | 1, 1, 1|  | 1, 1, 1|
+    let kernel_1 = fixed_point_util::make_fixed_points_4d<-10>(s16[1][2][3][3]:[
+        s16[1][2][3]:[s16[1][2]:[[s16:1024], ...], ...], ...
+    ]);
+    // b1 = | 0 |
+    let offsets_1 = fixed_point_util::make_fixed_points_1d<-10>(s16[1]:[s16:0]);
+
+    // expected = | 18 |
+    let want = fixed_point_util::make_fixed_points_3d<-10>(s16[1][1][1]:[[
+        [s16:18432],
+    ]]);
+
+    let hidden = conv2d_latency_default(image, kernel_0, offsets_0);
+    let got = conv2d_latency_default(hidden, kernel_1, offsets_1);
+    assert_eq(want, got);
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/data_format.x b/hls4ml/templates/xls/firmware/nnet_utils/data_format.x
new file mode 100644
index 0000000000..f115613bfa
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/data_format.x
@@ -0,0 +1,61 @@
+pub enum DataFormat: u1 {
+    CHANNELS_LAST = 0,  // raw dims are (size..., channels): channel axis comes last
+    CHANNELS_FIRST = 1  // raw dims are (channels, size...): channel axis comes first
+}
+
+pub const CHANNELS_LAST = DataFormat::CHANNELS_LAST;
+pub const CHANNELS_FIRST = DataFormat::CHANNELS_FIRST;
+
+pub fn to_size_chans(dim_0: u32, dim_1: u32, data_format: DataFormat) -> u32[2] {
+    // Reorders the two raw dims of a 1D feature map into [size, channels].
+    // CHANNELS_LAST is laid out as (size, channels) and passes through unchanged;
+    // CHANNELS_FIRST is laid out as (channels, size), so the two dims are swapped.
+    if data_format == DataFormat::CHANNELS_FIRST { [dim_1, dim_0] } else { [dim_0, dim_1] }
+}
+
+pub fn from_size_chans(size: u32, channels: u32, data_format: DataFormat) -> u32[2] {
+    // Inverse of `to_size_chans`: turns [size, channels] back into the raw dim
+    // pair in the order used by `data_format`, i.e. (size, channels) for
+    // CHANNELS_LAST and (channels, size) for CHANNELS_FIRST.
+    if data_format == DataFormat::CHANNELS_FIRST { [channels, size] } else { [size, channels] }
+}
+
+pub fn to_height_width_chans(dim_0: u32, dim_1: u32, dim_2: u32, data_format: DataFormat) -> u32[3] {
+    // Reorders the three raw dims of a 2D feature map into [height, width, channels].
+    // CHANNELS_LAST is already (height, width, channels); CHANNELS_FIRST is
+    // (channels, height, width), so its leading channel dim is rotated to the back.
+    if data_format == DataFormat::CHANNELS_FIRST { [dim_1, dim_2, dim_0] } else { [dim_0, dim_1, dim_2] }
+}
+
+pub fn from_height_width_chans(height: u32, width: u32, channels: u32, data_format: DataFormat) -> u32[3] {
+    // Inverse of `to_height_width_chans`: turns [height, width, channels] back into
+    // the raw dim triple in the order used by `data_format`, i.e. unchanged for
+    // CHANNELS_LAST and (channels, height, width) for CHANNELS_FIRST.
+    if data_format == DataFormat::CHANNELS_FIRST { [channels, height, width] } else { [height, width, channels] }
+}
+
+#[test]
+fn test_data_format() {
+    let size = u32:4;
+    let height = u32:1;
+    let width = u32:2;
+    let channels = u32:3;
+    for (data_format, _) in [CHANNELS_LAST, CHANNELS_FIRST] {
+        let size_chans = from_size_chans(size, channels, data_format);
+        assert_eq(
+            to_size_chans(size_chans[0], size_chans[1], data_format),
+            [size, channels]
+        );
+
+        let height_width_chans = from_height_width_chans(height, width, channels, data_format);
+        assert_eq(
+            to_height_width_chans(
+                height_width_chans[0],
+                height_width_chans[1],
+                height_width_chans[2],
+                data_format
+            ),
+            [height, width, channels]
+        );
+    }(())
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/dense.x b/hls4ml/templates/xls/firmware/nnet_utils/dense.x
new file mode 100644
index 0000000000..d165b3fe25
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/dense.x
@@ -0,0 +1,205 @@
+import std;
+import fixed_point;
+import ap_types.fixed_point_util;
+import nnet_utils.activations;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// y = Wx + b, computed per output as resize(dot_prod(x, w[i]) + bias[i]).
+// When called must specify the fixed point precision that is in the output
+// (OUT_NB, OUT_BE) and the ROUNDING/OVERFLOW modes used for the final resize.
+pub fn dense
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    IN_NB: u32, IN_BE: s32,
+    WEIGHTS_NB: u32, WEIGHTS_BE: s32,
+    BIAS_NB: u32, BIAS_BE: s32,
+    IN_DIM: u32, OUT_DIM: u32>(
+        x: FixedPoint<IN_NB, IN_BE>[IN_DIM],
+        w: FixedPoint<WEIGHTS_NB, WEIGHTS_BE>[IN_DIM][OUT_DIM],
+        bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_DIM]
+    ) -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM] {
+    // Fold over the output indices; `z` is the result array built so far.
+    for (i, z) in u32:0..OUT_DIM {
+        let vec_prod  = fixed_point_util::dot_prod(x, w[i]); // w[i]: IN_DIM weights of output i
+        let with_bias = fixed_point::add(vec_prod, bias[i]);
+        let with_bias_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(with_bias);
+        update(z, i, with_bias_out)
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM]>())
+}
+
+// TODO: used only for tests
+// y = relu(Wx + b)
+// When called must specify the fixed point precision that is in the output
+// (OUT_NB, OUT_BE); the same precision and modes are forwarded to both steps.
+pub fn dense_relu
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    IN_NB: u32, IN_BE: s32,
+    WEIGHTS_NB: u32, WEIGHTS_BE: s32,
+    BIAS_NB: u32, BIAS_BE: s32,
+    IN_DIM: u32, OUT_DIM: u32>(
+        x: FixedPoint<IN_NB, IN_BE>[IN_DIM],
+        w: FixedPoint<WEIGHTS_NB, WEIGHTS_BE>[IN_DIM][OUT_DIM],
+        bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_DIM]
+    ) -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM] {
+    // Dense layer first, then ReLU applied to its already-resized output.
+    let y = dense<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x, w, bias);
+    activations::relu<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y)
+}
+
+// Testing
+
+const NB_COMMON = u32:16;
+const BE_COMMON = s32:-10;
+const ROUNDING_COMMON = RoundingMode::TRN;
+const OVERFLOW_COMMON = OverflowMode::WRAP;
+
+type FP = FixedPoint<NB_COMMON, BE_COMMON>;
+
+fn make_fixed(x:sN[NB_COMMON]) -> FP {
+    fixed_point::make_fixed_point<BE_COMMON>(x)
+}
+
+const FXP_6_75_NEG = make_fixed(-6912);
+const FXP_4_0_NEG  = make_fixed(-4096);
+const FXP_3_0_NEG  = make_fixed(-3072);
+const FXP_0_0      = make_fixed(0);
+const FXP_0_5      = make_fixed(512);
+const FXP_1_0      = make_fixed(1024);
+const FXP_1_5      = make_fixed(1536);
+const FXP_2_0      = make_fixed(2048);
+const FXP_2_25     = make_fixed(2304);
+const FXP_4_5      = make_fixed(4608);
+const FXP_5_5      = make_fixed(5632);
+const FXP_6_75     = make_fixed(6912);
+const FXP_12_0     = make_fixed(12288);
+const FXP_13_5     = make_fixed(13824);
+
+const NB_IN = u32:8;
+const BE_IN = s32:-4;
+const NB_WEIGHTS = u32:10;
+const BE_WEIGHTS = s32:-6;
+const NB_BIAS = u32:12;
+const BE_BIAS = s32:-8;
+const NB_OUT = u32:14;
+const BE_OUT = s32:-6;
+
+fn make_in<DIM: u32>(x: sN[NB_IN][DIM]) -> FixedPoint<NB_IN, BE_IN>[DIM] {
+    fixed_point_util::make_fixed_points_1d<BE_IN>(x)
+}
+
+fn make_weights
+    <IN_DIM: u32, OUT_DIM: u32>
+    (x: sN[NB_WEIGHTS][IN_DIM][OUT_DIM])
+    -> FixedPoint<NB_WEIGHTS, BE_WEIGHTS>[IN_DIM][OUT_DIM] {
+    fixed_point_util::make_fixed_points_2d<BE_WEIGHTS>(x)
+}
+
+fn make_bias<DIM: u32>(x: sN[NB_BIAS][DIM]) -> FixedPoint<NB_BIAS, BE_BIAS>[DIM] {
+    fixed_point_util::make_fixed_points_1d<BE_BIAS>(x)
+}
+
+fn make_out<DIM: u32>(x: sN[NB_OUT][DIM]) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    fixed_point_util::make_fixed_points_1d<BE_OUT>(x)
+}
+
+
+#[test]
+fn dense_relu_test_pos() {
+    let x  = [FXP_1_5, FXP_1_5]; // input [1.5, 1.5]
+    let w1 = [
+        [FXP_1_5, FXP_1_5],
+        [FXP_1_5, FXP_1_5]
+    ];
+    let b1 = [FXP_0_0, FXP_0_0];
+    let expected = [FXP_4_5, FXP_4_5]; // 1.5 * 1.5 + 1.5 * 1.5 = 4.5, unchanged by ReLU
+    assert_eq(expected, dense_relu<NB_COMMON, BE_COMMON, ROUNDING_COMMON, OVERFLOW_COMMON>(x, w1, b1));
+}
+
+#[test]
+fn dense_relu_test_neg() {
+    let x  = [FXP_1_5, FXP_1_5]; // input [1.5, 1.5]
+    let w1 = [
+        [FXP_1_5, FXP_1_5],
+        [FXP_1_5, FXP_1_5]
+    ];
+    let b1 = [FXP_6_75_NEG, FXP_0_0]; // first output becomes 4.5 - 6.75 = -2.25
+    let expected = [FXP_0_0, FXP_4_5]; // ReLU clamps the negative output to 0
+    assert_eq(expected, dense_relu<NB_COMMON, BE_COMMON, ROUNDING_COMMON, OVERFLOW_COMMON>(x, w1, b1));
+}
+
+#[test]
+fn dense_test_different_precisions() {
+    let x = make_in(sN[NB_IN][2]:[24, -8]); // [1.5, -0.5]
+    let w = make_weights(sN[NB_WEIGHTS][2][2]:[
+        [32, 16], // [0.5, 0.25]
+        [-64, 96], // [-1.0, 1.5]
+    ]);
+    let bias = make_bias(sN[NB_BIAS][2]:[128, -64]); // [0.5, -0.25]
+    let expected = make_out(sN[NB_OUT][2]:[72, -160]); // [1.125, -2.5]
+    assert_eq(
+        expected,
+        dense<NB_OUT, BE_OUT, ROUNDING_COMMON, OVERFLOW_COMMON>(x, w, bias));
+}
+
+fn integration_nn
+    <INPUT_D1: u32, INPUT_D2: u32,
+    IN_L1: u32 = {INPUT_D2}, OUT_L1: u32,
+    IN_L2: u32 = {OUT_L1},   OUT_L2: u32>
+    (x: FP[INPUT_D2][INPUT_D1],
+    w1: FP[IN_L1][OUT_L1],
+    b1: FP[OUT_L1],
+    w2: FP[IN_L2][OUT_L2],
+    b2: FP[OUT_L2])
+    -> FP[OUT_L2][INPUT_D1] {
+
+    // ---------------- Layer 1 -----------------
+    let z1 = for (batch_idx, layer1) in 0..INPUT_D1 {
+        update(
+            layer1,
+            batch_idx,
+            dense_relu<NB_COMMON, BE_COMMON, ROUNDING_COMMON, OVERFLOW_COMMON>(x[batch_idx], w1, b1)
+        )
+    }(zero!<FP[OUT_L1][INPUT_D1]>()); // init matrix w/ zeros
+
+    // ---------------- Layer 2 -----------------
+    let z2 = for (batch_idx, layer2) in 0..INPUT_D1 {
+        update(
+            layer2,
+            batch_idx,
+            dense_relu<NB_COMMON, BE_COMMON, ROUNDING_COMMON, OVERFLOW_COMMON>(z1[batch_idx], w2, b2)
+        )
+    }(zero!<FP[OUT_L2][INPUT_D1]>()); // init matrix w/ zeros
+
+    // ------------ Output -------------------
+    z2
+}
+
+#[test]
+fn integration_test() {
+    let x = [
+        [FXP_1_5, FXP_1_5],
+        [FXP_1_5, FXP_1_5]
+    ];
+    let w1 = [
+        [FXP_1_5, FXP_1_5],
+        [FXP_1_5, FXP_1_5]
+    ];
+    let b1 = [FXP_0_0, FXP_0_0];
+    let w2 = [
+        [FXP_1_5, FXP_1_5],
+        [FXP_1_5, FXP_1_5]
+    ];
+    let b2 = [FXP_0_0, FXP_0_0];
+    let expected = [
+        [FXP_13_5, FXP_13_5],
+        [FXP_13_5, FXP_13_5]
+    ];
+    let result = integration_nn(x, w1, b1, w2, b2);
+    assert_eq(expected, result);
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/depthwise_conv.x b/hls4ml/templates/xls/firmware/nnet_utils/depthwise_conv.x
new file mode 100644
index 0000000000..94e4c01c6c
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/depthwise_conv.x
@@ -0,0 +1,406 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+import nnet_utils.data_format;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+pub type DataFormat = data_format::DataFormat;
+
+pub fn depthwise_conv_1d
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    STRIDE: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,
+    DATA_FORMAT: DataFormat,
+    // Input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32,
+    // Kernel
+    KERN_NB: u32, KERN_BE: s32,
+    KERN_SIZE: u32, OUT_FILTERS: u32,
+    // Bias
+    BIAS_NB: u32, BIAS_BE: s32,
+    // Derived input dims (computed from IN_DIM_* and DATA_FORMAT)
+    IN_SIZE: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[0]},
+    IN_CHANNELS: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[1]},
+    DEPTH_MULTIPLIER: u32 = {OUT_FILTERS / IN_CHANNELS}, // output filters per input channel (integer division)
+    // Output size (u32 arithmetic: assumes KERN_SIZE <= IN_SIZE + PAD_LEFT + PAD_RIGHT)
+    OUT_SIZE: u32 = {((IN_SIZE + PAD_LEFT + PAD_RIGHT - KERN_SIZE) / STRIDE) + 1},
+    // Output dims (OUT_SIZE and OUT_FILTERS arranged according to DATA_FORMAT)
+    OUT_DIM_0: u32 = {data_format::from_size_chans(OUT_SIZE, OUT_FILTERS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_size_chans(OUT_SIZE, OUT_FILTERS, DATA_FORMAT)[1]},
+    // Precision of the product and of the accumulator over the kernel taps
+    MUL_BE: s32 = {IN_BE + KERN_BE},
+    MUL_NB: u32 = {IN_NB + KERN_NB},
+    CONV_NB: u32 = {MUL_NB + std::clog2(KERN_SIZE)}, // extra bits for summing KERN_SIZE products
+    CONV_BE: s32 = {MUL_BE}
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0],
+    kernel: FixedPoint<KERN_NB, KERN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    // 1D depthwise convolution: every output filter reads exactly one input channel.
+    for (out_i_0, out_2d) in 0..OUT_DIM_0 {
+        let out_1d = for (out_i_1, out_1d) in 0..OUT_DIM_1 {
+            // Translate the raw output indices into (position, filter) for this DATA_FORMAT.
+            let ij = data_format::to_size_chans(out_i_0, out_i_1, DATA_FORMAT);
+            let out_pos = ij[0];
+            let filter_idx = ij[1];
+            let ch_idx = filter_idx / DEPTH_MULTIPLIER; // input channel feeding this filter
+            let depth_idx = filter_idx % DEPTH_MULTIPLIER; // which of that channel's kernels
+            // First input position under the kernel window; negative inside the left padding.
+            let in_pos: s32 = ((out_pos as s32) * (STRIDE as s32)) - (PAD_LEFT as s32);
+
+            let conv_pixel = for (k, acc) in 0..KERN_SIZE {
+                let ii = in_pos + (k as s32);
+                // Taps that fall outside the input read as zero (implicit zero padding).
+                let val = if ii < s32:0 || ii >= IN_SIZE as s32 {
+                    zero!<FixedPoint<IN_NB, IN_BE>>()
+                } else {
+                    let ii = ii as u32;
+                    match DATA_FORMAT {
+                        DataFormat::CHANNELS_LAST  => x[ii][ch_idx],
+                        DataFormat::CHANNELS_FIRST => x[ch_idx][ii]
+                    }
+                };
+
+                let w = kernel[k][ch_idx][depth_idx];
+                fixed_point_util::fmadd_already_widened(val, w, acc)
+            }(zero!<FixedPoint<CONV_NB, CONV_BE>>());
+            // Add the per-filter bias, then convert to the requested output format.
+            let conv_pixel_with_bias = fixed_point::add(conv_pixel, bias[filter_idx]);
+            let conv_pixel_with_bias =
+                fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(conv_pixel_with_bias);
+
+            update(out_1d, out_i_1, conv_pixel_with_bias)
+
+        }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1]>());
+
+        update(out_2d, out_i_0, out_1d)
+
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0]>())
+}
+
+pub fn depthwise_conv_2d
+    <OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    STRIDE_HEIGHT: u32, STRIDE_WIDTH: u32,
+    PAD_TOP: u32, PAD_BOTTOM: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,
+    DATA_FORMAT: DataFormat,
+    // Input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,
+    // Kernel
+    KERN_NB: u32, KERN_BE: s32,
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, OUT_FILTERS: u32,
+    // Bias
+    BIAS_NB: u32, BIAS_BE: s32,
+    // Derived input dims (computed from IN_DIM_* and DATA_FORMAT)
+    IN_HEIGHT: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[0]},
+    IN_WIDTH: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[1]},
+    IN_CHANNELS: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[2]},
+    DEPTH_MULTIPLIER: u32 = {OUT_FILTERS / IN_CHANNELS}, // output filters per input channel (integer division)
+    // Output dims (u32 arithmetic: assumes the kernel fits inside the padded input)
+    OUT_HEIGHT: u32 = {((IN_HEIGHT + PAD_TOP + PAD_BOTTOM - KERN_HEIGHT) / STRIDE_HEIGHT) + 1},
+    OUT_WIDTH: u32 = {((IN_WIDTH + PAD_LEFT + PAD_RIGHT - KERN_WIDTH) / STRIDE_WIDTH) + 1},
+    OUT_DIM_0: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[1]},
+    OUT_DIM_2: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, OUT_FILTERS, DATA_FORMAT)[2]},
+    // Precision of the product and of the accumulator over the kernel taps
+    MUL_BE: s32 = {IN_BE + KERN_BE},
+    MUL_NB: u32 = {IN_NB + KERN_NB},
+    CONV_NB: u32 = {MUL_NB + std::clog2(KERN_HEIGHT * KERN_WIDTH)}, // extra bits for summing all taps
+    CONV_BE: s32 = {MUL_BE}
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0],
+    kernel: FixedPoint<KERN_NB, KERN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<BIAS_NB, BIAS_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    // 2D depthwise convolution: every output filter reads exactly one input channel.
+    for (out_i_0, out_3d) in 0..OUT_DIM_0 {
+        let out_2d = for (out_i_1, out_2d) in 0..OUT_DIM_1 {
+            let out_1d = for (out_i_2, out_1d) in 0..OUT_DIM_2 {
+                let ijf = data_format::to_height_width_chans(out_i_0, out_i_1, out_i_2, DATA_FORMAT);
+                let out_i = ijf[0]; // output row
+                let out_j = ijf[1]; // output column
+                let filter_idx = ijf[2];
+                let ch_idx = filter_idx / DEPTH_MULTIPLIER; // input channel feeding this filter
+                let depth_idx = filter_idx % DEPTH_MULTIPLIER; // which of that channel's kernels
+                // Top-left input coordinate of the kernel window; negative inside the padding.
+                let in_i: s32 = ((out_i as s32) * (STRIDE_HEIGHT as s32)) - (PAD_TOP as s32);
+                let in_j: s32 = ((out_j as s32) * (STRIDE_WIDTH as s32)) - (PAD_LEFT as s32);
+
+                let conv_pixel = for (di, pixel_ch) in 0..KERN_HEIGHT {
+                    for (dj, acc) in 0..KERN_WIDTH {
+                        let ii = in_i + (di as s32);
+                        let jj = in_j + (dj as s32);
+                        // Taps that fall outside the input read as zero (implicit zero padding).
+                        let val = if ii < s32:0
+                                || ii >= IN_HEIGHT as s32
+                                || jj < s32:0
+                                || jj >= IN_WIDTH as s32 {
+                            zero!<FixedPoint<IN_NB, IN_BE>>()
+                        } else {
+                            let ii = ii as u32;
+                            let jj = jj as u32;
+                            match DATA_FORMAT {
+                                DataFormat::CHANNELS_LAST  => x[ii][jj][ch_idx],
+                                DataFormat::CHANNELS_FIRST => x[ch_idx][ii][jj]
+                            }
+                        };
+
+                        let w = kernel[di][dj][ch_idx][depth_idx];
+                        fixed_point_util::fmadd_already_widened(val, w, acc)
+                    }(pixel_ch)
+                }(zero!<FixedPoint<CONV_NB, CONV_BE>>());
+                // Add the per-filter bias, then convert to the requested output format.
+                let conv_pixel_with_bias = fixed_point::add(conv_pixel, bias[filter_idx]);
+                let conv_pixel_with_bias =
+                    fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(conv_pixel_with_bias);
+
+                update(out_1d, out_i_2, conv_pixel_with_bias)
+
+            }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2]>());
+
+            update(out_2d, out_i_1, out_1d)
+
+        }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1]>());
+
+        update(out_3d, out_i_0, out_2d)
+
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>())
+}
+
+// Testing
+
+fn depthwise_conv_1d_default<
+    IN_NB: u32, IN_BE: s32,
+    IN_SIZE: u32, IN_CHANNELS: u32,
+    KERN_SIZE: u32, DEPTH_MULTIPLIER: u32,
+    OUT_FILTERS: u32 = {IN_CHANNELS * DEPTH_MULTIPLIER},
+    OUT_SIZE: u32 = {IN_SIZE + u32:1 - KERN_SIZE},
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE: u32 = {u32:1},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_CHANNELS][IN_SIZE],
+    weights: FixedPoint<IN_NB, IN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_FILTERS][OUT_SIZE] {
+
+    depthwise_conv_1d<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_LAST
+    >(x, weights, bias)
+}
+
+fn depthwise_conv_1d_default_first<
+    IN_NB: u32, IN_BE: s32,
+    IN_SIZE: u32, IN_CHANNELS: u32,
+    KERN_SIZE: u32, DEPTH_MULTIPLIER: u32,
+    OUT_FILTERS: u32 = {IN_CHANNELS * DEPTH_MULTIPLIER},
+    OUT_SIZE: u32 = {IN_SIZE + u32:1 - KERN_SIZE},
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE: u32 = {u32:1},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_SIZE][IN_CHANNELS],
+    weights: FixedPoint<IN_NB, IN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_SIZE],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_SIZE][OUT_FILTERS] {
+
+    depthwise_conv_1d<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_FIRST
+    >(x, weights, bias)
+}
+
+fn depthwise_conv_2d_default<
+    IN_NB: u32, IN_BE: s32,
+    IN_HEIGHT: u32, IN_WIDTH: u32, IN_CHANNELS: u32,
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, DEPTH_MULTIPLIER: u32,
+    OUT_FILTERS: u32 = {IN_CHANNELS * DEPTH_MULTIPLIER},
+    OUT_HEIGHT: u32 = {IN_HEIGHT + u32:1 - KERN_HEIGHT},
+    OUT_WIDTH: u32 = {IN_WIDTH + u32:1 - KERN_WIDTH},
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE_HEIGHT: u32 = {u32:1},
+    STRIDE_WIDTH: u32 = {u32:1},
+    PAD_TOP: u32 = {u32:0},
+    PAD_BOTTOM: u32 = {u32:0},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_CHANNELS][IN_WIDTH][IN_HEIGHT],
+    weights: FixedPoint<IN_NB, IN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_FILTERS][OUT_WIDTH][OUT_HEIGHT] {
+
+    depthwise_conv_2d<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE_HEIGHT, STRIDE_WIDTH,
+        PAD_TOP, PAD_BOTTOM,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_LAST
+    >(x, weights, bias)
+}
+
+fn depthwise_conv_2d_default_first<
+    IN_NB: u32, IN_BE: s32,
+    IN_HEIGHT: u32, IN_WIDTH: u32, IN_CHANNELS: u32,
+    KERN_HEIGHT: u32, KERN_WIDTH: u32, DEPTH_MULTIPLIER: u32,
+    OUT_FILTERS: u32 = {IN_CHANNELS * DEPTH_MULTIPLIER},
+    OUT_HEIGHT: u32 = {IN_HEIGHT + u32:1 - KERN_HEIGHT},
+    OUT_WIDTH: u32 = {IN_WIDTH + u32:1 - KERN_WIDTH},
+    OUT_NB: u32 = {IN_NB},
+    OUT_BE: s32 = {IN_BE},
+    STRIDE_HEIGHT: u32 = {u32:1},
+    STRIDE_WIDTH: u32 = {u32:1},
+    PAD_TOP: u32 = {u32:0},
+    PAD_BOTTOM: u32 = {u32:0},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0}
+>
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_WIDTH][IN_HEIGHT][IN_CHANNELS],
+    weights: FixedPoint<IN_NB, IN_BE>[DEPTH_MULTIPLIER][IN_CHANNELS][KERN_WIDTH][KERN_HEIGHT],
+    bias: FixedPoint<IN_NB, IN_BE>[OUT_FILTERS]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_WIDTH][OUT_HEIGHT][OUT_FILTERS] {
+
+    depthwise_conv_2d<
+        OUT_NB, OUT_BE,
+        RoundingMode::TRN, OverflowMode::WRAP,
+        STRIDE_HEIGHT, STRIDE_WIDTH,
+        PAD_TOP, PAD_BOTTOM,
+        PAD_LEFT, PAD_RIGHT,
+        DataFormat::CHANNELS_FIRST
+    >(x, weights, bias)
+}
+
+#[test]
+fn test_zero_1d() {
+    let x = zero!<FixedPoint<16, -10>[2][5]>();
+    let w = zero!<FixedPoint<16, -10>[2][2][3]>();
+    let b = zero!<FixedPoint<16, -10>[4]>();
+
+    let expected = zero!<FixedPoint<16, -10>[4][3]>();
+    assert_eq(expected, depthwise_conv_1d_default(x, w, b));
+
+    let x_first = zero!<FixedPoint<16, -10>[5][2]>();
+    let expected_first = zero!<FixedPoint<16, -10>[3][4]>();
+    assert_eq(expected_first, depthwise_conv_1d_default_first(x_first, w, b));
+}
+
+#[test]
+fn test_depthwise_conv_1d_uniform_io() {
+    let x = fixed_point_util::make_fixed_points_2d<-10>(s16[2][4]:[
+        s16[2]:[s16:1024, s16:2048],
+        s16[2]:[s16:1024, s16:2048],
+        s16[2]:[s16:1024, s16:2048],
+        s16[2]:[s16:1024, s16:2048]
+    ]);
+    let w = fixed_point_util::make_fixed_points_3d<-10>(s16[2][2][2]:[
+        s16[2][2]:[[s16:1024, s16:1024], [s16:1024, s16:1024]],
+        s16[2][2]:[[s16:1024, s16:2048], [s16:1024, s16:2048]]
+    ]);
+    let b = fixed_point_util::make_fixed_points_1d<-10>(s16[4]:[s16:0, s16:1024, s16:0, s16:1024]);
+
+    let expected = fixed_point_util::make_fixed_points_2d<-10>(s16[4][3]:[
+        s16[4]:[s16:2048, s16:4096, s16:4096, s16:7168],
+        s16[4]:[s16:2048, s16:4096, s16:4096, s16:7168],
+        s16[4]:[s16:2048, s16:4096, s16:4096, s16:7168]
+    ]);
+    assert_eq(expected, depthwise_conv_1d_default(x, w, b));
+
+    let x_first = fixed_point_util::make_fixed_points_2d<-10>(s16[4][2]:[
+        s16[4]:[s16:1024, s16:1024, s16:1024, s16:1024],
+        s16[4]:[s16:2048, s16:2048, s16:2048, s16:2048]
+    ]);
+    let expected_first = fixed_point_util::make_fixed_points_2d<-10>(s16[3][4]:[
+        s16[3]:[s16:2048, s16:2048, s16:2048],
+        s16[3]:[s16:4096, s16:4096, s16:4096],
+        s16[3]:[s16:4096, s16:4096, s16:4096],
+        s16[3]:[s16:7168, s16:7168, s16:7168]
+    ]);
+    assert_eq(expected_first, depthwise_conv_1d_default_first(x_first, w, b));
+}
+
+#[test]
+fn test_zero_2d() {
+    let x = zero!<FixedPoint<16, -10>[2][4][4]>();
+    let w = zero!<FixedPoint<16, -10>[2][2][2][2]>();
+    let b = zero!<FixedPoint<16, -10>[4]>();
+
+    let expected = zero!<FixedPoint<16, -10>[4][3][3]>();
+    assert_eq(expected, depthwise_conv_2d_default(x, w, b));
+
+    let x_first = zero!<FixedPoint<16, -10>[4][4][2]>();
+    let expected_first = zero!<FixedPoint<16, -10>[3][3][4]>();
+    assert_eq(expected_first, depthwise_conv_2d_default_first(x_first, w, b));
+}
+
+#[test]
+fn test_depthwise_conv_2d_uniform_io() {
+    let x = fixed_point_util::make_fixed_points_3d<-10>(s16[2][3][3]:[
+        s16[2][3]:[[s16:1024, s16:2048], [s16:1024, s16:2048], [s16:1024, s16:2048]],
+        s16[2][3]:[[s16:1024, s16:2048], [s16:1024, s16:2048], [s16:1024, s16:2048]],
+        s16[2][3]:[[s16:1024, s16:2048], [s16:1024, s16:2048], [s16:1024, s16:2048]]
+    ]);
+    let w = fixed_point_util::make_fixed_points_4d<-10>(s16[2][2][2][2]:[
+        s16[2][2][2]:[
+            [[s16:1024, s16:1024], [s16:1024, s16:1024]],
+            [[s16:1024, s16:0], [s16:1024, s16:0]]
+        ],
+        s16[2][2][2]:[
+            [[s16:1024, s16:0], [s16:1024, s16:0]],
+            [[s16:1024, s16:1024], [s16:1024, s16:1024]]
+        ]
+    ]);
+    let b = fixed_point_util::make_fixed_points_1d<-10>(s16[4]:[s16:0, s16:1024, s16:0, s16:1024]);
+
+    let expected = fixed_point_util::make_fixed_points_3d<-10>(s16[4][2][2]:[
+        s16[4][2]:[[s16:4096, s16:3072, s16:8192, s16:5120], [s16:4096, s16:3072, s16:8192, s16:5120]],
+        s16[4][2]:[[s16:4096, s16:3072, s16:8192, s16:5120], [s16:4096, s16:3072, s16:8192, s16:5120]]
+    ]);
+    assert_eq(expected, depthwise_conv_2d_default(x, w, b));
+
+    let x_first = fixed_point_util::make_fixed_points_3d<-10>(s16[3][3][2]:[
+        s16[3][3]:[s16[3]:[s16:1024, s16:1024, s16:1024], s16[3]:[s16:1024, s16:1024, s16:1024], s16[3]:[s16:1024, s16:1024, s16:1024]],
+        s16[3][3]:[s16[3]:[s16:2048, s16:2048, s16:2048], s16[3]:[s16:2048, s16:2048, s16:2048], s16[3]:[s16:2048, s16:2048, s16:2048]]
+    ]);
+    let expected_first = fixed_point_util::make_fixed_points_3d<-10>(s16[2][2][4]:[
+        s16[2][2]:[s16[2]:[s16:4096, s16:4096], s16[2]:[s16:4096, s16:4096]],
+        s16[2][2]:[s16[2]:[s16:3072, s16:3072], s16[2]:[s16:3072, s16:3072]],
+        s16[2][2]:[s16[2]:[s16:8192, s16:8192], s16[2]:[s16:8192, s16:8192]],
+        s16[2][2]:[s16[2]:[s16:5120, s16:5120], s16[2]:[s16:5120, s16:5120]]
+    ]);
+    assert_eq(expected_first, depthwise_conv_2d_default_first(x_first, w, b));
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/lookup_table.x b/hls4ml/templates/xls/firmware/nnet_utils/lookup_table.x
new file mode 100644
index 0000000000..8959730f10
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/lookup_table.x
@@ -0,0 +1,252 @@
+import std;
+import fixed_point;
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// Table of values f(x[i]) for i = 0..SIZE (SIZE entries),
+// where x[i] = x_min + i * dx,
+// dx = 2^(LOG2_STEP)
+pub struct LookupTable<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32,
+    LOG2_STEP: s32>{
+    // Only the first key is stored; the other keys follow from SIZE and LOG2_STEP.
+    x_min: FixedPoint<NB_IN, BE_IN>,
+    values: FixedPoint<NB_OUT, BE_OUT>[SIZE]
+}
+
+fn const_validate_lookup_table_params<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32,
+    LOG2_STEP: s32>(){
+    // Static checks on the table geometry; they depend only on the parametrics.
+    const_assert!(SIZE >= 1);
+
+    // Step should not be smaller than allowed by FixedPoint, i.e. 2^(BE_IN)
+    const_assert!(LOG2_STEP >= BE_IN);
+    let SHIFT = (LOG2_STEP - BE_IN) as u32; // step expressed in input LSBs is 1 << SHIFT
+
+    // Check that DELTA = (x_max - x_min) does not overflow
+    let DELTA = ((SIZE - 1) as uN[32 + SHIFT]) << SHIFT; // widened so the shift cannot drop bits
+    let MAX_DELTA = std::unsigned_max_value<NB_IN>();
+    // Compare in a width that holds both operands so neither cast truncates.
+    let NB_MAX = std::max(32 + SHIFT, NB_IN);
+    const_assert!(DELTA as uN[NB_MAX] <= MAX_DELTA as uN[NB_MAX]);
+}
+
+// Check for overflows: static parameter checks plus a runtime check that x_max fits sN[NB_IN]
+fn validate_lookup_table<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32,
+    LOG2_STEP: s32>(
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>
+    ){
+    // Check statically everything that is possible
+    const_validate_lookup_table_params<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>();
+    // Now check for x_max overflow. Use NB_IN + 2 bits: DELTA can need NB_IN unsigned bits,
+    // so x_min + DELTA may exceed sN[NB_IN + 1] and wrap to a value that passes the check.
+    let SHIFT = (LOG2_STEP - BE_IN) as u32;
+    // DELTA = x_max - x_min = step * (SIZE - 1)
+    let DELTA = ((SIZE - 1) as sN[NB_IN + 2]) << SHIFT;
+    let x_min = lut.x_min.significand as sN[NB_IN + 2];
+    let x_max = x_min + DELTA;
+    assert_fmt!(x_max <= std::signed_max_value<NB_IN>() as sN[NB_IN + 2], "lookup_table_x_max_overflow");
+}
+
+// Check arguments and create LUT; values[i] is the entry for key x_min + i * 2^(LOG2_STEP)
+pub fn create<
+    LOG2_STEP: s32,
+    // Other parameters are deduced automatically, so we put them after LOG2_STEP
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32,
+    >(
+        x_min: FixedPoint<NB_IN, BE_IN>,
+        values: FixedPoint<NB_OUT, BE_OUT>[SIZE]
+    ) -> LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP> {
+    // Build the table first: validation takes the whole table and reads its x_min.
+    let lut = LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>{
+        x_min: x_min,
+        values: values
+    };
+    validate_lookup_table(lut); // static parameter checks + x_max overflow assertion
+    lut
+}
+
+pub fn eval<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32, LOG2_STEP: s32>(
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>,
+        fxp_x: FixedPoint<NB_IN, BE_IN>
+    ) -> FixedPoint<NB_OUT, BE_OUT> {
+    // Looks up the entry at index (fxp_x - x_min) >> SHIFT, clamped to [0, SIZE - 1].
+    const_validate_lookup_table_params<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>();
+
+    let SHIFT = (LOG2_STEP - BE_IN) as u32;
+    // Distance from the first key, in units of the input LSB.
+    // add extra bit to avoid overflow
+    let x = fxp_x.significand as sN[NB_IN + 1];
+    let x_min = lut.x_min.significand as sN[NB_IN +1];
+    let delta = x - x_min;
+    // Dividing by the step (1 << SHIFT input LSBs) turns the distance into a table index.
+    let idx = delta >> SHIFT;
+    // clamp
+    let idx = std::max(0, idx) as u32; // inputs below x_min use the first entry
+    let idx = std::min(idx, SIZE - 1); // inputs past the last key use the last entry
+
+    lut.values[idx]
+}
+
+pub fn eval_1d<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32, LOG2_STEP: s32,
+    DIM: u32>(
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>,
+        x: FixedPoint<NB_IN, BE_IN>[DIM]
+    ) -> FixedPoint<NB_OUT, BE_OUT>[DIM] {
+    // Element-wise `eval`: result[i] = eval(lut, x[i]) for every i in 0..DIM.
+    for (i, res) in 0..DIM{
+        update(res, i, eval(lut, x[i]))
+    }(zero!<FixedPoint<NB_OUT, BE_OUT>[DIM]>())
+}
+
+// Evaluate an odd function, f(-x) = - f(x), from a table whose keys are non-negative
+pub fn eval_antisymmetric<
+    OVERFLOW: OverflowMode,
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32, LOG2_STEP: s32>(
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>,
+        x: FixedPoint<NB_IN, BE_IN>
+    ) -> FixedPoint<NB_OUT, BE_OUT> {
+    // Preconditions: the table starts at x_min >= 0 and its first entry is zero.
+    assert_fmt!(lut.x_min.significand >= 0, "eval_antisymmetric_needs_nonnegative_table_x_min");
+    // f(0) == 0
+    assert_fmt!(lut.values[0].significand == 0, "eval_antisymmetric_nonzero_at_zero");
+    // x == 0 -> 0; x > 0 -> direct lookup; x < 0 -> negate the lookup taken at -x.
+    if (x.significand == 0) {
+        zero!<FixedPoint<NB_OUT, BE_OUT>>()
+    }
+    else if (x.significand > 0) {
+        eval(lut, x)
+    }
+    else {
+        let minus_x = fixed_point_util::negate_with_overflow<OverflowMode::SAT>(x); // always SAT, independent of OVERFLOW
+        assert_fmt!(minus_x.significand >= 0, "minus_x_negative");
+        let minus_f = eval(lut, minus_x); // table value at -x
+        fixed_point_util::negate_with_overflow<OVERFLOW>(minus_f) // caller's OVERFLOW applies to the result only
+    }
+}
+
+// =========================================================================
+// --------------------------------- Tests ----------------------------------
+
+fn from_integer<NB: u32, BE: s32, NB_IN: u32>(x:sN[NB_IN]) -> FixedPoint<NB, BE> {
+    let res:FixedPoint<NB_IN,0> = fixed_point::from_integer(x);
+    let res:FixedPoint<NB, BE> = fixed_point::to_common_type<NB, BE>(res);
+    res
+}
+
+fn x_values<
+    NB_IN: u32, BE_IN: s32,
+    NB_OUT: u32, BE_OUT: s32,
+    SIZE: u32, LOG2_STEP: s32>(
+        lut: LookupTable<NB_IN, BE_IN, NB_OUT, BE_OUT, SIZE, LOG2_STEP>
+    ) -> FixedPoint<NB_IN, BE_IN>[SIZE] {
+
+    let SHIFT = (LOG2_STEP - BE_IN) as u32;
+    let step = fixed_point::make_fixed_point<BE_IN>(sN[NB_IN]:1 << SHIFT);
+    let (_, res) = for (i, (x, xs)) in 0..SIZE{
+        let x_next = fixed_point_util::add_already_widened(x, step);
+        (x_next, update(xs, i, x))
+    }((lut.x_min, zero!<FixedPoint<NB_IN, BE_IN>[SIZE]>()));
+    res
+}
+
+fn plus_one<
+    NB_OUT: u32, BE_OUT: s32,
+    NB_IN: u32, BE_IN: s32>(
+        x: FixedPoint<NB_IN, BE_IN>
+    ) -> FixedPoint<NB_OUT, BE_OUT>{
+
+    fixed_point::to_common_type<NB_OUT, BE_OUT>(
+        fixed_point::add(x, fixed_point::from_integer(s2:1))
+    )
+}
+
+#[test]
+fn test_lookup_table(){
+
+    let NB_IN = u32:8;
+    let BE_IN = s32:-3;
+    let NB_OUT = NB_IN + 1;
+    let BE_OUT = BE_IN - 1;
+
+    // xs = [-3,-2,..6]
+    let LOG2_STEP = s32:0;
+    let SIZE = u32:10;
+
+    let x_min = s32:-3;
+    let xs = x_min..(x_min + (SIZE as s32));
+    let ys = (x_min + 1)..(x_min + (SIZE as s32) + 1);
+
+
+    let xs_lut = map(xs, from_integer<NB_IN, BE_IN>);
+    let ys_lut = map(ys, from_integer<NB_OUT, BE_OUT>);
+
+    let lut = create<LOG2_STEP>(
+        from_integer<NB_IN, BE_IN>(x_min),
+        map(ys, from_integer<NB_OUT, BE_OUT>)
+    );
+
+
+    let lut_keys = x_values(lut);
+    let lut_values = lut.values;
+
+    // Check consistency
+    assert_eq(lut_keys, xs_lut);
+    assert_eq(lut_values, eval_1d(lut, lut_keys));
+
+    // TODO check intermediate values
+    // TODO check input outside of lut_keys
+    // Check overflow
+    {
+        let x = fixed_point_util::make_fixed_points_1d<BE_IN>([
+            std::signed_min_value<NB_IN>(),
+            std::signed_max_value<NB_IN>()
+        ]);
+        let expected = [
+            lut_values[0],
+            lut_values[SIZE - 1]
+        ];
+        assert_eq(expected, eval_1d(lut, x));
+    };
+
+    let lut_asym = create<LOG2_STEP>(
+        from_integer<NB_IN, BE_IN>(s32:0),
+        map([0,4,-3,7,12], from_integer<NB_OUT, BE_OUT>) ++ [
+            fixed_point_util::max_value<NB_OUT, BE_OUT>(),
+            fixed_point_util::min_value<NB_OUT, BE_OUT>()
+        ]
+    );
+
+    for (i, _) in std::signed_min_value<NB_IN>()..std::signed_max_value<NB_IN>() {
+        let NEGATE_OVERFLOW = OverflowMode::SAT;
+        let plus_x = fixed_point::make_fixed_point<BE_IN>(i);
+        let minus_x = fixed_point_util::negate_with_overflow<NEGATE_OVERFLOW>(plus_x);
+
+        let plus_f = eval_antisymmetric<NEGATE_OVERFLOW>(lut_asym, plus_x);
+        let minus_f = eval_antisymmetric<NEGATE_OVERFLOW>(lut_asym, minus_x);
+        let minus_minus_f = fixed_point_util::negate_with_overflow<NEGATE_OVERFLOW>(minus_f);
+        let plus_f_sat_sym = fixed_point_util::resize<NB_OUT, BE_OUT, RoundingMode::TRN, OverflowMode::SAT_SYM>(plus_f);
+        assert_eq(plus_f_sat_sym, minus_minus_f);
+    }(())
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/merge.x b/hls4ml/templates/xls/firmware/nnet_utils/merge.x
new file mode 100644
index 0000000000..4cbd22ffc5
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/merge.x
@@ -0,0 +1,429 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// Element-wise sum of two fixed-point vectors of length DIM.
+// Each pair is added with fixed_point::add and the sum is then resized to
+// FixedPoint<OUT_NB, OUT_BE> using the ROUNDING and OVERFLOW modes.
+pub fn add<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    let init = zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>();
+    for (idx, acc) in 0..DIM {
+        let sum = fixed_point::add(x[idx], y[idx]);
+        let sum_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(sum);
+        update(acc, idx, sum_out)
+    }(init)
+}
+
+#[test]
+fn test_add() {
+    // y uses binary exponent -1, so its raw values [2, 4, 6] encode [1, 2, 3].
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s16:2, 4, 6]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:2, 4, 6]);
+    assert_eq(add<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Element-wise difference x - y of two fixed-point vectors of length DIM.
+// Each pair is subtracted with fixed_point::sub and the difference is then
+// resized to FixedPoint<OUT_NB, OUT_BE> using the ROUNDING and OVERFLOW modes.
+pub fn subtract<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    let init = zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>();
+    for (idx, acc) in 0..DIM {
+        let diff = fixed_point::sub(x[idx], y[idx]);
+        let diff_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(diff);
+        update(acc, idx, diff_out)
+    }(init)
+}
+
+#[test]
+fn test_subtract() {
+    // y uses binary exponent -1, so its raw values [2, 4, 6] encode [1, 2, 3].
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:10, 20, 30]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s7:2, 4, 6]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:9, 18, 27]);
+    assert_eq(subtract<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Element-wise product of two fixed-point vectors of length DIM.
+// Each pair is multiplied with fixed_point::mul and the product is then
+// resized to FixedPoint<OUT_NB, OUT_BE> using the ROUNDING and OVERFLOW modes.
+pub fn multiply<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    let init = zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>();
+    for (idx, acc) in 0..DIM {
+        let product = fixed_point::mul(x[idx], y[idx]);
+        let product_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(product);
+        update(acc, idx, product_out)
+    }(init)
+}
+
+#[test]
+fn test_multiply() {
+    // y uses binary exponent -1, so its raw values [4, 4, 4] encode [2, 2, 2].
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:2, 3, 4]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s7:4, 4, 4]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:4, 6, 8]);
+    assert_eq(multiply<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Element-wise maximum of two fixed-point vectors of length DIM.
+// x[i] and y[i] may have different binary exponents, so the choice is made from
+// the sign bit of x[i] - y[i]; the chosen operand is resized to the output type.
+pub fn maximum<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    for (idx, acc) in 0..DIM {
+        let delta = fixed_point::sub(x[idx], y[idx]);
+        let x_not_less = std::msb(delta.significand) == u1:0;
+        let picked = if x_not_less {
+            fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x[idx])
+        } else {
+            fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y[idx])
+        };
+        update(acc, idx, picked)
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>())
+}
+
+#[test]
+fn test_maximum() {
+    // y uses binary exponent -1, so its raw values [10, 7, 18] encode [5, 3.5, 9].
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:5, 10, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s7:10, 7, 18]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:5, 10, 9]);
+    assert_eq(maximum<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Element-wise minimum of two fixed-point vectors of length DIM.
+// x[i] and y[i] may have different binary exponents, so the choice is made from
+// the sign bit of x[i] - y[i]; the chosen operand is resized to the output type.
+pub fn minimum<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    for (idx, acc) in 0..DIM {
+        let delta = fixed_point::sub(x[idx], y[idx]);
+        let x_is_less = std::msb(delta.significand) == u1:1;
+        let picked = if x_is_less {
+            fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x[idx])
+        } else {
+            fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y[idx])
+        };
+        update(acc, idx, picked)
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>())
+}
+
+#[test]
+fn test_minimum() {
+    // y encodes [5, 3.5, 9]; 3.5 is expected as 3 once resized to binary exponent 0.
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:5, 10, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s7:10, 7, 18]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:5, 3, 3]);
+    assert_eq(minimum<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Element-wise mean (x[i] + y[i]) / 2 of two fixed-point vectors of length DIM.
+// The sum from fixed_point::add is multiplied by the constant 0.5 and the
+// product is resized to FixedPoint<OUT_NB, OUT_BE> with ROUNDING and OVERFLOW.
+pub fn average<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    // 0.5 as a fixed-point constant: significand 1 with binary exponent -1.
+    let half = fixed_point::make_fixed_point<-1>(s2:1);
+    for (idx, acc) in 0..DIM {
+        let total = fixed_point::add(x[idx], y[idx]);
+        let mean = fixed_point::mul(total, half);
+        let mean_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(mean);
+        update(acc, idx, mean_out)
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[DIM]>())
+}
+
+#[test]
+fn test_average() {
+    // y uses binary exponent -1, so its raw values [2, 4, 10] encode [1, 2, 5].
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s16:2, 4, 10]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 4]);
+    assert_eq(average<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Dot product of two fixed-point vectors of length DIM.
+// The value from fixed_point_util::dot_prod is resized to
+// FixedPoint<OUT_NB, OUT_BE> and returned as a one-element array.
+pub fn dot<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    DIM: u32
+>(
+    x: FixedPoint<X_NB, X_BE>[DIM],
+    y: FixedPoint<Y_NB, Y_BE>[DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[1] {
+    let product = fixed_point_util::dot_prod(x,y);
+    let product_out = fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(product);
+    [product_out]
+}
+
+#[test]
+fn test_dot() {
+    // y encodes [1, 2, 5] (binary exponent -1): 1*1 + 2*2 + 3*5 = 20.
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s16:2, 4, 10]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:20]);
+    assert_eq(dot<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+// Concatenates two fixed-point vectors: the X_DIM elements of x are followed
+// by the Y_DIM elements of y. Both inputs are resized to
+// FixedPoint<OUT_NB, OUT_BE> with ROUNDING and OVERFLOW before being joined.
+pub fn concatenate1d<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    X_DIM: u32, Y_DIM: u32,
+    OUT_DIM: u32 = {X_DIM + Y_DIM}
+>(
+    x: FixedPoint<X_NB, X_BE>[X_DIM],
+    y: FixedPoint<Y_NB, Y_BE>[Y_DIM]
+) -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM] {
+    // The inputs may differ in precision, so each is converted before `++`.
+    let head = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    let tail = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y);
+    head ++ tail
+}
+
+#[test]
+fn test_concatenate1d() {
+    // y (binary exponent -1) encodes [1, 2, 5] and is appended after x.
+    let lhs = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 3]);
+    let rhs = fixed_point_util::make_fixed_points_1d<-1>([s16:2, 4, 10]);
+    let want = fixed_point_util::make_fixed_points_1d<0>([s8:1,2,3,1,2,5]);
+    assert_eq(concatenate1d<8, 0, RoundingMode::TRN, OverflowMode::WRAP>(lhs, rhs), want);
+}
+
+
+// Shape of concatenate(x, y) along `axis`.
+// The extents of x and y are added on `axis`; every other extent is copied from
+// x_shape. Neither `axis < N` nor equality of the remaining extents is checked
+// here.
+fn concatenate_out_shape<N: u32>(
+    axis: u32,
+    x_shape: u32[N],
+    y_shape: u32[N]
+) -> u32[N] {
+    for (dim, shape) in 0..N {
+        let extent = if dim != axis {
+            x_shape[dim]
+        } else {
+            x_shape[dim] + y_shape[dim]
+        };
+        update(shape, dim, extent)
+    }(x_shape)
+}
+
+// Concatenates two 2-D fixed-point arrays along AXIS (0 or 1).
+// Both inputs are resized to FixedPoint<OUT_NB, OUT_BE> first. The extents on
+// the axis that is not concatenated must match; this is checked at compile
+// time because a mismatch would otherwise index y_out out of range.
+pub fn concatenate2d<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    AXIS: u32,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    X_DIM_0: u32, X_DIM_1: u32,
+    Y_DIM_0: u32, Y_DIM_1: u32,
+    OUT_DIM_0: u32 = {concatenate_out_shape(AXIS, [X_DIM_0, X_DIM_1], [Y_DIM_0, Y_DIM_1])[0]},
+    OUT_DIM_1: u32 = {concatenate_out_shape(AXIS, [X_DIM_0, X_DIM_1], [Y_DIM_0, Y_DIM_1])[1]},
+>(
+    x: FixedPoint<X_NB, X_BE>[X_DIM_1][X_DIM_0],
+    y: FixedPoint<Y_NB, Y_BE>[Y_DIM_1][Y_DIM_0]
+) -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(AXIS < 2);
+    // Every axis other than AXIS must have the same extent in x and y.
+    const_assert!(AXIS == u32:0 || X_DIM_0 == Y_DIM_0);
+    const_assert!(AXIS == u32:1 || X_DIM_1 == Y_DIM_1);
+    let x_out = fixed_point_util::resize_2d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    let y_out = fixed_point_util::resize_2d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y);
+    // Fill the output element by element, picking the source array by index.
+    let res = zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0]>();
+    for (i, res) in 0..OUT_DIM_0 {
+        for (j, res) in 0..OUT_DIM_1 {
+            let value = match AXIS {
+                u32:0 => {
+                    // Indices i >= X_DIM_0 read from y, shifted back by X_DIM_0.
+                    if i < X_DIM_0 { x_out[i][j] } else { y_out[i - X_DIM_0][j] }
+                },
+                u32:1 => {
+                    // Indices j >= X_DIM_1 read from y, shifted back by X_DIM_1.
+                    if j < X_DIM_1 { x_out[i][j] } else { y_out[i][j - X_DIM_1] }
+                },
+                // Not reachable when the AXIS < 2 assertion above holds.
+                _ => fail!("concatenate2d_axis", res[0][0])
+            };
+            update(res, (i, j), value)
+        }(res)
+    }(res)
+}
+
+#[test]
+fn test_concatenate2d() {
+    // Both inputs have shape [1][3]; y (binary exponent -1) encodes [1, 2, 5].
+    let lhs = fixed_point_util::make_fixed_points_2d<0>([[s8:1, 2, 3]]);
+    let rhs = fixed_point_util::make_fixed_points_2d<-1>([[s16:2, 4, 10]]);
+    // AXIS = 0: y becomes a second row below x.
+    let want_axis_0 = fixed_point_util::make_fixed_points_2d<0>([[s8:1,2,3],[s8:1,2,5]]);
+    assert_eq(concatenate2d<8, 0, RoundingMode::TRN, OverflowMode::WRAP, 0>(lhs, rhs), want_axis_0);
+
+    // AXIS = 1: y extends the single row of x.
+    let want_axis_1 = fixed_point_util::make_fixed_points_2d<0>([[s8:1,2,3,1,2,5]]);
+    assert_eq(concatenate2d<8, 0, RoundingMode::TRN, OverflowMode::WRAP, 1>(lhs, rhs), want_axis_1);
+}
+
+// Concatenates two 3-D fixed-point arrays along AXIS (0, 1 or 2).
+// Both inputs are resized to FixedPoint<OUT_NB, OUT_BE> first. The extents on
+// the two axes that are not concatenated must match; this is checked at compile
+// time because a mismatch would otherwise index y_out out of range.
+pub fn concatenate3d<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode,
+    OVERFLOW: OverflowMode,
+    AXIS: u32,
+    X_NB: u32, X_BE: s32,
+    Y_NB: u32, Y_BE: s32,
+    X_DIM_0: u32, X_DIM_1: u32, X_DIM_2: u32,
+    Y_DIM_0: u32, Y_DIM_1: u32, Y_DIM_2: u32,
+    OUT_DIM_0: u32 = {concatenate_out_shape(AXIS, [X_DIM_0, X_DIM_1, X_DIM_2], [Y_DIM_0, Y_DIM_1, Y_DIM_2])[0]},
+    OUT_DIM_1: u32 = {concatenate_out_shape(AXIS, [X_DIM_0, X_DIM_1, X_DIM_2], [Y_DIM_0, Y_DIM_1, Y_DIM_2])[1]},
+    OUT_DIM_2: u32 = {concatenate_out_shape(AXIS, [X_DIM_0, X_DIM_1, X_DIM_2], [Y_DIM_0, Y_DIM_1, Y_DIM_2])[2]},
+>(
+    x: FixedPoint<X_NB, X_BE>[X_DIM_2][X_DIM_1][X_DIM_0],
+    y: FixedPoint<Y_NB, Y_BE>[Y_DIM_2][Y_DIM_1][Y_DIM_0]
+) -> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(AXIS < 3);
+    // Every axis other than AXIS must have the same extent in x and y.
+    const_assert!(AXIS == u32:0 || X_DIM_0 == Y_DIM_0);
+    const_assert!(AXIS == u32:1 || X_DIM_1 == Y_DIM_1);
+    const_assert!(AXIS == u32:2 || X_DIM_2 == Y_DIM_2);
+    let x_out = fixed_point_util::resize_3d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    let y_out = fixed_point_util::resize_3d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(y);
+    // Fill the output element by element, picking the source array by index.
+    let res = zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>();
+    for (i, res) in 0..OUT_DIM_0 {
+        for (j, res) in 0..OUT_DIM_1 {
+            for (k, res) in 0..OUT_DIM_2 {
+                let value = match AXIS {
+                    u32:0 => {
+                        // Indices i >= X_DIM_0 read from y, shifted back by X_DIM_0.
+                        if i < X_DIM_0 { x_out[i][j][k] }
+                        else { y_out[i - X_DIM_0][j][k] }
+                    },
+                    u32:1 => {
+                        // Indices j >= X_DIM_1 read from y, shifted back by X_DIM_1.
+                        if j < X_DIM_1 { x_out[i][j][k] }
+                        else { y_out[i][j - X_DIM_1][k] }
+                    },
+                    u32:2 => {
+                        // Indices k >= X_DIM_2 read from y, shifted back by X_DIM_2.
+                        if k < X_DIM_2 { x_out[i][j][k] }
+                        else { y_out[i][j][k - X_DIM_2] }
+                    },
+                    // Not reachable when the AXIS < 3 assertion above holds.
+                    _ => fail!("concatenate3d_axis", res[0][0][0])
+                };
+                update(res, (i, j, k), value)
+            }(res)
+        }(res)
+    }(res)
+}
+
+#[test]
+fn test_concatenate3d() {
+    // x and y are both reshaped to [1][2][3]. With binary exponent -1 the raw
+    // values of y encode [10, 20, 30, 40, 50, 60], which is what the expected
+    // arrays below contain.
+    let x_flat = fixed_point_util::make_fixed_points_1d<0>([s8:1, 2, 3, 4, 5, 6]);
+    let y_flat = fixed_point_util::make_fixed_points_1d<-1>([s8:20, 40, 60, 80, 100, 120]);
+    let lhs = fixed_point_util::reshape_to_3d<1,2,3>(x_flat);
+    let rhs = fixed_point_util::reshape_to_3d<1,2,3>(y_flat);
+
+    // AXIS = 0: expected shape <2,2,3>, all of x followed by all of y.
+    let want_axis_0 = fixed_point_util::reshape_to_3d<2,2,3>(
+        fixed_point_util::make_fixed_points_1d<0>([
+            s8:1,2,3,4,5,6,10,20,30,40,50,60
+        ])
+    );
+    let got_axis_0 = concatenate3d<8, 0, RoundingMode::TRN, OverflowMode::WRAP, 0>(lhs, rhs);
+    assert_eq(got_axis_0, want_axis_0);
+
+    // AXIS = 1: expected shape <1,4,3>, the rows of y follow the rows of x.
+    let want_axis_1 = fixed_point_util::reshape_to_3d<1,4,3>(
+        fixed_point_util::make_fixed_points_1d<0>([
+            s8:1,2,3,4,5,6,10,20,30,40,50,60
+        ])
+    );
+    let got_axis_1 = concatenate3d<8, 0, RoundingMode::TRN, OverflowMode::WRAP, 1>(lhs, rhs);
+    assert_eq(got_axis_1, want_axis_1);
+
+    // AXIS = 2: expected shape <1,2,6>, each row of x is extended by a row of y.
+    let want_axis_2 = fixed_point_util::reshape_to_3d<1,2,6>(
+        fixed_point_util::make_fixed_points_1d<0>([
+            s8:1,2,3,10,20,30,4,5,6,40,50,60
+        ])
+    );
+    let got_axis_2 = concatenate3d<8, 0, RoundingMode::TRN, OverflowMode::WRAP, 2>(lhs, rhs);
+    assert_eq(got_axis_2, want_axis_2);
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/pooling.x b/hls4ml/templates/xls/firmware/nnet_utils/pooling.x
new file mode 100644
index 0000000000..2208acc7be
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/pooling.x
@@ -0,0 +1,612 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+import nnet_utils.activations;
+import nnet_utils.data_format;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+type DataFormat = data_format::DataFormat;
+
+pub enum PoolingOperation: u1 {  // Reduction applied to each pooling window (POOLING_OP parametric).
+    MAX = 0,  // window result is the running fixed_point_util::max of its input elements
+    AVERAGE = 1  // window result is the sum of its input elements divided by the element count
+}
+
+
+pub fn pooling_1d  // Sliding-window MAX / AVERAGE pooling along the size axis, one channel at a time.
+    <OUT_NB: u32, OUT_BE: s32,  // precision of the output elements
+    ROUNDING: RoundingMode,  // forwarded to the final fixed_point_util::resize
+    OVERFLOW: OverflowMode,  // forwarded to the final fixed_point_util::resize
+    POOLING_OP: PoolingOperation,
+    // Pool: window length, step between windows, padding on either side
+    POOL_SIZE: u32,
+    STRIDE: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,  // PAD_RIGHT only enters the OUT_SIZE formula
+    COUNT_PAD: bool,  // whether padding positions are counted in num_elements (the AVERAGE divisor)
+    DATA_FORMAT: DataFormat,
+    // Input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32,
+    // Derived input dims
+    IN_SIZE: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[0]},
+    IN_CHANNELS: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[1]},
+    // Output size
+    OUT_SIZE: u32 = {((IN_SIZE + PAD_LEFT + PAD_RIGHT - POOL_SIZE) / STRIDE) + 1},
+    // Output dims
+    OUT_DIM_0: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[1]},
+    // Precision for max_or_sum accumulator (AVERAGE gets clog2(POOL_SIZE) extra bits for the sum)
+    ACC_NB: u32 = {match POOLING_OP {
+        PoolingOperation::MAX => IN_NB,
+        PoolingOperation::AVERAGE => IN_NB + std::clog2(POOL_SIZE)
+    }},
+    ACC_BE: s32 = {IN_BE},
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    // Each output element reduces the POOL_SIZE input positions starting at out_pos * STRIDE - PAD_LEFT.
+    for (out_i_0, out_2d) in 0..OUT_DIM_0 {
+        let out_1d = for (out_i_1, out_1d) in 0..OUT_DIM_1 {
+            // Element 0 of the to_size_chans result is used as the position, element 1 as the channel.
+            let ij = data_format::to_size_chans(out_i_0, out_i_1, DATA_FORMAT);
+            let out_pos = ij[0];
+            let ch_idx = ij[1];
+            // Signed: the window start is negative when it lies inside the left padding.
+            let in_pos: s32 = ((out_pos as s32) * (STRIDE as s32)) - (PAD_LEFT as s32);
+
+            // Initial value: min_value for the MAX accumulator, zero for the AVERAGE sum
+            let max_or_sum: FixedPoint<ACC_NB, ACC_BE> = match POOLING_OP {
+                PoolingOperation::MAX => fixed_point_util::min_value<ACC_NB, ACC_BE>(),
+                PoolingOperation::AVERAGE => zero!<FixedPoint<ACC_NB, ACC_BE>>()
+            };
+            let (max_or_sum, num_elements) = for (k, (max_or_sum, num_elements)) in 0..POOL_SIZE {
+                let ii = in_pos + (k as s32);
+
+                if ii < s32:0 || ii >= IN_SIZE as s32 {  // position falls outside the input, i.e. in the padding
+                    if COUNT_PAD {
+                        (max_or_sum, num_elements + u32:1)
+                    } else {
+                        // Padding elements are ignored
+                        (max_or_sum, num_elements)
+                    }
+                } else {
+                    let ii = ii as u32;
+                    let val = match DATA_FORMAT {
+                        DataFormat::CHANNELS_LAST  => x[ii][ch_idx],
+                        DataFormat::CHANNELS_FIRST => x[ch_idx][ii]
+                    };
+                    let max_or_sum = match POOLING_OP {
+                        PoolingOperation::MAX => {
+                            // val and acc have the same precision in this case,
+                            // widening is needed only to prevent compilation error.
+                            assert_fmt!(ACC_NB == IN_NB, "max_pooling_op_width");
+                            const_assert!(ACC_BE == IN_BE);
+                            let val_widened = fixed_point::make_fixed_point<ACC_BE>(val.significand as sN[ACC_NB]);
+                            fixed_point_util::max(max_or_sum, val_widened)
+                        },
+                        PoolingOperation::AVERAGE => fixed_point_util::add_already_widened(val, max_or_sum)
+                    };
+                    (max_or_sum, num_elements + u32:1)
+                }
+            }((max_or_sum, u32:0));
+
+            // TODO is it valid case?
+            // assert_fmt!(num_elements > 0, "pooling_1d_zero_elements");
+            // NOTE(review): if num_elements is 0 here, the AVERAGE branch below divides by zero.
+            let pool_result = match POOLING_OP {
+                PoolingOperation::MAX => fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(
+                    max_or_sum
+                ),
+                PoolingOperation::AVERAGE =>{
+                    let avg_significand = max_or_sum.significand / (num_elements as sN[ACC_NB]);  // division on the raw significand, before the resize
+                    let avg = fixed_point::make_fixed_point<ACC_BE>(avg_significand);
+                    fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(
+                        avg
+                    )
+                }
+            };
+            update(out_1d, out_i_1, pool_result)
+        }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1]>());
+
+        update(out_2d, out_i_0, out_1d)
+
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0]>())
+}
+
+pub fn pooling_2d  // Sliding-window MAX / AVERAGE pooling over height and width, one channel at a time.
+    <OUT_NB: u32, OUT_BE: s32,  // precision of the output elements
+    ROUNDING: RoundingMode,  // forwarded to the final fixed_point_util::resize
+    OVERFLOW: OverflowMode,  // forwarded to the final fixed_point_util::resize
+    POOLING_OP: PoolingOperation,
+    // Pool: window extents, steps between windows, padding on each side
+    POOL_HEIGHT: u32, POOL_WIDTH: u32,
+    STRIDE_HEIGHT: u32, STRIDE_WIDTH: u32,
+    PAD_TOP: u32, PAD_BOTTOM: u32,  // PAD_BOTTOM only enters the OUT_HEIGHT formula
+    PAD_LEFT: u32, PAD_RIGHT: u32,  // PAD_RIGHT only enters the OUT_WIDTH formula
+    COUNT_PAD: bool,  // whether padding positions are counted in num_elements (the AVERAGE divisor)
+    DATA_FORMAT: DataFormat,
+    // Input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,
+    // Derived input dims
+    IN_HEIGHT: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[0]},
+    IN_WIDTH: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[1]},
+    IN_CHANNELS: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[2]},
+    // Output size
+    OUT_HEIGHT: u32 = {((IN_HEIGHT + PAD_TOP + PAD_BOTTOM - POOL_HEIGHT) / STRIDE_HEIGHT) + 1},
+    OUT_WIDTH: u32 = {((IN_WIDTH + PAD_LEFT + PAD_RIGHT - POOL_WIDTH) / STRIDE_WIDTH) + 1},
+    // Output dims
+    OUT_DIM_0: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[1]},
+    OUT_DIM_2: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[2]},
+    // Precision for max_or_sum accumulator (AVERAGE gets clog2(POOL_HEIGHT * POOL_WIDTH) extra bits for the sum)
+    ACC_NB: u32 = {match POOLING_OP {
+        PoolingOperation::MAX => IN_NB,
+        PoolingOperation::AVERAGE => IN_NB + std::clog2(POOL_HEIGHT * POOL_WIDTH)
+    }},
+    ACC_BE: s32 = {IN_BE},
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    // Each output element reduces a POOL_HEIGHT x POOL_WIDTH window whose corner is (in_i, in_j).
+    for (out_i_0, out_3d) in 0..OUT_DIM_0 {
+        let out_2d = for (out_i_1, out_2d) in 0..OUT_DIM_1 {
+            let out_1d = for (out_i_2, out_1d) in 0..OUT_DIM_2 {
+                // Elements 0, 1, 2 of the to_height_width_chans result are used as row, column, channel.
+                let ijc = data_format::to_height_width_chans(out_i_0, out_i_1, out_i_2, DATA_FORMAT);
+                let out_i = ijc[0];
+                let out_j = ijc[1];
+                let ch_idx = ijc[2];
+                // Signed: the window corner is negative when it lies inside the top / left padding.
+                let in_i: s32 = ((out_i as s32) * (STRIDE_HEIGHT as s32)) - (PAD_TOP as s32) ;
+                let in_j: s32 = ((out_j as s32) * (STRIDE_WIDTH as s32)) - (PAD_LEFT as s32);
+
+                // Initial value: min_value for the MAX accumulator, zero for the AVERAGE sum
+                let max_or_sum: FixedPoint<ACC_NB, ACC_BE> = match POOLING_OP {
+                    PoolingOperation::MAX => fixed_point_util::min_value<ACC_NB, ACC_BE>(),
+                    PoolingOperation::AVERAGE => zero!<FixedPoint<ACC_NB, ACC_BE>>()
+                };
+                let (max_or_sum, num_elements) = for (di, (max_or_sum, num_elements)) in 0..POOL_HEIGHT {
+                    for (dj, (max_or_sum, num_elements)) in 0..POOL_WIDTH {
+                        let ii = in_i + (di as s32);
+                        let jj = in_j + (dj as s32);
+
+                        if ii < s32:0 || ii >= IN_HEIGHT as s32 || jj < s32:0 || jj >= IN_WIDTH as s32 {  // position falls in the padding
+                            if COUNT_PAD {
+                                (max_or_sum, num_elements + u32:1)
+                            } else {
+                                // Padding elements are ignored
+                                (max_or_sum, num_elements)
+                            }
+                        } else {
+                            let ii = ii as u32;
+                            let jj = jj as u32;
+                            let val = match DATA_FORMAT {
+                                DataFormat::CHANNELS_LAST  => x[ii][jj][ch_idx],
+                                DataFormat::CHANNELS_FIRST => x[ch_idx][ii][jj]
+                            };
+                            let max_or_sum = match POOLING_OP {
+                                PoolingOperation::MAX => {
+                                    // val and acc have the same precision in this case,
+                                    // widening is needed only to prevent compilation error.
+                                    assert_fmt!(ACC_NB == IN_NB, "max_pooling_op_width");
+                                    const_assert!(ACC_BE == IN_BE);
+                                    let val_widened = fixed_point::make_fixed_point<ACC_BE>(val.significand as sN[ACC_NB]);
+                                    fixed_point_util::max(max_or_sum, val_widened)
+                                },
+                                PoolingOperation::AVERAGE => fixed_point_util::add_already_widened(val, max_or_sum)
+                            };
+                            (max_or_sum, num_elements + u32:1)
+                        }
+                    }((max_or_sum, num_elements))
+                }((max_or_sum, u32:0));
+
+                // TODO is it valid case?
+                // assert_fmt!(num_elements > 0, "pooling2d_zero_elements");
+                let pool_result = match POOLING_OP {
+                    PoolingOperation::MAX => fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(
+                        max_or_sum
+                    ),
+                    PoolingOperation::AVERAGE =>{
+                        let avg_significand = max_or_sum.significand / (num_elements as sN[ACC_NB]);  // NOTE(review): divides by zero if num_elements is 0
+                        let avg = fixed_point::make_fixed_point<ACC_BE>(avg_significand);
+                        fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(
+                            avg
+                        )
+                    }
+                };
+                update(out_1d, out_i_2, pool_result)
+            }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2]>());
+
+            update(out_2d, out_i_1, out_1d)
+        }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1]>());
+
+        update(out_3d, out_i_0, out_2d)
+    }(zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>())
+}
+
+// Global 1-D pooling: calls pooling_1d with a single window that covers the
+// whole size axis and flattens the result to a vector of IN_CHANNELS elements.
+pub fn global_pooling_1d<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    POOLING_OP: PoolingOperation,
+    DATA_FORMAT: DataFormat,
+    // Precision and array extents of the input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32,
+    // Size and channel count obtained from the input extents
+    IN_SIZE: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[0]},
+    IN_CHANNELS: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[1]},
+    // The pool is as large as the input: unit stride, no padding
+    POOL_SIZE: u32 = {IN_SIZE},
+    STRIDE: u32 = {1},
+    PAD_LEFT: u32 = {0}, PAD_RIGHT: u32 = {0},
+    COUNT_PAD: bool = {false},
+>(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0]) -> FixedPoint<OUT_NB, OUT_BE>[IN_CHANNELS] {
+    fixed_point_util::flatten_2d(
+        pooling_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW, POOLING_OP, POOL_SIZE, STRIDE, PAD_LEFT, PAD_RIGHT, COUNT_PAD, DATA_FORMAT>(x)
+    )
+}
+
+// Global 2-D pooling: calls pooling_2d with a single window that covers the
+// whole height and width and flattens the result to IN_CHANNELS elements.
+pub fn global_pooling_2d<
+    OUT_NB: u32, OUT_BE: s32,
+    ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    POOLING_OP: PoolingOperation,
+    DATA_FORMAT: DataFormat,
+    // Precision and array extents of the input
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,
+    // Height, width and channel count obtained from the input extents
+    IN_HEIGHT: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[0]},
+    IN_WIDTH: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[1]},
+    IN_CHANNELS: u32 = {data_format::to_height_width_chans(IN_DIM_0, IN_DIM_1, IN_DIM_2, DATA_FORMAT)[2]},
+    // The pool is as large as the input: unit strides, no padding
+    POOL_HEIGHT: u32 = {IN_HEIGHT},
+    POOL_WIDTH: u32 = {IN_WIDTH},
+    STRIDE_HEIGHT: u32 = {1},
+    STRIDE_WIDTH: u32 = {1},
+    PAD_TOP: u32 = {0}, PAD_BOTTOM: u32 = {0},
+    PAD_LEFT: u32 = {0}, PAD_RIGHT: u32 = {0},
+    COUNT_PAD: bool = {false},
+>(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0]) -> FixedPoint<OUT_NB, OUT_BE>[IN_CHANNELS] {
+    fixed_point_util::flatten_3d(
+        pooling_2d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW, POOLING_OP, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, COUNT_PAD, DATA_FORMAT>(x)
+    )
+}
+
+// Testing
+
+// Test constant input for 1D and 2D
+
+// Test helper: runs an input built from one constant through pooling_1d,
+// pooling_2d and both global pooling variants, and compares the results with
+// arrays built from the same constant by fixed_point_util::const_array_*.
+// The 1-D case reuses the HEIGHT / TOP / BOTTOM parameters.
+fn test_pooling_const_case<
+    POOLING_OP: PoolingOperation,
+    DATA_FORMAT: DataFormat,
+    COUNT_PAD: bool,
+    IN_HEIGHT: u32,
+    IN_WIDTH: u32,
+    IN_CHANNELS: u32,
+    POOL_HEIGHT: u32,
+    POOL_WIDTH: u32,
+    STRIDE_HEIGHT: u32,
+    STRIDE_WIDTH: u32,
+    PAD_TOP: u32, PAD_BOTTOM: u32,
+    PAD_LEFT: u32, PAD_RIGHT: u32,
+    // Precision of the constant, shared by inputs and outputs
+    NB: u32, BE: s32,
+    // Array extents of the pooling_2d input
+    IN_2D_DIM_0: u32 = {data_format::from_height_width_chans(IN_HEIGHT, IN_WIDTH, IN_CHANNELS, DATA_FORMAT)[0]},
+    IN_2D_DIM_1: u32 = {data_format::from_height_width_chans(IN_HEIGHT, IN_WIDTH, IN_CHANNELS, DATA_FORMAT)[1]},
+    IN_2D_DIM_2: u32 = {data_format::from_height_width_chans(IN_HEIGHT, IN_WIDTH, IN_CHANNELS, DATA_FORMAT)[2]},
+    // Array extents of the pooling_1d input
+    IN_SIZE: u32 = {IN_HEIGHT},
+    IN_1D_DIM_0: u32 = {data_format::from_size_chans(IN_SIZE, IN_CHANNELS, DATA_FORMAT)[0]},
+    IN_1D_DIM_1: u32 = {data_format::from_size_chans(IN_SIZE, IN_CHANNELS, DATA_FORMAT)[1]},
+    // Array extents of the pooling_2d output
+    OUT_HEIGHT: u32 = {((IN_HEIGHT + PAD_TOP + PAD_BOTTOM - POOL_HEIGHT) / STRIDE_HEIGHT) + 1},
+    OUT_WIDTH: u32 = {((IN_WIDTH + PAD_LEFT + PAD_RIGHT - POOL_WIDTH) / STRIDE_WIDTH) + 1},
+    OUT_2D_DIM_0: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[0]},
+    OUT_2D_DIM_1: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[1]},
+    OUT_2D_DIM_2: u32 = {data_format::from_height_width_chans(OUT_HEIGHT, OUT_WIDTH, IN_CHANNELS, DATA_FORMAT)[2]},
+    // Array extents of the pooling_1d output
+    OUT_SIZE: u32 = {OUT_HEIGHT},
+    OUT_1D_DIM_0: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[0]},
+    OUT_1D_DIM_1: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[1]},
+>(value: FixedPoint<NB, BE>) {
+    let ROUND = RoundingMode::TRN;
+    let OVFL = OverflowMode::WRAP;
+
+    // 1-D pooling: POOL_HEIGHT, STRIDE_HEIGHT, PAD_TOP and PAD_BOTTOM act as
+    // pool size, stride, left padding and right padding.
+    let in_1d = fixed_point_util::const_array_2d<IN_1D_DIM_0, IN_1D_DIM_1>(value);
+    let want_1d = fixed_point_util::const_array_2d<OUT_1D_DIM_0, OUT_1D_DIM_1>(value);
+    let got_1d = pooling_1d<
+        NB, BE, ROUND, OVFL, POOLING_OP,
+        POOL_HEIGHT, STRIDE_HEIGHT, PAD_TOP, PAD_BOTTOM, COUNT_PAD, DATA_FORMAT
+    >(in_1d);
+
+    // 2-D pooling
+    let in_2d = fixed_point_util::const_array_3d<IN_2D_DIM_0, IN_2D_DIM_1, IN_2D_DIM_2>(value);
+    let want_2d = fixed_point_util::const_array_3d<OUT_2D_DIM_0, OUT_2D_DIM_1, OUT_2D_DIM_2>(value);
+    let got_2d = pooling_2d<
+        NB, BE, ROUND, OVFL, POOLING_OP,
+        POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH,
+        PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, COUNT_PAD, DATA_FORMAT
+    >(in_2d);
+
+    // AVERAGE with counted padding divides by more elements than were summed,
+    // so the comparison is only made in the cases listed below.
+    let no_padding = (PAD_TOP + PAD_BOTTOM + PAD_LEFT + PAD_RIGHT == 0);
+    let is_zero = value == zero!<FixedPoint<NB, BE>>();
+    let output_is_constant =
+        !COUNT_PAD || POOLING_OP == PoolingOperation::MAX || no_padding || is_zero;
+    if output_is_constant {
+        assert_eq(want_1d, got_1d);
+        assert_eq(want_2d, got_2d);
+    } else {
+        // TODO check element values instead of skipping the test
+        trace_fmt!("test_pooling_const_case: skip because output array will not be constant: COUNT_PAD={}, POOLING_OP={}, PAD_TOP={}, PAD_BOTTOM={}, PAD_LEFT={}, PAD_RIGHT={}, value={}",
+            COUNT_PAD, POOLING_OP, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, value);
+    };
+
+    // Global pooling takes no padding parameters here, so it is checked for
+    // every combination: one value per channel is expected.
+    let want_global = fixed_point_util::const_array_1d<IN_CHANNELS>(value);
+    let got_global_1d = global_pooling_1d<NB, BE, ROUND, OVFL, POOLING_OP, DATA_FORMAT>(in_1d);
+    let got_global_2d = global_pooling_2d<NB, BE, ROUND, OVFL, POOLING_OP, DATA_FORMAT>(in_2d);
+    assert_eq(want_global, got_global_1d);
+    assert_eq(want_global, got_global_2d);
+}
+
+fn test_pooling_const_cases<  // Runs test_pooling_const_case on `value` for every POOLING_OP x DATA_FORMAT x COUNT_PAD combination (8 calls).
+    IN_HEIGHT: u32,
+    IN_WIDTH: u32,
+    IN_CHANNELS: u32,
+    POOL_HEIGHT: u32,
+    POOL_WIDTH: u32,
+    STRIDE_HEIGHT: u32,
+    STRIDE_WIDTH: u32,
+    PAD_TOP: u32,
+    PAD_BOTTOM: u32,
+    PAD_LEFT: u32,
+    PAD_RIGHT: u32,
+    // Element format of `value`; test_pooling_const does not pass these explicitly, they appear in the argument type.
+    NB: u32, BE: s32,
+>(value: FixedPoint<NB, BE>) {
+    let POOLING_OP = PoolingOperation::MAX;  // MAX pooling, both layouts
+    let DATA_FORMAT = DataFormat::CHANNELS_LAST;
+    let COUNT_PAD = false;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+    let COUNT_PAD = true;  // re-binding shadows the previous value; the next call sees the new one
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+
+    let DATA_FORMAT = DataFormat::CHANNELS_FIRST;
+    let COUNT_PAD = false;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+    let COUNT_PAD = true;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+
+    let POOLING_OP = PoolingOperation::AVERAGE;  // AVERAGE pooling, both layouts
+    let DATA_FORMAT = DataFormat::CHANNELS_LAST;
+    let COUNT_PAD = false;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+    let COUNT_PAD = true;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+
+    let DATA_FORMAT = DataFormat::CHANNELS_FIRST;
+    let COUNT_PAD = false;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+    let COUNT_PAD = true;
+    test_pooling_const_case<POOLING_OP, DATA_FORMAT, COUNT_PAD, IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>(value);
+}
+
+#[test]
+fn test_pooling_const() {  // Drives test_pooling_const_cases with one fixed geometry and four constant input values.
+    let IN_HEIGHT = u32:5;
+    let IN_WIDTH = u32:6;
+    let IN_CHANNELS = u32:2;
+    let POOL_HEIGHT = u32:3;
+    let POOL_WIDTH = u32:2;
+    let STRIDE_HEIGHT = u32:2;
+    let STRIDE_WIDTH = u32:2;
+    let PAD_TOP = u32:1;  // one element of padding on each of the four sides
+    let PAD_BOTTOM = u32:1;
+    let PAD_LEFT = u32:1;
+    let PAD_RIGHT = u32:1;
+    let zero = fixed_point::make_fixed_point<-10>(s16:0);  // raw s16 value paired with exponent -10
+    let one = fixed_point::make_fixed_point<-10>(s16:1024);  // 1024 * 2^-10 == 1, assuming the parameter is a binary exponent
+    let min_value = fixed_point_util::min_value<16, -10>();  // presumably the extremes of the 16-bit / exponent -10 format
+    let max_value = fixed_point_util::max_value<16, -10>();
+    map(  // called for the assertions inside test_pooling_const_cases; the mapped result is discarded
+        [zero, one, min_value, max_value],
+        test_pooling_const_cases<IN_HEIGHT, IN_WIDTH, IN_CHANNELS, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT>
+    );
+}
+
+// Test pooling_1d with non-constant input and simple parameters
+
+// TODO inline and remove this function, use pooling_1d with explicit named parameters instead
+pub fn pooling_1d_default  // Wrapper around pooling_1d that fills in the defaults below; test_pooling_1d passes only POOLING_OP and DATA_FORMAT.
+    <POOLING_OP: PoolingOperation,
+    DATA_FORMAT: DataFormat,
+    // Input element format and the two array dimensions (all appear in the type of x)
+    IN_NB: u32, IN_BE: s32,
+    IN_DIM_0: u32, IN_DIM_1: u32,
+    // Defaults
+    OUT_NB: u32 = {IN_NB}, OUT_BE: s32 = {IN_BE},  // output format defaults to the input format
+    ROUNDING: RoundingMode = {RoundingMode::TRN},
+    OVERFLOW: OverflowMode = {OverflowMode::WRAP},
+    POOL_SIZE: u32 = {u32:3},  // default geometry: window of 3, stride 1, no padding
+    STRIDE: u32 = {u32:1},
+    PAD_LEFT: u32 = {u32:0},
+    PAD_RIGHT: u32 = {u32:0},
+    COUNT_PAD: bool = {false},
+    // Derived input dims (presumably (size, channels) selected according to DATA_FORMAT - confirm in data_format)
+    IN_SIZE: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[0]},
+    IN_CHANNELS: u32 = {data_format::to_size_chans(IN_DIM_0, IN_DIM_1, DATA_FORMAT)[1]},
+    // Output size: number of window positions over the padded input (integer division)
+    OUT_SIZE: u32 = {((IN_SIZE + PAD_LEFT + PAD_RIGHT - POOL_SIZE) / STRIDE) + 1},
+    // Output dims; these are used only in the return type below
+    OUT_DIM_0: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[0]},
+    OUT_DIM_1: u32 = {data_format::from_size_chans(OUT_SIZE, IN_CHANNELS, DATA_FORMAT)[1]},
+    >
+(
+    x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0]
+)
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    pooling_1d<  // explicit parametrics only; the IN_* and derived parameters are not passed on
+        OUT_NB, OUT_BE,
+        ROUNDING, OVERFLOW,
+        POOLING_OP, POOL_SIZE,
+        STRIDE, PAD_LEFT, PAD_RIGHT, COUNT_PAD,
+        DATA_FORMAT
+    >(x)
+}
+
+#[test]
+fn test_pooling_1d() {  // Checks pooling_1d_default and global_pooling_1d on the ramp 1..5 in both data layouts.
+    let NB = u32:16;
+    let BE = s32:0;
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+
+    let IN_SIZE = u32:5;
+    let CHANNELS = u32:1;  // single channel, so both layouts hold the same flat data
+    let OUT_SIZE = u32:3;  // (5 - 3) / 1 + 1 with pooling_1d_default's window of 3 and stride 1
+    let x_flat = fixed_point_util::make_fixed_points_1d<0>([s16:1,2,3,4,5]);
+    let expected_max_flat = fixed_point_util::make_fixed_points_1d<0>([s16:3,4,5]);  // max of {1,2,3}, {2,3,4}, {3,4,5}
+    let expected_avg_flat = fixed_point_util::make_fixed_points_1d<0>([s16:2,3,4]);  // mean of the same windows
+    let expected_global_max_flat = fixed_point_util::make_fixed_points_1d<0>([s16:5]);  // one value per channel
+    let expected_global_avg_flat = fixed_point_util::make_fixed_points_1d<0>([s16:3]);  // 15 / 5
+
+    // CHANNELS_LAST
+    let x_last = fixed_point_util::reshape_to_2d<IN_SIZE, CHANNELS>(x_flat);
+    let expected_max_last = fixed_point_util::reshape_to_2d<OUT_SIZE, CHANNELS>(expected_max_flat);
+    let expected_avg_last = fixed_point_util::reshape_to_2d<OUT_SIZE, CHANNELS>(expected_avg_flat);
+    assert_eq(
+        expected_max_last,
+        pooling_1d_default<PoolingOperation::MAX, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_avg_last,
+        pooling_1d_default<PoolingOperation::AVERAGE, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_global_max_flat,
+        global_pooling_1d<NB, BE, R, O, PoolingOperation::MAX, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_global_avg_flat,
+        global_pooling_1d<NB, BE, R, O, PoolingOperation::AVERAGE, DataFormat::CHANNELS_LAST>(x_last)
+    );
+
+    // CHANNELS_FIRST (dimension order swapped relative to the CHANNELS_LAST case)
+    let x_first = fixed_point_util::reshape_to_2d<CHANNELS, IN_SIZE>(x_flat);
+    let expected_max_first = fixed_point_util::reshape_to_2d<CHANNELS, OUT_SIZE>(expected_max_flat);
+    let expected_avg_first = fixed_point_util::reshape_to_2d<CHANNELS, OUT_SIZE>(expected_avg_flat);
+    assert_eq(
+        expected_max_first,
+        pooling_1d_default<PoolingOperation::MAX, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_avg_first,
+        pooling_1d_default<PoolingOperation::AVERAGE, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_global_max_flat,
+        global_pooling_1d<NB, BE, R, O, PoolingOperation::MAX, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_global_avg_flat,
+        global_pooling_1d<NB, BE, R, O, PoolingOperation::AVERAGE, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+}
+
+// Test pooling_2d with non-constant input and simple parameters
+
+#[test]
+fn test_pooling_2d() {  // Checks pooling_2d and global_pooling_2d on a 4x4 ramp (1..16) in both data layouts.
+    let NB = u32:16;
+    let BE = s32:0;
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+
+    let IN_HEIGHT = u32:4;
+    let IN_WIDTH = u32:4;
+    let CHANNELS = u32:1;  // single channel, so both layouts hold the same flat data
+    let OUT_HEIGHT = u32:2;
+    let OUT_WIDTH = u32:2;
+    let POOL_HEIGHT = u32:2;  // non-overlapping 2x2 windows: pool 2, stride 2, no padding
+    let POOL_WIDTH = u32:2;
+    let STRIDE_HEIGHT = u32:2;
+    let STRIDE_WIDTH = u32:2;
+    let PAD_TOP = u32:0;
+    let PAD_BOTTOM = u32:0;
+    let PAD_LEFT = u32:0;
+    let PAD_RIGHT = u32:0;
+    let x_flat = fixed_point_util::make_fixed_points_1d<0>([
+        s16:1, 2, 3, 4,
+        5, 6, 7, 8,
+        9, 10, 11, 12,
+        13, 14, 15, 16
+    ]);
+    let expected_max_flat = fixed_point_util::make_fixed_points_1d<0>([s16:6, 8, 14, 16]);  // max of each 2x2 window
+    let expected_avg_flat = fixed_point_util::make_fixed_points_1d<0>([s16:3, 5, 11, 13]);  // window means 3.5, 5.5, 11.5, 13.5 with the fraction dropped
+    let expected_global_max_flat = fixed_point_util::make_fixed_points_1d<0>([s16:16]);
+    let expected_global_avg_flat = fixed_point_util::make_fixed_points_1d<0>([s16:8]);  // 136 / 16 = 8.5 with the fraction dropped
+
+    // CHANNELS_LAST
+    let x_last = fixed_point_util::reshape_to_3d<IN_HEIGHT, IN_WIDTH, CHANNELS>(x_flat);
+    let expected_max_last = fixed_point_util::reshape_to_3d<OUT_HEIGHT, OUT_WIDTH, CHANNELS>(expected_max_flat);
+    let expected_avg_last = fixed_point_util::reshape_to_3d<OUT_HEIGHT, OUT_WIDTH, CHANNELS>(expected_avg_flat);
+    assert_eq(
+        expected_max_last,
+        pooling_2d<NB, BE, R, O, PoolingOperation::MAX, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, false, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_avg_last,
+        pooling_2d<NB, BE, R, O, PoolingOperation::AVERAGE, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, false, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_global_max_flat,
+        global_pooling_2d<NB, BE, R, O, PoolingOperation::MAX, DataFormat::CHANNELS_LAST>(x_last)
+    );
+    assert_eq(
+        expected_global_avg_flat,
+        global_pooling_2d<NB, BE, R, O, PoolingOperation::AVERAGE, DataFormat::CHANNELS_LAST>(x_last)
+    );
+
+    // CHANNELS_FIRST (channel dimension moved to the front)
+    let x_first = fixed_point_util::reshape_to_3d<CHANNELS, IN_HEIGHT, IN_WIDTH>(x_flat);
+    let expected_max_first = fixed_point_util::reshape_to_3d<CHANNELS, OUT_HEIGHT, OUT_WIDTH>(expected_max_flat);
+    let expected_avg_first = fixed_point_util::reshape_to_3d<CHANNELS, OUT_HEIGHT, OUT_WIDTH>(expected_avg_flat);
+    assert_eq(
+        expected_max_first,
+        pooling_2d<NB, BE, R, O, PoolingOperation::MAX, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, false, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_avg_first,
+        pooling_2d<NB, BE, R, O, PoolingOperation::AVERAGE, POOL_HEIGHT, POOL_WIDTH, STRIDE_HEIGHT, STRIDE_WIDTH, PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, false, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_global_max_flat,
+        global_pooling_2d<NB, BE, R, O, PoolingOperation::MAX, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+    assert_eq(
+        expected_global_avg_flat,
+        global_pooling_2d<NB, BE, R, O, PoolingOperation::AVERAGE, DataFormat::CHANNELS_FIRST>(x_first)
+    );
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/reshape.x b/hls4ml/templates/xls/firmware/nnet_utils/reshape.x
new file mode 100644
index 0000000000..c38572fbac
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/reshape.x
@@ -0,0 +1,246 @@
+import fixed_point;
+
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+pub fn reshape_1d_to_1d<  // Shape-preserving case: only converts each element to FixedPoint<OUT_NB, OUT_BE>.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32,  // requested output length
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32,  // input format and length; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_0] {
+    const_assert!(IN_DIM_0 == OUT_DIM_0);  // lengths must agree
+    fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x)
+}
+
+pub fn reshape_1d_to_2d<  // Converts the elements of a 1D array, then regroups them into a 2D array.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32,  // requested output shape, forwarded to reshape_to_2d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32,  // input format and length; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 == OUT_DIM_0 * OUT_DIM_1);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    fixed_point_util::reshape_to_2d<OUT_DIM_0, OUT_DIM_1>(x_flat)
+}
+
+pub fn reshape_1d_to_3d<  // Converts the elements of a 1D array, then regroups them into a 3D array.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32,  // requested output shape, forwarded to reshape_to_3d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32,  // input format and length; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    fixed_point_util::reshape_to_3d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2>(x_flat)
+}
+
+pub fn reshape_1d_to_4d<  // Converts the elements of a 1D array, then regroups them into a 4D array.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32, OUT_DIM_3: u32,  // requested output shape, forwarded to reshape_to_4d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32,  // input format and length; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2 * OUT_DIM_3);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x);
+    fixed_point_util::reshape_to_4d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2, OUT_DIM_3>(x_flat)
+}
+
+pub fn reshape_2d_to_1d<  // Flattens a 2D array, then converts each element to FixedPoint<OUT_NB, OUT_BE>.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32,  // requested output length
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 == OUT_DIM_0);  // element count must be preserved
+    let x_flat = fixed_point_util::flatten_2d(x);
+    fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x_flat)
+}
+
+pub fn reshape_2d_to_2d<  // Reshapes 2D -> 2D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32,  // requested output shape, forwarded to reshape_to_2d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 == OUT_DIM_0 * OUT_DIM_1);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_2d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_2d<OUT_DIM_0, OUT_DIM_1>(x_flat)
+}
+
+pub fn reshape_2d_to_3d<  // Reshapes 2D -> 3D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32,  // requested output shape, forwarded to reshape_to_3d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_2d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_3d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2>(x_flat)
+}
+
+pub fn reshape_2d_to_4d<  // Reshapes 2D -> 4D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32, OUT_DIM_3: u32,  // requested output shape, forwarded to reshape_to_4d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2 * OUT_DIM_3);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_2d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_4d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2, OUT_DIM_3>(x_flat)
+}
+
+pub fn reshape_3d_to_1d<  // Flattens a 3D array, then converts each element to FixedPoint<OUT_NB, OUT_BE>.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32,  // requested output length
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 == OUT_DIM_0);  // element count must be preserved
+    let x_flat = fixed_point_util::flatten_3d(x);
+    fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x_flat)
+}
+
+pub fn reshape_3d_to_2d<  // Reshapes 3D -> 2D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32,  // requested output shape, forwarded to reshape_to_2d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 == OUT_DIM_0 * OUT_DIM_1);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_3d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_2d<OUT_DIM_0, OUT_DIM_1>(x_flat)
+}
+
+pub fn reshape_3d_to_3d<  // Reshapes 3D -> 3D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32,  // requested output shape, forwarded to reshape_to_3d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_3d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_3d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2>(x_flat)
+}
+
+pub fn reshape_3d_to_4d<  // Reshapes 3D -> 4D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32, OUT_DIM_3: u32,  // requested output shape, forwarded to reshape_to_4d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2 * OUT_DIM_3);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_3d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_4d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2, OUT_DIM_3>(x_flat)
+}
+
+pub fn reshape_4d_to_1d<  // Flattens a 4D array, then converts each element to FixedPoint<OUT_NB, OUT_BE>.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32,  // requested output length
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32, IN_DIM_3: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_3][IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 * IN_DIM_3 == OUT_DIM_0);  // element count must be preserved
+    let x_flat = fixed_point_util::flatten_4d(x);
+    fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x_flat)
+}
+
+pub fn reshape_4d_to_2d<  // Reshapes 4D -> 2D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32,  // requested output shape, forwarded to reshape_to_2d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32, IN_DIM_3: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_3][IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 * IN_DIM_3 == OUT_DIM_0 * OUT_DIM_1);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_4d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_2d<OUT_DIM_0, OUT_DIM_1>(x_flat)
+}
+
+pub fn reshape_4d_to_3d<  // Reshapes 4D -> 3D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32,  // requested output shape, forwarded to reshape_to_3d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32, IN_DIM_3: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_3][IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 * IN_DIM_3 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_4d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_3d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2>(x_flat)
+}
+
+pub fn reshape_4d_to_4d<  // Reshapes 4D -> 4D through a flat intermediate, converting the element format on the way.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    OUT_DIM_0: u32, OUT_DIM_1: u32, OUT_DIM_2: u32, OUT_DIM_3: u32,  // requested output shape, forwarded to reshape_to_4d
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32, IN_DIM_3: u32,  // input format and shape; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_3][IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(IN_DIM_0 * IN_DIM_1 * IN_DIM_2 * IN_DIM_3 == OUT_DIM_0 * OUT_DIM_1 * OUT_DIM_2 * OUT_DIM_3);  // element count must be preserved
+    let x_flat = fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(fixed_point_util::flatten_4d(x));  // flatten first, then convert
+    fixed_point_util::reshape_to_4d<OUT_DIM_0, OUT_DIM_1, OUT_DIM_2, OUT_DIM_3>(x_flat)
+}
+
+#[test]
+fn test_reshape_1d_to_4d() {  // 8 elements regrouped as 2 x 1 x 2 x 2 with the element format unchanged (16 bits, exponent 0).
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_1d<0>([s16:1, 2, 3, 4, 5, 6, 7, 8]);
+    let expected = fixed_point_util::make_fixed_points_4d<0>([  // input order is kept; the last dimension varies fastest
+        [[[s16:1, 2], [s16:3, 4]]],
+        [[[s16:5, 6], [s16:7, 8]]],
+    ]);
+    assert_eq(expected, reshape_1d_to_4d<16, 0, R, O, 2, 1, 2, 2>(x));  // only the OUT_* parametrics are given explicitly
+}
+
+#[test]
+fn test_reshape_2d_to_4d() {  // A 2 x 4 input regrouped as 2 x 1 x 2 x 2 with the element format unchanged.
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_2d<0>([[s16:1, 2, 3, 4], [s16:5, 6, 7, 8]]);
+    let expected = fixed_point_util::make_fixed_points_4d<0>([  // each input row of 4 becomes one 1 x 2 x 2 block
+        [[[s16:1, 2], [s16:3, 4]]],
+        [[[s16:5, 6], [s16:7, 8]]],
+    ]);
+    assert_eq(expected, reshape_2d_to_4d<16, 0, R, O, 2, 1, 2, 2>(x));
+}
+
+#[test]
+fn test_reshape_4d_to_1d() {  // A 2 x 1 x 2 x 2 input flattened to 8 elements; mirrors test_reshape_1d_to_4d.
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_4d<0>([
+        [[[s16:1, 2], [s16:3, 4]]],
+        [[[s16:5, 6], [s16:7, 8]]],
+    ]);
+    let expected = fixed_point_util::make_fixed_points_1d<0>([s16:1, 2, 3, 4, 5, 6, 7, 8]);  // elements in nesting order
+    assert_eq(expected, reshape_4d_to_1d<16, 0, R, O, 8>(x));
+}
+
+#[test]
+fn test_reshape_3d_to_2d_resize() {  // Reshape 2 x 2 x 2 -> 4 x 2 combined with an element format change.
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_3d<0>([  // 8-bit raw values with exponent 0
+        [[s8:1, 2], [s8:3, 4]],
+        [[s8:5, 6], [s8:7, 8]],
+    ]);
+    let expected = fixed_point_util::make_fixed_points_2d<-1>([[s16:2, 4], [s16:6, 8], [s16:10, 12], [s16:14, 16]]);  // 16-bit, exponent -1: raw values are doubled
+    assert_eq(expected, reshape_3d_to_2d<16, -1, R, O, 4, 2>(x));
+}
diff --git a/hls4ml/templates/xls/firmware/nnet_utils/transpose.x b/hls4ml/templates/xls/firmware/nnet_utils/transpose.x
new file mode 100644
index 0000000000..82b3245c04
--- /dev/null
+++ b/hls4ml/templates/xls/firmware/nnet_utils/transpose.x
@@ -0,0 +1,194 @@
+import std;
+import fixed_point;
+
+import ap_types.fixed_point_util;
+
+type FixedPoint = fixed_point::FixedPoint;
+type RoundingMode = fixed_point_util::RoundingMode;
+type OverflowMode = fixed_point_util::OverflowMode;
+
+// Simple O(DIM^2) exchange sort (ascending; compares every pair i < j) used for checking permutation indices.
+fn sort<S: bool, N: u32, DIM: u32>(x: xN[S][N][DIM])-> xN[S][N][DIM] {
+    let res = x;
+    for (i, res) in 0..DIM {  // after pass i, position i holds the smallest of the elements at positions >= i
+        for (j, res) in 0..DIM {
+            if j > i && res[j] < res[i] {  // only positions after i are compared with position i
+                update(update(res, i, res[j]), j, res[i])  // swap; both reads use the value of `res` before the swap
+            } else {
+                res
+            }
+        }(res)
+    }(res)
+}
+
+#[test]
+fn test_sort() {  // sort() returns its input in ascending order.
+    assert_eq(sort([u32:0, 1]), [u32:0, 1]);  // already sorted
+    assert_eq(sort([u32:1, 0]), [u32:0, 1]);
+    assert_eq(sort([u32:3, 1, 2, 0]), [u32:0, 1, 2, 3]);
+    assert_eq(sort([u32:2, 1, 2, 0]), [u32:0, 1, 2, 2]);  // duplicate values are kept
+}
+
+fn permute<S: bool, N: u32, DIM: u32>(x: xN[S][N][DIM], perm: u32[DIM])-> xN[S][N][DIM] {  // Returns res with res[i] == x[perm[i]].
+    let range: u32[DIM] = 0..DIM;
+    assert_fmt!(sort(perm) == range, "invalid_perm");  // perm must contain each index 0..DIM exactly once
+    for (i, res) in 0..DIM {
+       update(res, i, x[perm[i]])  // reads the original x, so earlier writes cannot affect later elements
+    }(x)  // x is only the initial accumulator; every position is overwritten
+}
+
+#[test]
+fn test_permute() {  // permute(x, perm)[i] == x[perm[i]].
+    assert_eq(permute([0,1,2], [1,2,0]), [1,2,0]);  // x is the identity here, so the result equals perm
+    assert_eq(permute([3,4,5], [1,0,2]), [4,3,5]);  // first two elements swapped
+}
+
+pub fn transpose_1d<  // 1D "transpose": the shape is unchanged, only the element format is converted.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,  // forwarded to resize_1d
+    PERM_0: u32,  // must be 0, see the const_assert below
+    IN_NB: u32, IN_BE: s32, DIM: u32,  // input format and length; they appear in the type of x
+>
+(x: FixedPoint<IN_NB, IN_BE>[DIM])
+-> FixedPoint<OUT_NB, OUT_BE>[DIM] {
+    const_assert!(PERM_0 == u32:0);  // the only permutation of a single axis is the identity
+    fixed_point_util::resize_1d<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x)
+}
+
+pub fn transpose_2d<  // Permutes the two axes of x according to (PERM_0, PERM_1) and converts the element format.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    // Permutation: (0,1) or (1,0)
+    PERM_0: u32, PERM_1: u32,
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32,
+    OUT_DIM_0: u32 = {[IN_DIM_0, IN_DIM_1][PERM_0]},  // output axis k takes its size from input axis PERM_k
+    OUT_DIM_1: u32 = {[IN_DIM_0, IN_DIM_1][PERM_1]},
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(sort([PERM_0, PERM_1]) == u32[2]:[0, 1]);  // PERM_* must be a permutation of {0, 1}
+    let res = zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_1][OUT_DIM_0]>();  // placeholder; the loops below write every element
+    for (i_0, res) in 0..IN_DIM_0 {
+        for (i_1, res) in 0..IN_DIM_1 {
+            let out_idx = permute([i_0, i_1], [PERM_0, PERM_1]);  // out_idx[k] is the input index along axis PERM_k
+            update(
+                res,
+                (out_idx[0], out_idx[1]),
+                fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x[i_0][i_1])
+            )
+        }(res)
+    }(res)
+}
+
+pub fn transpose_3d<  // Permutes the three axes of x according to (PERM_0, PERM_1, PERM_2) and converts the element format.
+    OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+    PERM_0: u32, PERM_1: u32, PERM_2: u32,
+    IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32,
+    OUT_DIM_0: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2][PERM_0]},  // output axis k takes its size from input axis PERM_k
+    OUT_DIM_1: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2][PERM_1]},
+    OUT_DIM_2: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2][PERM_2]},
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(sort([PERM_0, PERM_1, PERM_2]) == u32[3]:[0, 1, 2]);  // PERM_* must be a permutation of {0, 1, 2}
+    let res = zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>();  // placeholder; the loops below write every element
+    for (i_0, res) in 0..IN_DIM_0 {
+        for (i_1, res) in 0..IN_DIM_1 {
+            for (i_2, res) in 0..IN_DIM_2 {
+                let out_idx = permute([i_0, i_1, i_2], [PERM_0, PERM_1, PERM_2]);  // out_idx[k] is the input index along axis PERM_k
+                update(
+                    res,
+                    (out_idx[0], out_idx[1], out_idx[2]),
+                    fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x[i_0][i_1][i_2])
+                )
+            }(res)
+        }(res)
+    }(res)
+}
+
+
+pub fn transpose_4d<  // Permutes the four axes of x according to (PERM_0..PERM_3) and converts the element format.
+OUT_NB: u32, OUT_BE: s32, ROUNDING: RoundingMode, OVERFLOW: OverflowMode,
+PERM_0: u32, PERM_1: u32, PERM_2: u32, PERM_3: u32,
+IN_NB: u32, IN_BE: s32, IN_DIM_0: u32, IN_DIM_1: u32, IN_DIM_2: u32, IN_DIM_3: u32,
+OUT_DIM_0: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2, IN_DIM_3][PERM_0]},  // output axis k takes its size from input axis PERM_k
+OUT_DIM_1: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2, IN_DIM_3][PERM_1]},
+OUT_DIM_2: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2, IN_DIM_3][PERM_2]},
+OUT_DIM_3: u32 = {[IN_DIM_0, IN_DIM_1, IN_DIM_2, IN_DIM_3][PERM_3]},
+>
+(x: FixedPoint<IN_NB, IN_BE>[IN_DIM_3][IN_DIM_2][IN_DIM_1][IN_DIM_0])
+-> FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0] {
+    const_assert!(sort([PERM_0, PERM_1, PERM_2, PERM_3]) == u32[4]:[0, 1, 2, 3]);  // PERM_* must be a permutation of {0, 1, 2, 3}
+    let res = zero!<FixedPoint<OUT_NB, OUT_BE>[OUT_DIM_3][OUT_DIM_2][OUT_DIM_1][OUT_DIM_0]>();  // placeholder; the loops below write every element
+    for (i_0, res) in 0..IN_DIM_0 {
+        for (i_1, res) in 0..IN_DIM_1 {
+            for (i_2, res) in 0..IN_DIM_2 {
+                for (i_3, res) in 0..IN_DIM_3 {
+                    let out_idx = permute([i_0, i_1, i_2, i_3], [PERM_0, PERM_1, PERM_2, PERM_3]);  // out_idx[k] is the input index along axis PERM_k
+                    update(
+                        res,
+                        (out_idx[0], out_idx[1], out_idx[2], out_idx[3]),
+                        fixed_point_util::resize<OUT_NB, OUT_BE, ROUNDING, OVERFLOW>(x[i_0][i_1][i_2][i_3])
+                    )
+                }(res)
+            }(res)
+        }(res)
+    }(res)
+}
+
+// Testing
+
+#[test]
+fn test_transpose_2d() {  // Identity and swapped-axes permutations of a 2 x 3 input.
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_2d<0>([[s16:1, 2, 3], [s16:4, 5, 6]]);
+    let x_t = fixed_point_util::make_fixed_points_2d<0>([[s16:1, 4], [s16:2, 5], [s16:3, 6]]);  // 3 x 2 matrix transpose of x
+
+    assert_eq(x, transpose_2d<16, 0, R, O, 0, 1>(x));  // permutation (0, 1) leaves x unchanged
+    assert_eq(x_t, transpose_2d<16, 0, R, O, 1, 0>(x));
+}
+
+#[test]
+fn test_transpose_3d() {  // Several axis permutations of a 2 x 3 x 2 input; the x_abc names encode the permutation (a, b, c).
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_3d<0>([
+        [[s16:1, 2], [s16:3, 4], [s16:5, 6]],
+        [[s16:7, 8], [s16:9, 10], [s16:11, 12]],
+    ]);
+    let x_102 = fixed_point_util::make_fixed_points_3d<0>([  // 3 x 2 x 2: first two axes swapped
+        [[s16:1, 2], [s16:7, 8]],
+        [[s16:3, 4], [s16:9, 10]],
+        [[s16:5, 6], [s16:11, 12]],
+    ]);
+    let x_120 = fixed_point_util::make_fixed_points_3d<0>([  // 3 x 2 x 2: original axis 0 moved to the end
+        [[s16:1, 7], [s16:2, 8]],
+        [[s16:3, 9], [s16:4, 10]],
+        [[s16:5, 11], [s16:6, 12]],
+    ]);
+    let x_210 = fixed_point_util::make_fixed_points_3d<0>([  // 2 x 3 x 2: axis order reversed
+        [[s16:1, 7], [s16:3, 9], [s16:5, 11]],
+        [[s16:2, 8], [s16:4, 10], [s16:6, 12]],
+    ]);
+
+    assert_eq(x, transpose_3d<16, 0, R, O, 0, 1, 2>(x));  // identity permutation
+    assert_eq(x_102, transpose_3d<16, 0, R, O, 1, 0, 2>(x));
+    assert_eq(x_120, transpose_3d<16, 0, R, O, 1, 2, 0>(x));
+    assert_eq(x_210, transpose_3d<16, 0, R, O, 2, 1, 0>(x));
+}
+
+#[test]
+fn test_transpose_4d() {  // Identity and fully reversed axis order on a 2 x 2 x 2 x 2 input.
+    let R = RoundingMode::TRN;
+    let O = OverflowMode::WRAP;
+    let x = fixed_point_util::make_fixed_points_4d<0>([
+        [[[s16:1, 2], [s16:3, 4]], [[s16:5, 6], [s16:7, 8]]],
+        [[[s16:9, 10], [s16:11, 12]], [[s16:13, 14], [s16:15, 16]]],
+    ]);
+    let x_3210 = fixed_point_util::make_fixed_points_4d<0>([  // x_3210[a][b][c][d] == x[d][c][b][a]
+        [[[s16:1, 9], [s16:5, 13]], [[s16:3, 11], [s16:7, 15]]],
+        [[[s16:2, 10], [s16:6, 14]], [[s16:4, 12], [s16:8, 16]]],
+    ]);
+
+    assert_eq(x, transpose_4d<16, 0, R, O, 0, 1, 2, 3>(x));  // identity permutation
+    assert_eq(x_3210, transpose_4d<16, 0, R, O, 3, 2, 1, 0>(x));
+}
diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index 8c48f79d2d..2792d3109c 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -7,6 +7,7 @@
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
 from hls4ml.writer.writers import Writer, get_writer, register_writer  # noqa: F401
+from hls4ml.writer.xls_writer import XLSWriter
 
 register_writer('Vivado', VivadoWriter)
 register_writer('VivadoAccelerator', VivadoAcceleratorWriter)
@@ -16,3 +17,4 @@
 register_writer('Catapult', CatapultWriter)
 register_writer('Libero', LiberoWriter)
 register_writer('SymbolicExpression', SymbolicExpressionWriter)
+register_writer('XLS', XLSWriter)
diff --git a/hls4ml/writer/xls_writer.py b/hls4ml/writer/xls_writer.py
new file mode 100644
index 0000000000..8176e30126
--- /dev/null
+++ b/hls4ml/writer/xls_writer.py
@@ -0,0 +1,424 @@
+# Typing imports
+from __future__ import annotations  # makes all annotations into strings
+
+import tarfile
+from collections import OrderedDict
+from collections.abc import Iterable
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from hls4ml.backends.xls.xls_types import (
+    XLSConst,
+    XLSFunctionCall,
+    XLSFunctionDefinition,
+    XLSImport,
+    XLSTensorVariable,
+    XLSTypeAlias,
+    XLSVariableDefinition,
+)
+from hls4ml.model.layers import Layer
+
+if TYPE_CHECKING:
+    from hls4ml.model.graph import ModelGraph
+
+import os
+from shutil import copyfile, copytree, rmtree
+
+from hls4ml.writer.writers import Writer
+
+XLS_TEMPLATE_DIR = Path(__file__).resolve().parent.parent / 'templates/xls'
+INDENT = ' ' * 4
+
+
+def firmware_dir(model: ModelGraph):  # <output dir>/firmware
+    return Path(model.config.get_output_dir(), 'firmware')
+
+
+def reports_dir(model: ModelGraph):  # <output dir>/reports
+    return Path(model.config.get_output_dir(), 'reports')
+
+
+def append_line(line: str, x: Any, indent=None) -> str:
+    # An int ``indent`` counts INDENT units, None means no indentation, anything else is used verbatim.
+    if isinstance(indent, int):
+        indent = INDENT * indent
+    prefix = '' if indent is None else indent
+    return line + f'{prefix}{x}\n'
+
+
+def append_lines(s: str, *xs: Any, indent=None) -> str:
+    # Accept both append_lines(s, [1, 2, 3]) and append_lines(s, 1, 2, 3); a lone str/bytes is one item.
+    single_iterable = len(xs) == 1 and isinstance(xs[0], Iterable) and not isinstance(xs[0], (str, bytes))
+    items = tuple(xs[0]) if single_iterable else xs
+
+    for item in items:
+        s = append_line(s, item, indent=indent)
+    return s
+
+
+def to_tuple_or_singleton_str(xs: Iterable[Any], sep: str = ', ') -> str:
+    # A single element is rendered bare; several are rendered as a parenthesised, sep-joined tuple.
+    items = [str(x) for x in xs]
+    assert len(items) >= 1
+    joined = sep.join(items)
+    return joined if len(items) == 1 else f'({joined})'
+
+
+class XLSWriter(Writer):
+    def write_project_dir(self, model: ModelGraph) -> None:
+        """Create the ``firmware`` and ``reports`` directories of the project
+
+        Directories that already exist are left untouched.
+
+        Args:
+            model (ModelGraph): the hls4ml model.
+        """
+
+        # exist_ok=True only tolerates an existing *directory*; a regular file
+        # in the way still raises FileExistsError.
+        output_dirs = (firmware_dir(model), reports_dir(model))
+        for directory in output_dirs:
+            os.makedirs(directory, exist_ok=True)
+
+    def write_build_script(self, model: ModelGraph) -> None:
+        # Copy the build script and the constraints file from the XLS templates into the output directory.
+        output_dir = Path(model.config.get_output_dir())
+        for filename in ('build_prj.tcl', 'constraints.xdc'):
+            copyfile(XLS_TEMPLATE_DIR / filename, output_dir / filename)
+
+    def write_project_dslx(self, model: ModelGraph) -> None:
+        """Write the main architecture source file (<project name>.x) from the ``myproject.x`` template
+
+        Marker comments are expanded; ``myproject`` in template lines is replaced by the project name.
+
+        Args:
+            model (ModelGraph): the hls4ml model.
+        """
+        output_path = firmware_dir(model) / f'{model.config.get_project_name()}.x'
+
+        layers = list(model.get_layers())
+
+        output_vars = OrderedDict(
+            (model.graph[output].get_attr('xls_module_name'), model.graph[output].get_attr('xls_output_variables')[0])
+            for output in model.outputs
+        )
+
+        with open(output_path, 'w') as f, open(XLS_TEMPLATE_DIR / 'firmware/myproject.x') as template:
+            for line in template:
+                if 'myproject' in line:
+                    line = line.replace('myproject', model.config.get_project_name())
+                elif '// hls-fpga-machine-learning insert imports' in line:
+                    line = append_lines(line, (XLSImport(layer.get_attr('xls_module_name')) for layer in layers))
+
+                    for name in model.inputs:
+                        i = model.graph[name].index
+                        input_module = model.graph[name].get_attr('xls_module_name')
+                        input_var = model.graph[name].get_attr('xls_input_variables')[0]
+                        line = append_lines(
+                            line,
+                            XLSConst(
+                                name=f'INPUT_{i}_BINARY_EXPONENT',
+                                value=f'{input_module}::{input_var.binary_exponent.name}',
+                                type='s32',
+                            ),
+                            XLSTypeAlias(name=f'Input_{i}_Type', type=f'{input_module}::{input_var.type_alias.name}'),
+                            XLSTypeAlias(
+                                name=f'Input_{i}_Type_Bits', type=f'{input_module}::{input_var.type_alias_bits.name}'
+                            ),
+                        )
+                    for name in model.outputs:
+                        i = model.graph[name].index
+                        output_module = model.graph[name].get_attr('xls_module_name')
+                        output_var = model.graph[name].get_attr('xls_output_variables')[0]
+                        line = append_lines(
+                            line,
+                            XLSConst(
+                                name=f'OUTPUT_{i}_NUM_BITS', value=f'{output_module}::{output_var.num_bits.name}', type='u32'
+                            ),
+                            XLSConst(
+                                name=f'OUTPUT_{i}_BINARY_EXPONENT',
+                                value=f'{output_module}::{output_var.binary_exponent.name}',
+                                type='s32',
+                            ),
+                            XLSTypeAlias(name=f'Output_{i}_Type', type=f'{output_module}::{output_var.type_alias.name}'),
+                            XLSTypeAlias(
+                                name=f'Output_{i}_Type_Bits', type=f'{output_module}::{output_var.type_alias_bits.name}'
+                            ),
+                        )
+                elif '// hls-fpga-machine-learning insert architecture input' in line:
+                    for name in model.inputs:
+                        i = model.graph[name].index
+                        line = append_line(line, f'input_{i}: Input_{i}_Type,', indent=1)
+                elif '// hls-fpga-machine-learning insert architecture output' in line:
+                    output_types = [f'Output_{model.graph[name].index}_Type' for name in model.outputs]
+                    line = append_line(line, to_tuple_or_singleton_str(output_types))
+
+                elif '// hls-fpga-machine-learning insert layers' in line:
+                    output_var_names = []
+                    for layer in layers:
+                        layer_module_name = layer.get_attr('xls_module_name')
+                        layer_input_vars = layer.get_attr('xls_input_variables')
+                        layer_output_vars = layer.get_attr('xls_output_variables')
+
+                        if layer.class_name == 'Input':
+                            assert len(layer.inputs) == 1, (
+                                f'Input layer {layer.name} should have a single input, but got {len(layer.inputs)}.'
+                            )
+                            input_var_names = [f'input_{layer.index}']
+                        else:
+                            input_var_names = [var.name for var in layer_input_vars]
+                        layer_output_var_names = [var.name for var in layer_output_vars]
+                        if layer.name in model.outputs:
+                            output_var_names += layer_output_var_names
+                        line = append_line(
+                            line,
+                            XLSVariableDefinition(
+                                name=to_tuple_or_singleton_str(layer_output_var_names),
+                                value=XLSFunctionCall(name=f'{layer_module_name}::transform', args=input_var_names),
+                            ),
+                            indent=1,
+                        )
+                    line = append_line(line, to_tuple_or_singleton_str(output_var_names), indent=1)
+
+                elif '// hls-fpga-machine-learning insert bits input' in line:
+                    for name in model.inputs:
+                        i = model.graph[name].index
+                        line = append_line(line, f'input_bits_{i}: Input_{i}_Type_Bits,', indent=1)
+
+                elif '// hls-fpga-machine-learning insert bits output' in line:
+                    out_types = [f'Output_{model.graph[name].index}_Type_Bits' for name in model.outputs]
+                    line = append_line(line, to_tuple_or_singleton_str(out_types))
+
+                elif '// hls-fpga-machine-learning insert convert from bits' in line:
+                    fixed_point_input_names = []
+                    xls_statements: list[XLSVariableDefinition | str] = []
+                    for name in model.inputs:
+                        i = model.graph[name].index
+                        bits_name = f'input_bits_{i}'
+                        fixed_point_name = f'input_fixed_point_{i}'
+                        input_var = model.graph[name].get_attr('xls_input_variables')[0]
+                        rank = len(input_var.shape)
+                        fixed_point_input_names.append(fixed_point_name)
+                        xls_statements.append(
+                            XLSVariableDefinition(
+                                name=fixed_point_name,
+                                value=XLSFunctionCall(
+                                    name=f'fixed_point_util::make_fixed_points_{rank}d',
+                                    params=[f'INPUT_{i}_BINARY_EXPONENT'],
+                                    args=bits_name,
+                                ),
+                            )
+                        )
+                    output_fixed_point_names = tuple(
+                        f'output_fixed_point_{output_var.name}' for output_var in output_vars.values()
+                    )
+                    xls_statements.append(
+                        XLSVariableDefinition(
+                            name=to_tuple_or_singleton_str(output_fixed_point_names),
+                            value=XLSFunctionCall(
+                                name=f'{model.config.get_project_name()}_fixed_point', args=fixed_point_input_names
+                            ),
+                        )
+                    )
+
+                    output_bits_names = []
+                    for name in model.outputs:
+                        output_layer = model.graph[name]
+                        i = output_layer.index
+                        output_var = output_layer.get_attr('xls_output_variables')[0]
+                        bits_name = f'output_bits_{i}'
+                        output_bits_names.append(bits_name)
+                        fixed_point_name = f'output_fixed_point_{output_var.name}'
+                        rank = len(output_var.shape)
+                        xls_statements.append(
+                            XLSVariableDefinition(
+                                name=bits_name,
+                                value=XLSFunctionCall(
+                                    name=f'fixed_point_util::to_significand_{rank}d',
+                                    params=[],
+                                    args=fixed_point_name,
+                                ),
+                            )
+                        )
+                    xls_statements.append(to_tuple_or_singleton_str(output_bits_names))
+
+                    line = append_lines(line, [f'{x}' for x in xls_statements], indent=1)
+
+                elif '// hls-fpga-machine-learning insert top-level function call' in line:
+                    line = append_line(
+                        line,
+                        XLSFunctionCall(
+                            name=f'{model.config.get_project_name()}_bits',  # template names are renamed, so follow them
+                            params=[],
+                            args=[f'input_bits_{model.graph[name].index}' for name in model.inputs],
+                        ),
+                        indent=1,
+                    )
+
+                # Lines that matched no branch above are written through unchanged.
+                f.write(line)
+
+    def write_layers(self, model: ModelGraph):
+        for layer in model.get_layers():  # every layer of the model gets its own DSLX module
+            self.write_layer(model, layer)
+
+    def write_layer(self, model: ModelGraph, layer: Layer):
+        """Generate the DSLX module ``<xls_module_name>.x`` of one layer from the ``layer.x`` template"""
+        layer_module_name = layer.get_attr('xls_module_name')
+        input_vars: list[XLSTensorVariable] = layer.get_attr('xls_input_variables')
+        output_vars: list[XLSTensorVariable] = layer.get_attr('xls_output_variables')
+        output_path = firmware_dir(model) / f'{layer_module_name}.x'
+        template_path = XLS_TEMPLATE_DIR / 'firmware/layer.x'
+        # Both files are managed by the ``with`` statement, so the template handle is closed as well.
+        with open(output_path, 'w') as f, open(template_path) as template:
+            for line in template:
+                if '// hls-fpga-machine-learning insert imports' in line:
+                    imports = []
+                    func_namespace = layer.get_attr('xls_func_call').name.module_name
+                    if func_namespace is not None and func_namespace != 'fixed_point_util':
+                        imports.append(XLSImport(name=f'nnet_utils.{func_namespace}'))
+                    if layer.get_attr('lookup_tables'):
+                        imports.append(XLSImport(name='nnet_utils.lookup_table'))
+                    if layer.get_attr('data_format'):
+                        imports.append(XLSImport(name='nnet_utils.data_format'))
+                    line = append_lines(line, imports)
+
+                elif '// hls-fpga-machine-learning insert types' in line:
+                    for in_out_vars in (input_vars, output_vars):
+                        for var in in_out_vars:
+                            line = append_lines(line, var.definitions())
+                            line += '\n'
+
+                elif '// hls-fpga-machine-learning insert weights' in line:
+                    weights = layer.get_attr('xls_weights')
+                    if weights:
+                        line = append_line(line, weights)
+                    bias = layer.get_attr('xls_bias')
+                    if bias:
+                        line = append_lines(line, '\n', bias)
+
+                elif '// hls-fpga-machine-learning insert lookup tables' in line:
+                    for table in layer.get_attr('lookup_tables', []):
+                        line = append_line(line, table)
+                        line += '\n'
+
+                elif '// hls-fpga-machine-learning insert other constants' in line:
+                    # NB: sometimes constant is already defined, e.g. output dimensions for Reshape layer
+                    # In that case, we don't write it again.
+                    existing_names = {
+                        x.name
+                        for in_out_vars in (input_vars, output_vars)
+                        for var in in_out_vars
+                        for x in var.definitions()
+                        if isinstance(x, XLSConst)
+                    }
+                    extra_consts = (
+                        x
+                        for key in ('xls_extra_func_params', 'xls_extra_func_args')
+                        for x in layer.get_attr(key)
+                        if x.name not in existing_names
+                    )
+                    line = append_lines(line, extra_consts)
+
+                elif '// hls-fpga-machine-learning insert helpers for different input ranks' in line:
+                    # Generate helper functions for the case of higher-rank input data, for example:
+                    #     transform_1d(x) -> softmax(x)
+                    #     transform_2d(x) -> map(transform_1d, x)
+                    #     transform_3d(x) -> map(transform_2d, x)
+                    #     transform(x) -> transform_3d(x)  // top-level function
+                    min_input_rank = layer.get_attr('xls_min_input_rank')
+                    input_rank = len(input_vars[0].shape)
+                    for rank in range(min_input_rank, input_rank + 1):
+                        input_types = [input_var.type_alias.type for input_var in input_vars]
+                        output_types = [output_var.type_alias.type for output_var in output_vars]
+                        # Get inner type
+                        for _ in range(input_rank - rank):
+                            input_types = [input_type.element_type for input_type in input_types]
+                            output_types = [output_type.element_type for output_type in output_types]
+                        assert input_types[0].rank == rank, (
+                            f'Input rank mismatch: expected {rank}, got {input_types[0].rank}'
+                        )
+
+                        name = f'transform_{rank}d'
+                        params = []
+                        args = [f'x_{i}: {input_type}' for i, input_type in enumerate(input_types)]
+
+                        output_type = to_tuple_or_singleton_str(output_types)
+
+                        if rank == min_input_rank:
+                            body = layer.get_attr('xls_func_call')
+                        else:
+                            dim_0 = input_types[0].shape[0]
+                            acc_vars = tuple(f'acc_{i}' for i in range(len(output_types)))
+                            out_var_i = tuple(f'out_{i}' for i in range(len(output_types)))
+                            in_vars_i = [f'x_{i}[i]' for i, input_type in enumerate(input_types)]
+                            transform_i = XLSVariableDefinition(
+                                name=to_tuple_or_singleton_str(out_var_i),
+                                value=XLSFunctionCall(name=f'transform_{rank - 1}d', args=in_vars_i),
+                            )
+                            update_i = to_tuple_or_singleton_str(
+                                [f'update({acc}, i, out_{i})' for i, acc in enumerate(acc_vars)]
+                            )
+                            body = f"""{INDENT}for (i, {to_tuple_or_singleton_str(acc_vars)}) in 0..{dim_0} {{
+{INDENT}{INDENT}{transform_i}
+{INDENT}{INDENT}{update_i}
+{INDENT}}}(zero!<{output_type}>())
+                            """
+                        line = append_line(
+                            line,
+                            XLSFunctionDefinition(name=name, params=params, args=args, output_type=output_type, body=body),
+                        )
+                elif '// hls-fpga-machine-learning insert layer input' in line:
+                    input_args = [f'{INDENT}x_{i}: {input_var.type_alias.name}' for i, input_var in enumerate(input_vars)]
+                    line = append_line(line, ',\n'.join(input_args))
+                elif '// hls-fpga-machine-learning insert layer output' in line:
+                    output_types = to_tuple_or_singleton_str(output_var.type_alias.name for output_var in output_vars)
+                    line = append_line(line, f'{output_types}')
+
+                elif '// hls-fpga-machine-learning insert top-level function call' in line:
+                    input_rank = len(input_vars[0].shape)
+                    line = append_line(
+                        line,
+                        XLSFunctionCall(
+                            name=f'transform_{input_rank}d', params=[], args=[f'x_{i}' for i in range(len(input_vars))]
+                        ),
+                        indent=1,
+                    )
+                # Lines that matched no marker above are written through unchanged.
+                f.write(line)
+
+    def write_nnet_utils(self, model: ModelGraph) -> None:
+        """Replace the project's ``nnet_utils`` and ``ap_types`` directories with fresh template copies
+
+        Args:
+            model (ModelGraph): the hls4ml model.
+        """
+        for dirname in ('nnet_utils', 'ap_types'):
+            target = firmware_dir(model) / dirname
+            # A copy left over from an earlier run is removed before copying again.
+            if os.path.exists(target):
+                rmtree(target)
+            copytree(XLS_TEMPLATE_DIR / 'firmware' / dirname, target)
+
+    @staticmethod
+    def write_tar(model):
+        """Archive the output directory as ``<output dir>.tar.gz`` when ``WriteTar`` is enabled
+
+        Args:
+            model (ModelGraph): the hls4ml model.
+        """
+        if not model.config.get_writer_config().get('WriteTar', False):
+            return
+        output_dir = model.config.get_output_dir()
+        archive_path = Path(output_dir + '.tar.gz')
+        archive_path.unlink(missing_ok=True)  # drop an archive left by a previous run
+        with tarfile.open(archive_path, mode='w:gz') as archive:
+            archive.add(output_dir, recursive=True, arcname='')
+
+    def write_hls(self, model: ModelGraph) -> None:
+        self.write_project_dir(model)  # creates firmware/ and reports/; later steps write into firmware/
+        self.write_build_script(model)  # build_prj.tcl and constraints.xdc
+        self.write_project_dslx(model)  # <project name>.x
+        self.write_layers(model)  # one .x module per layer
+        self.write_nnet_utils(model)  # nnet_utils/ and ap_types/ directories
+        self.write_tar(model)  # last, so the optional archive contains everything written above
diff --git a/pyproject.toml b/pyproject.toml
index a39c7cb362..77f43d374c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,7 @@ optional-dependencies.testing-keras3 = [
   "keras>=3.10",
   "tensorflow>=2.15",
 ]
+optional-dependencies.xls = [ "xls-python>=0.1.9875" ]
 urls.Homepage = "https://fastmachinelearning.org/hls4ml"
 scripts.hls4ml = "hls4ml.cli:main"
 entry-points.pytest_randomly.random_seeder = "hls4ml:reseed"
diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml
index ebbcd8a21e..777d208656 100644
--- a/test/pytest/ci-template.yml
+++ b/test/pytest/ci-template.yml
@@ -5,7 +5,7 @@
     - k8s-default
   variables:
     CONDA_ENV: "hls4ml-testing"
-    EXTRA_DEPS: "[da,testing,testing-keras2,sr,optimization]"
+    EXTRA_DEPS: "[da,testing,testing-keras2,sr,optimization,xls]"
   before_script:
     - eval "$(conda shell.bash hook)"
     - conda activate "$CONDA_ENV"
@@ -49,4 +49,4 @@
   extends: .pytest
   variables:
     CONDA_ENV: "hls4ml-testing-keras3"
-    EXTRA_DEPS: "[da,testing,testing-keras3,sr]"
+    EXTRA_DEPS: "[da,testing,testing-keras3,sr,xls]"
diff --git a/test/pytest/synthesis_helpers.py b/test/pytest/synthesis_helpers.py
index 27d953b101..5d3a7989ea 100644
--- a/test/pytest/synthesis_helpers.py
+++ b/test/pytest/synthesis_helpers.py
@@ -147,8 +147,8 @@ def run_synthesis_test(config, hls_model, baseline_file_name, backend):
     if not config.get('run_synthesis', False):
         return
 
-    # Skip Quartus backend
-    if backend == 'Quartus':
+    # Skip Quartus and XLS backends
+    if backend in ['Quartus', 'XLS']:
         return
 
     # Run synthesis
diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py
index 19f2ed9d01..adbf36ebaf 100644
--- a/test/pytest/test_activations.py
+++ b/test/pytest/test_activations.py
@@ -13,7 +13,7 @@
 # Variable 'name' is simply used as an identifier for the activation
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')])
 @pytest.mark.parametrize(
     'activation, name',
@@ -52,6 +52,8 @@
 def test_activations(test_case_id, backend, activation, name, shape, io_type):
     if name == 'prelu' and shape == (8, 8, 3):
         return
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     # Subtract 0.5 to include negative values
     X = np.random.rand(1000, *shape) - 0.5
 
@@ -62,6 +64,13 @@ def test_activations(test_case_id, backend, activation, name, shape, io_type):
     hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend)
     output_dir = str(test_root_path / test_case_id)
 
+    # XLS uses a custom algorithm for determining lookup table boundaries,
+    # so we need to increase the table size for some activations
+    # (note that other backends use a hardcoded range [-8; 8]).
+    # See hls4ml/backends/xls/passes/build_tables.py
+    if backend == 'XLS' and name == 'softsign':
+        hls_config['LayerName']['activation_3']['TableSize'] = 2048
+
     hls_model = hls4ml.converters.convert_from_keras_model(
         keras_model, hls_config=hls_config, io_type=io_type, output_dir=output_dir, backend=backend
     )
diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py
index d3738c8461..284ca24f56 100644
--- a/test/pytest/test_auto_precision.py
+++ b/test/pytest/test_auto_precision.py
@@ -21,33 +21,60 @@
 
 test_root_path = Path(__file__).parent
 
-in_height = 10
-in_width = 12
-in_feat = 4
 
+# XLS tests are slow because of the large IR size, so the dimensions are reduced to make them faster.
+def in_height(backend):
+    # Input height: 8 for XLS, 10 for every other backend.
+    height = 8 if backend == 'XLS' else 10
+    return height
 
-@pytest.fixture(scope='module')
-def data_1d():
-    X = np.random.rand(100, in_feat)
+
+def in_width(backend):
+    # Input width: 8 for XLS, 12 for every other backend.
+    width = 8 if backend == 'XLS' else 12
+    return width
+
+
+def in_feat(backend):
+    # Number of input features: 2 for XLS, 4 for every other backend.
+    features = 2 if backend == 'XLS' else 4
+    return features
+
+
+def input_shape_1d(backend):
+    return (in_feat(backend),)  # 1-tuple: (features,)
+
+
+def input_shape_2d(backend):
+    return in_width(backend), in_feat(backend)  # (width, features)
+
+
+def input_shape_3d(backend):
+    return in_height(backend), in_width(backend), in_feat(backend)  # (height, width, features)
+
+
+@pytest.fixture()
+def data_1d(backend):
+    X = np.random.rand(100, *input_shape_1d(backend))
     return X
 
 
-@pytest.fixture(scope='module')
-def data_2d():
-    X = np.random.rand(100, in_width, in_feat)
+@pytest.fixture()
+def data_2d(backend):
+    X = np.random.rand(100, *input_shape_2d(backend))
     return X
 
 
-@pytest.fixture(scope='module')
-def data_3d():
-    X = np.random.rand(100, in_height, in_width, in_feat)
+@pytest.fixture()
+def data_3d(backend):
+    X = np.random.rand(100, *input_shape_3d(backend))
     return X
 
 
-@pytest.fixture(scope='module')
-def keras_model_dense():
+@pytest.fixture()
+def keras_model_dense(backend):
     model = Sequential()
-    model.add(Dense(8, activation='relu', input_shape=(in_feat,), name='first_layer'))
+    model.add(Dense(8, activation='relu', input_shape=input_shape_1d(backend), name='first_layer'))
     model.add(BatchNormalization(name='first_bn'))
     model.add(Dense(6, activation='relu', name='middle_layer'))
     model.add(BatchNormalization(name='middle_bn'))
@@ -56,10 +83,10 @@ def keras_model_dense():
     return model
 
 
-@pytest.fixture(scope='module')
-def keras_model_conv1d():
+@pytest.fixture()
+def keras_model_conv1d(backend):
     model = Sequential()
-    model.add(Conv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat)))
+    model.add(Conv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=input_shape_2d(backend)))
     model.add(AveragePooling1D(pool_size=2, name='first_pool'))
     model.add(ReLU(name='first_act'))
     model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer'))
@@ -70,12 +97,10 @@ def keras_model_conv1d():
     return model
 
 
-@pytest.fixture(scope='module')
-def keras_model_conv2d():
+@pytest.fixture()
+def keras_model_conv2d(backend):
     model = Sequential()
-    model.add(
-        Conv2D(8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat))
-    )
+    model.add(Conv2D(8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=input_shape_3d(backend)))
     model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool'))
     model.add(ReLU(name='first_act'))
     model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', name='middle_layer'))
@@ -86,10 +111,12 @@ def keras_model_conv2d():
     return model
 
 
-@pytest.fixture(scope='module')
-def keras_model_sepconv1d():
+@pytest.fixture()
+def keras_model_sepconv1d(backend):
     model = Sequential()
-    model.add(SeparableConv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat)))
+    model.add(
+        SeparableConv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=input_shape_2d(backend))
+    )
     model.add(AveragePooling1D(pool_size=2, name='first_pool'))
     model.add(ReLU(name='first_act'))
     model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer'))
@@ -100,13 +127,11 @@ def keras_model_sepconv1d():
     return model
 
 
-@pytest.fixture(scope='module')
-def keras_model_sepconv2d():
+@pytest.fixture()
+def keras_model_sepconv2d(backend):
     model = Sequential()
     model.add(
-        SeparableConv2D(
-            8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat)
-        )
+        SeparableConv2D(8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=input_shape_3d(backend))
     )
     model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool'))
     model.add(ReLU(name='first_act'))
@@ -119,11 +144,14 @@ def keras_model_sepconv2d():
 
 
 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'XLS'])
 @pytest.mark.parametrize('model_type', ['conv1d', 'conv2d'])
 def test_auto_precision_conv(
     test_case_id, keras_model_conv1d, keras_model_conv2d, data_2d, data_3d, model_type, io_type, backend
 ):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     if model_type == 'conv1d':
         model = keras_model_conv1d
         data = data_2d
@@ -218,8 +246,11 @@ def test_auto_precision_sepconv(
 
 
 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'XLS'])
 def test_auto_precision_dense(test_case_id, keras_model_dense, data_1d, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     model = keras_model_dense
     data = data_1d
 
diff --git a/test/pytest/test_binary_cnn.py b/test/pytest/test_binary_cnn.py
index f74b1ab3f1..d5667ca4c2 100644
--- a/test/pytest/test_binary_cnn.py
+++ b/test/pytest/test_binary_cnn.py
@@ -25,10 +25,17 @@
         ('Vitis', 'io_parallel', 'latency'),
         ('Vitis', 'io_stream', 'latency'),
         ('Vitis', 'io_stream', 'resource'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 def test_binary_cnn(test_case_id, backend, io_type, strategy):
-    x_in = Input(shape=(28, 28, 1))
+    if backend == 'XLS':
+        # XLS test is slow due to big IR size, we reduce dimensions to make it faster.
+        input_shape = (12, 12, 1)
+    else:
+        input_shape = (28, 28, 1)
+
+    x_in = Input(shape=input_shape)
 
     x = QConv2D(
         4,
@@ -94,7 +101,7 @@ def test_binary_cnn(test_case_id, backend, io_type, strategy):
         io_type=io_type,
     )
 
-    X = np.random.rand(100, 28, 28, 1)
+    X = np.random.rand(100, *input_shape)
     X = np.round(X * 2**10) * 2**-10
 
     hls_model.compile()
diff --git a/test/pytest/test_causalpadding.py b/test/pytest/test_causalpadding.py
index e000ad4dbf..49051c7c59 100644
--- a/test/pytest/test_causalpadding.py
+++ b/test/pytest/test_causalpadding.py
@@ -13,8 +13,11 @@
 
 
 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'XLS'])
 def test_causalpadding(test_case_id, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     model = Sequential()
     model.add(Conv1D(1, 5, padding='causal', input_shape=(100, 1)))
     model.compile()
diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py
index 85b58cf17c..74a0dc090e 100644
--- a/test/pytest/test_depthconv1d.py
+++ b/test/pytest/test_depthconv1d.py
@@ -32,6 +32,7 @@
         ('Vivado', 'io_stream', 'resource'),
         ('Vitis', 'io_stream', 'resource'),
         ('Catapult', 'io_stream', 'latency'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 @pytest.mark.parametrize('rf', rf_options)
diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py
index fe2dd5e6f8..b5bbfdd5dd 100644
--- a/test/pytest/test_depthconv2d.py
+++ b/test/pytest/test_depthconv2d.py
@@ -32,6 +32,7 @@
         ('Vivado', 'io_stream', 'resource'),
         ('Vitis', 'io_stream', 'resource'),
         ('Catapult', 'io_stream', 'latency'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 @pytest.mark.parametrize('rf', rf_options)
diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py
index 606f2bc51d..6c77c22f4a 100644
--- a/test/pytest/test_keras_api.py
+++ b/test/pytest/test_keras_api.py
@@ -26,9 +26,11 @@
 test_root_path = Path(__file__).parent
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_dense(test_case_id, backend, io_type, synthesis_config):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = tf.keras.models.Sequential()
     model.add(
         Dense(
@@ -95,9 +97,11 @@ def test_dense(test_case_id, backend, io_type, synthesis_config):
     ids=['relu', 'leaky_relu', 'elu', 'prelu', 'sigmoid'],
 )
 # ThresholdedReLU(theta=1.0)])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_activations(test_case_id, activation_function, backend, io_type, synthesis_config):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = tf.keras.models.Sequential()
     model.add(Dense(64, input_shape=(1,), name='Dense', kernel_initializer='lecun_uniform', kernel_regularizer=None))
     model.add(activation_function)
@@ -137,15 +141,24 @@ def test_activations(test_case_id, activation_function, backend, io_type, synthe
         ('Vitis', 'Latency'),
         ('Quartus', 'Resource'),
         ('oneAPI', 'Resource'),
+        ('XLS', 'Latency'),
     ],
 )
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv1d(test_case_id, padds, backend, strategy, io_type, synthesis_config):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'XLS':
+        # XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+        input_shape = (10, 32, 4)
+        filters = 8
+    else:
+        input_shape = (10, 128, 4)
+        filters = 32
     model = tf.keras.models.Sequential()
-    input_shape = (10, 128, 4)
     model.add(
         Conv1D(
-            filters=32,
+            filters=filters,
             kernel_size=3,
             strides=1,
             padding=padds,
@@ -159,7 +172,7 @@ def test_conv1d(test_case_id, padds, backend, strategy, io_type, synthesis_confi
     model.add(Activation(activation='relu'))
     model.compile(optimizer='adam', loss='mse')
 
-    X_input = np.random.rand(10, 128, 4)
+    X_input = np.random.rand(*input_shape)
     keras_prediction = model.predict(X_input)
 
     config = hls4ml.utils.config_from_keras_model(model)
@@ -222,15 +235,24 @@ def test_conv1d(test_case_id, padds, backend, strategy, io_type, synthesis_confi
         ('Vitis', 'Latency'),
         ('Quartus', 'Resource'),
         ('oneAPI', 'Resource'),
+        ('XLS', 'Latency'),
     ],
 )
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv2d(test_case_id, chans, padds, backend, strategy, io_type, synthesis_config):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'XLS':
+        # XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+        input_shape = (12, 12, 3)
+        filters = 6
+    else:
+        input_shape = (28, 28, 3)
+        filters = 32
     model = tf.keras.models.Sequential()
-    input_shape = (28, 28, 3)
     model.add(
         Conv2D(
-            filters=32,
+            filters=filters,
             kernel_size=(4, 4),
             strides=(4, 4),
             padding=padds,
@@ -407,7 +429,7 @@ def test_depthwise1d(test_case_id, backend, io_type, synthesis_config):
 )
 @pytest.mark.parametrize('padds', padds_options)
 @pytest.mark.parametrize('chans', chans_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_pooling(test_case_id, pooling, padds, chans, backend, synthesis_config):
     assert '1D' in pooling.__name__ or '2D' in pooling.__name__
 
diff --git a/test/pytest/test_keras_v3_api.py b/test/pytest/test_keras_v3_api.py
index 21f90d5d60..f4f661fb56 100644
--- a/test/pytest/test_keras_v3_api.py
+++ b/test/pytest/test_keras_v3_api.py
@@ -29,9 +29,11 @@
 test_root_path = Path(__file__).parent
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_dense(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = keras.Sequential(
         [
             Dense(
@@ -91,9 +93,11 @@ def test_dense(test_case_id, backend, io_type):
         Activation(activation='sigmoid', name='sigmoid'),
     ],
 )
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_activations(test_case_id, activation_function, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = keras.models.Sequential()
     model.add(Dense(64, input_shape=(1,), name='Dense', kernel_initializer='lecun_uniform', kernel_regularizer=None))
     model.add(activation_function)
@@ -125,15 +129,23 @@ def test_activations(test_case_id, activation_function, backend, io_type):
 
 
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 @pytest.mark.parametrize('activation', ['elu', 'relu'])
 def test_conv1d(test_case_id, padds, backend, io_type, activation):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'XLS':
+        # XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+        input_shape = (10, 32, 4)
+        filters = 8
+    else:
+        input_shape = (10, 128, 4)
+        filters = 32
     model = keras.models.Sequential()
-    input_shape = (10, 128, 4)
     model.add(
         Conv1D(
-            filters=32,
+            filters=filters,
             kernel_size=3,
             strides=2,
             padding=padds,
@@ -148,7 +160,7 @@ def test_conv1d(test_case_id, padds, backend, io_type, activation):
     model.add(Activation(activation='relu'))
     model.compile(optimizer='adam', loss='mse')
 
-    X_input = np.random.rand(10, 128, 4)
+    X_input = np.random.rand(*input_shape)
     keras_prediction = model.predict(X_input, verbose=0)  # type: ignore
 
     config = hls4ml.utils.config_from_keras_model(model)
@@ -204,15 +216,23 @@ def test_conv1d(test_case_id, padds, backend, io_type, activation):
 
 @pytest.mark.parametrize('chans', chans_options)
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv2d(test_case_id, chans, padds, backend, io_type):
-    input_shape = (32, 32, 3)
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'XLS':
+        # XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+        input_shape = (16, 16, 3)
+        filters = 6
+    else:
+        input_shape = (32, 32, 3)
+        filters = 32
     model = keras.Sequential(
         [
             keras.layers.InputLayer(input_shape),
             Conv2D(
-                filters=32,
+                filters=filters,
                 kernel_size=(2, 3),
                 strides=(4, 5),
                 padding=padds,
@@ -298,15 +318,23 @@ def test_conv2d(test_case_id, chans, padds, backend, io_type):
         assert hls_conv_attr['pad_right'] == 0
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel'])
 def test_depthwise2d(test_case_id, backend, io_type):
     """
     Test proper handling of DepthwiseConv2D
     """
-    X = np.random.rand(10, 32, 32, 3)
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'XLS':
+        # XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+        input_shape = (8, 8, 3)
+    else:
+        input_shape = (32, 32, 3)
+
+    X = np.random.rand(10, *input_shape)
     X = np.round(X * 2**10) * 2**-10  # make it an exact ap_fixed<16,6>
-    model = keras.models.Sequential([keras.layers.Input((32, 32, 3)), DepthwiseConv2D(kernel_size=(3, 3))])
+    model = keras.models.Sequential([keras.layers.Input(input_shape), DepthwiseConv2D(kernel_size=(3, 3))])
     model.compile()
 
     config = hls4ml.utils.config_from_keras_model(
@@ -355,7 +383,7 @@ def test_depthwise1d(test_case_id, backend, io_type):
 @pytest.mark.parametrize('pooling', pooling_layers)
 @pytest.mark.parametrize('padds', padds_options)
 @pytest.mark.parametrize('chans', chans_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult', 'XLS'])
 def test_pooling(test_case_id, pooling, padds, chans, backend):
     assert '1D' in pooling.__name__ or '2D' in pooling.__name__
 
@@ -476,9 +504,11 @@ def test_pooling(test_case_id, pooling, padds, chans, backend):
     #         assert hls_pool.attributes['pad_right'] == 0
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_reused_layer(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     inp1 = keras.layers.Input(shape=(10, 10))
     inp2 = keras.layers.Input(shape=(10, 10))
 
diff --git a/test/pytest/test_merge.py b/test/pytest/test_merge.py
index a0660c5893..9b5e1095bc 100644
--- a/test/pytest/test_merge.py
+++ b/test/pytest/test_merge.py
@@ -12,9 +12,12 @@
 
 @pytest.mark.parametrize('merge_layer', [Add, Average, Maximum, Minimum, Multiply, Subtract])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('swap_inputs', [True, False])
 def test_merge(test_case_id, merge_layer, io_type, backend, swap_inputs):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     input_shape = (10, 10, 3)
 
     in1 = Input(shape=input_shape, name='inp1')
@@ -47,8 +50,11 @@ def test_merge(test_case_id, merge_layer, io_type, backend, swap_inputs):
 
 @pytest.mark.parametrize('axes', [1])
 @pytest.mark.parametrize('io_type', ['io_parallel'])  # No io_stream implementation yet
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_dot(test_case_id, axes, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     # Only 1D implemented
     input_shape = (10,)
 
@@ -76,8 +82,11 @@ def test_dot(test_case_id, axes, io_type, backend):
 
 
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_concatenate1d(test_case_id, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     input_shape1 = (10,)
     input_shape2 = (8,)
 
@@ -106,8 +115,11 @@ def test_concatenate1d(test_case_id, io_type, backend):
 
 @pytest.mark.parametrize('axis', [1, 2])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_concatenate2d(test_case_id, axis, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     input_shape1 = [10, 3]
     input_shape2 = [10, 4]
 
@@ -139,8 +151,11 @@ def test_concatenate2d(test_case_id, axis, io_type, backend):
 
 @pytest.mark.parametrize('axis', [1, 2, 3])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_concatenate3d(test_case_id, axis, io_type, backend):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     input_shape1 = [10, 10, 3]
     input_shape2 = [10, 10, 4]
 
diff --git a/test/pytest/test_multi_dense.py b/test/pytest/test_multi_dense.py
index 04c21f8923..bf67dc3448 100644
--- a/test/pytest/test_multi_dense.py
+++ b/test/pytest/test_multi_dense.py
@@ -21,11 +21,15 @@
         ('oneAPI', 'Resource'),
         ('Catapult', 'Latency'),
         ('Catapult', 'Resource'),
+        ('XLS', 'Latency'),
     ],
 )
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 @pytest.mark.parametrize('shape', [(4, 3), (4, 1), (2, 3, 2), (1, 3, 1)])
 def test_multi_dense(test_case_id, backend, strategy, io_type, shape):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     model = tf.keras.models.Sequential()
     model.add(Dense(7, input_shape=shape, activation='relu'))
     model.add(Dense(2, activation='relu'))
diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py
index bc7aaa88e2..50a2785905 100644
--- a/test/pytest/test_pointwiseconv.py
+++ b/test/pytest/test_pointwiseconv.py
@@ -37,14 +37,21 @@
         ('Vitis', 'io_stream', 'resource', 1),
         ('Catapult', 'io_stream', 'latency', 1),
         ('Catapult', 'io_stream', 'resource', 1),
+        ('XLS', 'io_parallel', 'latency', 1),
     ],
 )
 def test_pointwiseconv1d(test_case_id, chans, padds, strides, backend, io_type, strategy, rf):
     model = tf.keras.models.Sequential()
     input_shape = (28, 3)
+    filters = 32
+    # The XLS test is slow because of the large IR size, so we reduce the dimensions to make it faster.
+    if backend == 'XLS':
+        input_shape = (14, 3)
+        filters = 8
+
     model.add(
         Conv1D(
-            filters=32,
+            filters=filters,
             kernel_size=(1,),
             strides=strides,
             padding=padds,
@@ -72,7 +79,7 @@ def test_pointwiseconv1d(test_case_id, chans, padds, strides, backend, io_type,
     hls_model.compile()
     hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape)
 
-    if not (backend in ['Quartus', 'oneAPI'] and io_type == 'io_stream'):
+    if backend != 'XLS' and not (backend in ['Quartus', 'oneAPI'] and io_type == 'io_stream'):
         # Quartus io_stream does not currently have a special pointwise implementation
         assert 'Pointwise' in list(hls_model.graph.values())[1].class_name
     np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001)
@@ -94,14 +101,22 @@ def test_pointwiseconv1d(test_case_id, chans, padds, strides, backend, io_type,
         ('Vivado', 'io_stream', 'resource'),
         ('Catapult', 'io_stream', 'latency'),
         ('Catapult', 'io_stream', 'resource'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 def test_pointwiseconv2d(test_case_id, chans, padds, strides, backend, io_type, strategy):
     model = tf.keras.models.Sequential()
     input_shape = (28, 28, 3)
+    filters = 32
+
+    # The XLS test is slow because of the large IR size, so we reduce the dimensions to make it faster.
+    if backend == 'XLS':
+        input_shape = (14, 14, 3)
+        filters = 8
+
     model.add(
         Conv2D(
-            filters=32,
+            filters=filters,
             kernel_size=(1, 1),
             strides=strides,
             padding=padds,
@@ -129,7 +144,7 @@ def test_pointwiseconv2d(test_case_id, chans, padds, strides, backend, io_type,
     hls_model.compile()
     hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape)
 
-    if not (backend in ['Quartus', 'oneAPI'] and io_type == 'io_stream'):
+    if backend != 'XLS' and not (backend in ['Quartus', 'oneAPI'] and io_type == 'io_stream'):
         # Quartus io_stream does not currently have a special pointwise implementation
         assert 'Pointwise' in list(hls_model.graph.values())[1].class_name
     np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001)
diff --git a/test/pytest/test_pooling.py b/test/pytest/test_pooling.py
index 07e40e340e..f4822e60ef 100644
--- a/test/pytest/test_pooling.py
+++ b/test/pytest/test_pooling.py
@@ -9,31 +9,47 @@
 
 test_root_path = Path(__file__).parent
 
-in_shape = 124
-in_filt = 5
 atol = 5e-3
 
 
-@pytest.fixture(scope='module')
-def data_1d():
-    return np.random.rand(100, in_shape, in_filt)
+# XLS tests are slow because of the large IR size, so we reduce the dimensions to make them faster.
+def in_shape(backend):
+    if backend == 'XLS':
+        return 17
+    return 124
 
 
-@pytest.fixture(scope='module')
-def keras_model_1d(request):
+def in_filt(backend):
+    if backend == 'XLS':
+        return 3
+    return 5
+
+
+def input_shape_1d(backend):
+    return (in_shape(backend), in_filt(backend))
+
+
+@pytest.fixture()
+def data_1d(backend):
+    return np.random.rand(100, *input_shape_1d(backend))
+
+
+@pytest.fixture()
+def keras_model_1d(request, backend):
     model_type = request.param['model_type']
     pads = request.param['padding']
     strides = request.param.get('strides', None)
+    input_shape = input_shape_1d(backend)
     model = Sequential()
     if model_type == 'avg':
-        model.add(AveragePooling1D(pool_size=3, input_shape=(in_shape, in_filt), padding=pads, strides=strides))
+        model.add(AveragePooling1D(pool_size=3, input_shape=input_shape, padding=pads, strides=strides))
     elif model_type == 'max':
-        model.add(MaxPooling1D(pool_size=3, input_shape=(in_shape, in_filt), padding=pads))
+        model.add(MaxPooling1D(pool_size=3, input_shape=input_shape, padding=pads))
     model.compile()
     return model, model_type, pads, strides
 
 
-@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize(
     'keras_model_1d',
     [
@@ -109,26 +125,31 @@ def test_pool1d_stream(test_case_id, backend, keras_model_1d, data_1d, io_type):
     np.testing.assert_allclose(y_keras, y_hls, rtol=0, atol=atol, verbose=True)
 
 
-@pytest.fixture(scope='module')
-def data_2d():
-    return np.random.rand(100, in_shape, in_shape, in_filt)
+def input_shape_2d(backend):
+    return (in_shape(backend), in_shape(backend), in_filt(backend))
+
+
+@pytest.fixture()
+def data_2d(backend):
+    return np.random.rand(100, *input_shape_2d(backend))
 
 
-@pytest.fixture(scope='module')
-def keras_model_2d(request):
+@pytest.fixture()
+def keras_model_2d(request, backend):
     model_type = request.param['model_type']
     pads = request.param['padding']
     strides = request.param.get('strides', None)
+    input_shape = input_shape_2d(backend)
     model = Sequential()
     if model_type == 'avg':
-        model.add(AveragePooling2D(input_shape=(in_shape, in_shape, in_filt), padding=pads, strides=strides))
+        model.add(AveragePooling2D(input_shape=input_shape, padding=pads, strides=strides))
     elif model_type == 'max':
-        model.add(MaxPooling2D(input_shape=(in_shape, in_shape, in_filt), padding=pads, strides=strides))
+        model.add(MaxPooling2D(input_shape=input_shape, padding=pads, strides=strides))
     model.compile()
     return model, model_type, pads, strides
 
 
-@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize(
     'keras_model_2d',
     [
diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py
index 4860b5aa20..6a5cb111fd 100644
--- a/test/pytest/test_pytorch_api.py
+++ b/test/pytest/test_pytorch_api.py
@@ -22,9 +22,11 @@ def forward(self, x):
         return self.linear(x)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_linear(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = LinearModel()
     model.eval()
 
@@ -74,9 +76,11 @@ def test_linear(test_case_id, backend, io_type):
     ],
     ids=['softmax', 'relu', 'tanh', 'leaky_relu', 'elu', 'prelu', 'sigmoid', 'threshold'],
 )
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_activations(test_case_id, activation_function, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = torch.nn.Sequential(nn.Linear(1, 1), activation_function).to()
     model.eval()
 
@@ -181,9 +185,11 @@ def forward(self, x):
     ],
     ids=['softmax', 'relu', 'tanh', 'leaky_relu', 'elu', 'sigmoid', 'threshold'],
 )
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_activation_functionals(test_case_id, activation_function, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = activation_function
     model.eval()
 
@@ -215,9 +221,11 @@ def test_activation_functionals(test_case_id, activation_function, backend, io_t
 
 
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv1d(test_case_id, padds, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     n_in = 2
     n_out = 2
     kernel_size = 3
@@ -322,9 +330,11 @@ def test_conv1d(test_case_id, padds, backend, io_type):
 
 
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_conv2d(test_case_id, padds, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     n_in = 2
     n_out = 2
     kernel_size = 3
@@ -477,7 +487,7 @@ def test_conv2d(test_case_id, padds, backend, io_type):
 
 @pytest.mark.parametrize('pooling', pooling_layers, ids=['MaxPool1d', 'MaxPool2d', 'AvgPool1d', 'AvgPool2d'])
 @pytest.mark.parametrize('padds', padds_options)
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_pooling(test_case_id, pooling, padds, backend):
     assert '1d' in pooling.__name__ or '2d' in pooling.__name__
 
@@ -597,9 +607,11 @@ def forward(self, x):
         return self.bn(x)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_bn(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = BatchNormModel()
     model.eval()
 
@@ -638,6 +650,7 @@ def forward(self, x):
         return x
 
 
+# TODO: this test fails for XLS due to a PyTorch weight shape mismatch.
 @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_squeeze(test_case_id, backend, io_type):
@@ -673,7 +686,7 @@ def test_squeeze(test_case_id, backend, io_type):
         assert list(hls_model.get_layers())[3].attributes['target_shape'] == [3]
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 def test_flatten(test_case_id, backend):
     input = torch.randn(1, 1, 5, 5)
     model = nn.Sequential(nn.Conv2d(1, 32, 5, 1, 1), nn.Flatten(), nn.ReLU())
@@ -717,9 +730,11 @@ def forward(self, x):
         return x
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_skipped_layers(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     model = ModelSkippedLayers()
     model.eval()
 
@@ -750,7 +765,7 @@ def test_skipped_layers(test_case_id, backend, io_type):
     np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel'])  # Only io_parallel for now
 @pytest.mark.parametrize('tensor_rank', [2, 3])
 def test_remove_transpose(test_case_id, backend, io_type, tensor_rank):
@@ -817,9 +832,12 @@ def forward(self, x):
     np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_view(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     class TestModel(nn.Module):
         def __init__(self, n_in, n_out, size_in):
             super().__init__()
diff --git a/test/pytest/test_reshape.py b/test/pytest/test_reshape.py
index 45b71c29b7..25fd6418c9 100755
--- a/test/pytest/test_reshape.py
+++ b/test/pytest/test_reshape.py
@@ -20,9 +20,12 @@ def randX_20_10():
     return randX(20, 10)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult', 'oneAPI'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult', 'oneAPI', 'XLS'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_reshape_parallel(test_case_id, randX_20_10, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+
     model = tf.keras.models.Sequential(
         [
             tf.keras.layers.Input(shape=(10,)),
diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py
index 1dd08aa608..9ac8f3a93f 100644
--- a/test/pytest/test_sepconv1d.py
+++ b/test/pytest/test_sepconv1d.py
@@ -32,6 +32,7 @@
         ('Vivado', 'io_stream', 'resource'),
         ('Vitis', 'io_stream', 'resource'),
         ('Catapult', 'io_stream', 'latency'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 @pytest.mark.parametrize('rf', rf_options)
diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py
index 98ecbbc83e..d38d330487 100644
--- a/test/pytest/test_sepconv2d.py
+++ b/test/pytest/test_sepconv2d.py
@@ -32,16 +32,23 @@
         ('Vivado', 'io_stream', 'resource'),
         ('Vitis', 'io_stream', 'resource'),
         ('Catapult', 'io_stream', 'latency'),
+        ('XLS', 'io_parallel', 'latency'),
     ],
 )
 @pytest.mark.parametrize('rf', rf_options)
 @pytest.mark.parametrize('input_size', input_size_options)
 def test_sepconv2d(test_case_id, chans, padds, strides, kernels, bias, io_type, backend, strategy, rf, input_size):
+    if backend == 'XLS':
+        # The XLS test is slow because of the large IR size, so we reduce the dimensions to make it faster.
+        input_shape = (8, 8, input_size)
+        filters = 4
+    else:
+        input_shape = (16, 16, input_size)
+        filters = 8
     model = tf.keras.models.Sequential()
-    input_shape = (16, 16, input_size)
     model.add(
         tf.keras.layers.SeparableConv2D(
-            filters=8,
+            filters=filters,
             kernel_size=kernels,
             strides=strides,
             padding=padds,
diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py
index 418f64b558..c7613c6392 100644
--- a/test/pytest/test_softmax.py
+++ b/test/pytest/test_softmax.py
@@ -11,16 +11,18 @@
 
 
 @pytest.fixture()
-def generate_data(input_shape):
+def generate_data(input_shape, implementation):
     shape = (5000, *input_shape)
     d = np.random.normal(0, 2, shape)
     modify_entries = np.random.randint(0, 1, shape) < 0.05
     d[modify_entries] = d[modify_entries] * 5 + 10
-    return np.clip(d, -32, 31)
+    clip_min = -32
+    clip_max = 0 if implementation == 'latency' else 31
+    return np.clip(d, clip_min, clip_max)
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult'])
-@pytest.mark.parametrize('strategy', ['stable', 'latency', 'argmax'])
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult', 'XLS'])
+@pytest.mark.parametrize('implementation', ['stable', 'latency', 'argmax'])
 @pytest.mark.parametrize(
     'input_bits,input_shape,table_bits,io_type,custom_accum',
     [
@@ -35,7 +37,14 @@ def generate_data(input_shape):
         ('16,6', (8, 8, 3), '18,8', 'io_stream', False),
     ],
 )
-def test_softmax(test_case_id, backend, strategy, generate_data, input_bits, input_shape, table_bits, io_type, custom_accum):
+def test_softmax(
+    test_case_id, backend, implementation, generate_data, input_bits, input_shape, table_bits, io_type, custom_accum
+):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
+    if backend == 'Catapult' and implementation == 'argmax':
+        pytest.skip('Catapult backend does not support argmax implementation')
+
     X = generate_data
     model = tf.keras.models.Sequential()
     model.add(tf.keras.layers.Activation(input_shape=input_shape, activation='softmax', name='softmax'))
@@ -44,7 +53,7 @@ def test_softmax(test_case_id, backend, strategy, generate_data, input_bits, inp
     table_type = f'fixed<{table_bits}, RND, SAT>'
 
     cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend)
-    cfg['LayerName']['softmax']['Strategy'] = strategy
+    cfg['LayerName']['softmax']['implementation'] = implementation
     cfg['LayerName']['softmax']['inv_table_t'] = table_type
     cfg['LayerName']['softmax']['exp_table_t'] = table_type
     cfg['LayerName']['softmax']['accum_t'] = table_type
@@ -76,6 +85,8 @@ def test_softmax(test_case_id, backend, strategy, generate_data, input_bits, inp
 @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult'])
 @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
 def test_softmax_skipped(test_case_id, backend, io_type):
+    if backend == 'XLS' and io_type != 'io_parallel':
+        pytest.skip(f'XLS backend only supports IOType: io_parallel, but got: {io_type}')
     X = np.random.rand(100, 10)
     dense = tf.keras.layers.Dense(14, input_shape=(10,), name='dense')
     softmax = tf.keras.layers.Activation(activation='softmax', name='softmax')