diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index c6ec83a453..05c348f117 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,65 @@ extern "C"
q15_t * bufferA,
q7_t * bufferB);
+  /**
+   * @brief Basic Q7 convolution function (1D)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension (forwarded as the x dimension)
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension (forwarded as the x dimension)
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The status returned by arm_convolve_HWC_q7_basic_nonsquare
+   */
+static inline arm_status arm_convolve_HWC_q7_basic_1d(
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    /* The 1-D case is forwarded as a 2-D convolution over a single row: every y extent is 1. */
+    return arm_convolve_HWC_q7_basic_nonsquare(
+        Im_in,        /* Im_in */
+        dim_im_in,    /* dim_im_in_x */
+        1,            /* dim_im_in_y */
+        ch_im_in,     /* ch_im_in */
+        wt,           /* wt */
+        ch_im_out,    /* ch_im_out */
+        dim_kernel,   /* dim_kernel_x */
+        1,            /* dim_kernel_y */
+        padding,      /* padding_x */
+        0,            /* padding_y: must be 0, a padded row would push the 1-row kernel off the single input row */
+        stride,       /* stride_x */
+        1,            /* stride_y */
+        bias,         /* bias */
+        bias_shift,   /* bias_shift */
+        out_shift,    /* out_shift */
+        Im_out,       /* Im_out */
+        dim_im_out,   /* dim_im_out_x */
+        1,            /* dim_im_out_y */
+        bufferA,      /* bufferA */
+        bufferB);     /* bufferB */
+}
+
/**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
@@ -335,6 +394,71 @@ extern "C"
q15_t * bufferA,
q7_t * bufferB);
+  /**
+   * @brief Fast Q7 convolution function (1D)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension (forwarded as the x dimension)
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension (forwarded as the x dimension)
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The status returned by arm_convolve_HWC_q7_fast_nonsquare, i.e. either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * This function is the version with full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is multiple of 4
+   *   ch_im_out is multiple of 2
+   */
+static inline arm_status arm_convolve_HWC_q7_fast_1d(
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    /* The 1-D case is forwarded as a 2-D convolution over a single row: every y extent is 1. */
+    return arm_convolve_HWC_q7_fast_nonsquare(
+        Im_in,        /* Im_in */
+        dim_im_in,    /* dim_im_in_x */
+        1,            /* dim_im_in_y */
+        ch_im_in,     /* ch_im_in */
+        wt,           /* wt */
+        ch_im_out,    /* ch_im_out */
+        dim_kernel,   /* dim_kernel_x */
+        1,            /* dim_kernel_y */
+        padding,      /* padding_x */
+        0,            /* padding_y: must be 0, a padded row would push the 1-row kernel off the single input row */
+        stride,       /* stride_x */
+        1,            /* stride_y */
+        bias,         /* bias */
+        bias_shift,   /* bias_shift */
+        out_shift,    /* out_shift */
+        Im_out,       /* Im_out */
+        dim_im_out,   /* dim_im_out_x */
+        1,            /* dim_im_out_y */
+        bufferA,      /* bufferA */
+        bufferB);     /* bufferB */
+}
+
/**
* @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
* @param[in] Im_in pointer to input tensor
@@ -502,7 +626,7 @@ extern "C"
*
* Buffer size:
*
- * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+ * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
*
* bufferB size: 0
*
@@ -535,7 +659,83 @@ extern "C"
const uint16_t dim_im_out_y,
q15_t * bufferA,
q7_t * bufferB);
-
+
+
+  /**
+   * @brief Fast Q15 convolution function (1D)
+   * @param[in]       Im_in       pointer to input tensor (Q15)
+   * @param[in]       dim_im_in   input tensor dimension (forwarded as the x dimension)
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights (Q15)
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias (Q15)
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor (Q15)
+   * @param[in]       dim_im_out  output tensor dimension (forwarded as the x dimension)
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The status returned by arm_convolve_HWC_q15_fast_nonsquare, i.e. either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 2*ch_im_in*dim_kernel (dim_kernel_y is fixed to 1 by this wrapper)
+   *
+   * bufferB size: 0
+   *
+   * <b>Input dimension constraints:</b>
+   *
+   * ch_im_in is multiple of 2
+   *
+   * ch_im_out is multiple of 2
+   */
+static inline arm_status arm_convolve_HWC_q15_fast_1d(
+    const q15_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q15_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q15_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q15_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    /* Tensor pointers are q15_t so they match the Q15 kernel; y extents are 1 for the single row. */
+    return arm_convolve_HWC_q15_fast_nonsquare(
+        Im_in,        /* Im_in */
+        dim_im_in,    /* dim_im_in_x */
+        1,            /* dim_im_in_y */
+        ch_im_in,     /* ch_im_in */
+        wt,           /* wt */
+        ch_im_out,    /* ch_im_out */
+        dim_kernel,   /* dim_kernel_x */
+        1,            /* dim_kernel_y */
+        padding,      /* padding_x */
+        0,            /* padding_y: must be 0, a padded row would push the 1-row kernel off the single input row */
+        stride,       /* stride_x */
+        1,            /* stride_y */
+        bias,         /* bias */
+        bias_shift,   /* bias_shift */
+        out_shift,    /* out_shift */
+        Im_out,       /* Im_out */
+        dim_im_out,   /* dim_im_out_x */
+        1,            /* dim_im_out_y */
+        bufferA,      /* bufferA */
+        bufferB);     /* bufferB */
+}
+
+
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
@@ -629,7 +829,6 @@ extern "C"
q15_t * bufferA,
q7_t * bufferB);
-
/**
* @defgroup FC Fully-connected Layer Functions
*
@@ -949,6 +1148,64 @@ extern "C"
q7_t * bufferA,
q7_t * Im_out);
+ /**
+ * @brief Q7 max pooling function (non-square shape)
+ * @param[in, out] Im_in pointer to input tensor
+ * @param[in] dim_im_in_x input tensor dimension along X axis
+ * @param[in] dim_im_in_y input tensor dimension along Y axis
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel_x pooling window size along X axis
+ * @param[in] dim_kernel_y pooling window size along Y axis
+ * @param[in] padding_x padding size along X axis
+ * @param[in] padding_y padding size along Y axis
+ * @param[in] stride_x pooling stride along X axis
+ * @param[in] stride_y pooling stride along Y axis
+ * @param[in] dim_im_out_x output tensor dimension along X axis
+ * @param[in] dim_im_out_y output tensor dimension along Y axis
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ */
+
+void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+ const uint16_t dim_im_in_x,
+ const uint16_t dim_im_in_y,
+ const uint16_t ch_im_in,
+ const uint16_t dim_kernel_x,
+ const uint16_t dim_kernel_y,
+ const uint16_t padding_x,
+ const uint16_t padding_y,
+ const uint16_t stride_x,
+ const uint16_t stride_y,
+ const uint16_t dim_im_out_x,
+ const uint16_t dim_im_out_y,
+ q7_t * bufferA,
+ q7_t * Im_out);
+
+ /**
+ * @brief Q7 1-D max pooling function
+ * @param[in, out] Im_in pointer to input tensor
+ * @param[in] dim_im_in input tensor dimension
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel pooling window size
+ * @param[in] padding padding size
+ * @param[in] stride pooling stride
+ * @param[in] dim_im_out output tensor dimension
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input tensor
+ const uint16_t dim_im_in, // input tensor dimension
+ const uint16_t ch_im_in, // number of input tensor channels
+ const uint16_t dim_kernel, // pooling window size
+ const uint16_t padding, // padding size
+ const uint16_t stride, // pooling stride
+ const uint16_t dim_im_out, // output tensor dimension
+ q7_t * bufferA, // a buffer for local storage
+ q7_t * Im_out);
+
/**
* @brief Q7 average pooling function
* @param[in] Im_in pointer to input tensor
@@ -974,6 +1231,65 @@ extern "C"
q7_t * bufferA,
q7_t * Im_out);
+ /**
+ * @brief Q7 average pooling function (non-square shape)
+ * @param[in, out] Im_in pointer to input tensor
+ * @param[in] dim_im_in_x input tensor dimension along X axis
+ * @param[in] dim_im_in_y input tensor dimension along Y axis
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel_x pooling window size along X axis
+ * @param[in] dim_kernel_y pooling window size along Y axis
+ * @param[in] padding_x padding size along X axis
+ * @param[in] padding_y padding size along Y axis
+ * @param[in] stride_x pooling stride along X axis
+ * @param[in] stride_y pooling stride along Y axis
+ * @param[in] dim_im_out_x output tensor dimension along X axis
+ * @param[in] dim_im_out_y output tensor dimension along Y axis
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ */
+
+void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+ const uint16_t dim_im_in_x,
+ const uint16_t dim_im_in_y,
+ const uint16_t ch_im_in,
+ const uint16_t dim_kernel_x,
+ const uint16_t dim_kernel_y,
+ const uint16_t padding_x,
+ const uint16_t padding_y,
+ const uint16_t stride_x,
+ const uint16_t stride_y,
+ const uint16_t dim_im_out_x,
+ const uint16_t dim_im_out_y,
+ q7_t * bufferA,
+ q7_t * Im_out);
+
+ /**
+ * @brief Q7 1-D average pooling function
+ * @param[in, out] Im_in pointer to input tensor
+ * @param[in] dim_im_in input tensor dimension
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel pooling window size
+ * @param[in] padding padding size
+ * @param[in] stride pooling stride
+ * @param[in] dim_im_out output tensor dimension
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input tensor
+ const uint16_t dim_im_in, // input tensor dimension
+ const uint16_t ch_im_in, // number of input tensor channels
+ const uint16_t dim_kernel, // pooling window size
+ const uint16_t padding, // padding size
+ const uint16_t stride, // pooling stride
+ const uint16_t dim_im_out, // output tensor dimension
+ q7_t * bufferA, // a buffer for local storage
+ q7_t * Im_out);
+
+
/**
* @defgroup Softmax Softmax Functions
*
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9a4adb29ab..2292e181ff 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -94,3 +94,163 @@ void arm_maxpool_q7_HWC_ref(const q7_t * Im_in, // input image
}
}
}
+
+void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    /* Reference average pooling: mean of the in-bounds samples of each window; bufferA is not referenced. */
+    int16_t i_ch_in, i_x, i_y, k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int sum = 0;
+                int count = 0;      /* number of samples that were inside the input */
+                int16_t y_start = i_y * stride_y - padding_y;   /* may be negative inside the padding */
+                int16_t x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
+                {
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum/count;
+            }
+        }
+    }
+}
+
+
+void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    /* Reference max pooling: largest in-bounds sample of each window; bufferA is not referenced. */
+    int16_t ch, out_x, out_y, win_x, win_y;
+    for (ch = 0; ch < ch_im_in; ch++)
+    {
+        for (out_y = 0; out_y < dim_im_out_y; out_y++)
+        {
+            for (out_x = 0; out_x < dim_im_out_x; out_x++)
+            {
+                int best = -129;    /* starting value of the running maximum */
+                int16_t top = out_y * stride_y - padding_y;
+                int16_t left = out_x * stride_x - padding_x;
+                for (win_y = top; win_y < top + dim_kernel_y; win_y++)
+                {
+                    for (win_x = left; win_x < left + dim_kernel_x; win_x++)
+                    {
+                        if (win_y >= 0 && win_x >= 0 && win_y < dim_im_in_y && win_x < dim_im_in_x)
+                        {
+                            const q7_t sample = Im_in[ch + ch_im_in * (win_x + win_y * dim_im_in_x)];
+                            if (sample > best)
+                            {
+                                best = sample;
+                            }
+                        }
+                    }
+                }
+                Im_out[ch + ch_im_in * (out_x + out_y * dim_im_out_x)] = best;
+            }
+        }
+    }
+}
+
+
+void
+arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in,           // input tensor
+                          const uint16_t dim_im_in,     // input length
+                          const uint16_t ch_im_in,      // number of channels
+                          const uint16_t dim_kernel,    // pooling window size
+                          const uint16_t padding,       // padding size
+                          const uint16_t stride,        // pooling stride
+                          const uint16_t dim_im_out,    // output length
+                          q7_t * bufferA,               // not referenced by this reference code
+                          q7_t * Im_out) {
+    /* Reference 1-D average pooling: mean of the in-range taps of each window. */
+    int16_t ch, out_pos, tap;
+    for (ch = 0; ch < ch_im_in; ch++)
+    {
+        for (out_pos = 0; out_pos < dim_im_out; out_pos++)
+        {
+            int total = 0;
+            int n_taps = 0;
+            int16_t first = out_pos * stride - padding;
+            for (tap = first; tap < first + dim_kernel; tap++)
+            {
+                if (tap < 0 || tap >= dim_im_in)
+                {
+                    continue;   /* tap falls outside the input, i.e. in the padding */
+                }
+                total += Im_in[ch + ch_im_in * tap];
+                n_taps++;
+            }
+            Im_out[ch + ch_im_in * out_pos] = total / n_taps;
+        }
+    }
+}
+
+void
+arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in,           // input tensor
+                          const uint16_t dim_im_in,     // input length
+                          const uint16_t ch_im_in,      // number of channels
+                          const uint16_t dim_kernel,    // pooling window size
+                          const uint16_t padding,       // padding size
+                          const uint16_t stride,        // pooling stride
+                          const uint16_t dim_im_out,    // output length
+                          q7_t * bufferA,               // not referenced by this reference code
+                          q7_t * Im_out) {
+    /* Reference 1-D max pooling: largest in-range tap of each window. */
+    int16_t ch, out_pos, tap;
+    for (ch = 0; ch < ch_im_in; ch++)
+    {
+        for (out_pos = 0; out_pos < dim_im_out; out_pos++)
+        {
+            int best = -129;    /* starting value of the running maximum */
+            int16_t first = out_pos * stride - padding;
+            for (tap = first; tap < first + dim_kernel; tap++)
+            {
+                if (tap >= 0 && tap < dim_im_in)
+                {
+                    const q7_t sample = Im_in[ch + ch_im_in * tap];
+                    if (sample > best)
+                    {
+                        best = sample;
+                    }
+                }
+            }
+            Im_out[ch + ch_im_in * out_pos] = best;
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index 4a0647a9bd..c1cee95846 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -229,6 +229,39 @@ extern "C"
q7_t * bufferA, // a buffer for local storage
q7_t * Im_out);
+    /* Reference pooling prototypes: non-square 2-D variants, then the 1-D variants used by the 1-D pooling test. */
+    void arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                          const uint16_t dim_im_in_x, const uint16_t dim_im_in_y,
+                                          const uint16_t ch_im_in,
+                                          const uint16_t dim_kernel_x, const uint16_t dim_kernel_y,
+                                          const uint16_t padding_x, const uint16_t padding_y,
+                                          const uint16_t stride_x, const uint16_t stride_y,
+                                          const uint16_t dim_im_out_x, const uint16_t dim_im_out_y,
+                                          q7_t * bufferA,
+                                          q7_t * Im_out);
+
+    void arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                          const uint16_t dim_im_in_x, const uint16_t dim_im_in_y,
+                                          const uint16_t ch_im_in,
+                                          const uint16_t dim_kernel_x, const uint16_t dim_kernel_y,
+                                          const uint16_t padding_x, const uint16_t padding_y,
+                                          const uint16_t stride_x, const uint16_t stride_y,
+                                          const uint16_t dim_im_out_x, const uint16_t dim_im_out_y,
+                                          q7_t * bufferA,
+                                          q7_t * Im_out);
+
+    void arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in,
+                                   const uint16_t dim_im_in, const uint16_t ch_im_in,
+                                   const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride,
+                                   const uint16_t dim_im_out,
+                                   q7_t * bufferA, q7_t * Im_out);
+
+    void arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in,
+                                   const uint16_t dim_im_in, const uint16_t ch_im_in,
+                                   const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride,
+                                   const uint16_t dim_im_out,
+                                   q7_t * bufferA, q7_t * Im_out);
+
/*
*
* Other reference implemenation
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 5cf72a2a05..efc1fbf571 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -38,9 +38,11 @@
#include "arm_nnexamples_nn_test.h"
-//#define TEST_SIGMOID
-//#define TEST_TANH
+#define TEST_SIGMOID
+#define TEST_TANH
#define TEST_POOL
+#define TEST_POOL_NS
+#define TEST_POOL_1D
#define TEST_RELU
#define TEST_IP
#define TEST_CONV
@@ -256,29 +258,205 @@ int main()
img_in[i] = test1[i];
}
+ printf("Start avepool ref implementation\n");
+
+ arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
+
// copy over the img input
for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
{
img_in[i] = test1[i];
}
+ printf("Start avepool opt implementation\n");
+
+ arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
+
+ // special check here
+ bool if_ave_pool_match = true;
+ for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++)
+ {
+ // we tolerate at most difference of 1 here because of rounding errors
+ if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+ {
+ printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+ if_ave_pool_match = false;
+ }
+ }
+ if (if_ave_pool_match == true)
+ {
+ printf("Outputs match.\n");
+ }
+
+ delete[]test1;
+ delete[]test2;
+ delete[]test3;
+
+#endif//TEST_POOL
+
+#ifdef TEST_POOL_NS
+    {   /* own scope: the other pooling test sections declare locals with the same names */
+#define POOL_NS_IM_DIM1 64
+#define POOL_NS_IM_DIM2 16
+#define POOL_NS_IM_CH 4
+
+    test1 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH * 2];
+    test2 = new q15_t[(POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH];   // documented avepool bufferA size: dim_im_out_x*dim_im_out_y*ch_im_in
+    test3 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH];
+
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        test1[i] = (rand() % 256 - 128);
+    }
+
+    q7_t *img_in = test1 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH;
+    q7_t *pool_out_ref = test3;
+    q7_t *pool_out_opt = test3 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH / 2;
+
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        test3[i] = 0;
+    }
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
+
+    // restore the whole input image (the optimized pooling functions overwrite their input)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt);
+
+    verify_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
+
+    // restore the whole input image
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1/2) * (POOL_NS_IM_DIM2/2) * POOL_NS_IM_CH);
+
+    printf("Start avepool ref implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
+
+    // restore the whole input image
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start avepool opt implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt);
+
+    // special check here
+    bool if_ave_pool_match = true;
+    for (int i = 0; i < (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH; i++)
+    {
+        // we tolerate at most difference of 1 here because of rounding errors
+        if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+        {
+            printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+            if_ave_pool_match = false;
+        }
+    }
+    if (if_ave_pool_match == true)
+    {
+        printf("Outputs match.\n");
+    }
+
+    delete[]test1;
+    delete[]test2;
+    delete[]test3;
+    }   /* end of TEST_POOL_NS scope */
+#endif//TEST_POOL_NS
+
+
+#ifdef TEST_POOL_1D
+
+#define POOL_IM_DIM_1D 32
+#define POOL_IM_CH_1D 8
+
+ test1 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D * 2];
+ test2 = new q15_t[POOL_IM_CH_1D];
+ test3 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D];
+
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+ {
+ test1[i] = (rand() % 256 - 128);
+ }
+
+ q7_t *img_in = test1 + POOL_IM_DIM_1D * POOL_IM_CH_1D;
+ q7_t *pool_out_ref = test3;
+ q7_t *pool_out_opt = test3 + POOL_IM_DIM_1D * POOL_IM_CH_1D / 2;
+
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+ {
+ test3[i] = 0;
+ }
+
+ // copy over the img input
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+ {
+ img_in[i] = test1[i];
+ }
+
+ initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+ printf("Start maxpool reference implementation\n");
+
+ arm_maxpool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
+
+ // copy over the img input
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+ {
+ img_in[i] = test1[i];
+ }
+
+ printf("Start maxpool opt implementation\n");
+
+ arm_maxpool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
+
+ verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+ // copy over the img input
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+ {
+ img_in[i] = test1[i];
+ }
+
printf("Start avepool ref implementation\n");
- arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
+ arm_avepool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
// copy over the img input
- for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
+ for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
{
img_in[i] = test1[i];
}
printf("Start avepool opt implementation\n");
- arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
+ arm_avepool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
// special check here
bool if_ave_pool_match = true;
- for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++)
+ for (int i = 0; i < POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D; i++)
{
// we tolerate at most difference of 1 here because of rounding errors
if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
@@ -296,7 +474,7 @@ int main()
delete[]test2;
delete[]test3;
-#endif
+#endif//TEST_POOL_1D
#ifdef TEST_RELU
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index b451f5e695..83bff56939 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -286,6 +286,179 @@ arm_maxpool_q7_HWC(q7_t * Im_in,
}
+  /**
+   * @brief Q7 max pooling function (non-square shape)
+   * @param[in, out]  Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension along X axis
+   * @param[in]       dim_im_in_y  input tensor dimension along Y axis
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       dim_kernel_x pooling window size along X axis
+   * @param[in]       dim_kernel_y pooling window size along Y axis
+   * @param[in]       padding_x    padding size along X axis
+   * @param[in]       padding_y    padding size along Y axis
+   * @param[in]       stride_x     pooling stride along X axis
+   * @param[in]       stride_y     pooling stride along Y axis
+   * @param[in]       dim_im_out_x output tensor dimension along X axis
+   * @param[in]       dim_im_out_y output tensor dimension along Y axis
+   * @param[in,out]   bufferA      not referenced by this function
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+
+    /* first does the pooling along x axis, in place: row i_y keeps its x-pooled pixels at its start */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;   /* clamp to the end of this row */
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            /* memmove because target and win_start may overlap within the row */
+            memmove(target, win_start, ch_im_in);
+
+            /* start the max operation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* one x-pooled row holds dim_im_out_x pixels of ch_im_in channels each */
+        memmove(target, row_start, dim_im_out_x * ch_im_in);
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)   /* input rows are dim_im_in_x pixels apart */
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out_x * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;   /* starting value of the running maximum */
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)   /* skip the padding area */
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
/**
* @brief Q7 average pooling function
* @param[in,out] Im_in pointer to input tensor
@@ -443,6 +616,390 @@ arm_avepool_q7_HWC(q7_t * Im_in,
}
+  /**
+   * @brief Q7 average pooling function (non-square input, kernel, padding and stride)
+   * @param[in,out]   Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding size along X axis
+   * @param[in]       padding_y     padding size along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to scratch buffer (used by the DSP path only)
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * Buffer size:
+   *
+   * bufferA size:  2*dim_im_out_x*ch_im_in bytes. The DSP path reinterprets
+   * bufferA as an array of dim_im_out_x*ch_im_in q15_t accumulators, so it
+   * must be suitably aligned for q15_t access.
+   *
+   * Tensors are indexed as channel + ch_im_in * (x + y * dim_x).
+   *
+   * The DSP implementation is split into x-pooling followed by y-pooling.
+   * The x-pooling pass stores its results back into Im_in, so that path is
+   * input-destructive: input data is undefined after calling this function.
+   * The reference implementation only reads Im_in.
+   *
+   */
+
+void
+arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    /* the scratch space is used as q15_t accumulators */
+    q15_t    *buffer = (q15_t *) bufferA;
+    int16_t   i_x, i_y;
+    int16_t   count = 0;
+
+    /* first does the pooling along x axis (in place, row by row) */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                /* window would begin left of the row: start at the output position */
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                /* window runs past the row: stop at the end of the row */
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* start the accumulation from the second pixel of the window */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row (only the dim_im_out_x x-pooled pixels) */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in);
+        count = 1;
+
+        /* move over to next row; rows keep the input pitch dim_im_in_x * ch_im_in */
+        row_start += dim_im_in_x * ch_im_in;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out_x * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    /* the scratch buffer is not needed by the reference implementation */
+    (void) bufferA;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       sum = 0;
+                int       count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        /* padding positions are skipped and do not count towards the average */
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                /* a window lying entirely in the padding has no samples: emit 0 instead of dividing by zero */
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (count > 0) ? (sum / count) : 0;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to scratch buffer (used by the DSP path only)
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * Buffer size:
+   *
+   * bufferA size:  2*ch_im_in bytes. The DSP path reinterprets bufferA as
+   * ch_im_in q15_t accumulators, so it must be suitably aligned for q15_t
+   * access.
+   *
+   * The pooling function is implemented along one axis. Tensors are indexed
+   * as channel + ch_im_in * position.
+   *
+   * The input tensor is only read; results are written to Im_out.
+   *
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in,          // input image
+                           const uint16_t dim_im_in,    // input image dimension
+                           const uint16_t ch_im_in,     // number of input image channels
+                           const uint16_t dim_kernel,   // window kernel size
+                           const uint16_t padding,      // padding sizes
+                           const uint16_t stride,       // stride
+                           const uint16_t dim_im_out,   // output image dimension
+                           q7_t * bufferA,              // a buffer for local storage
+                           q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    /* the scratch space is used as q15_t accumulators */
+    q15_t    *buffer = (q15_t *) bufferA;
+    /* The input is only read here. The explicit cast keeps the helper calls
+       free of qualifier diagnostics in case their prototypes take non-const
+       pointers -- TODO confirm against the helper declarations. */
+    q7_t     *in = (q7_t *) Im_in;
+    int16_t   i;
+    int16_t   count;
+
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output pixel */
+        q7_t     *target = Im_out + i * ch_im_in;
+        q7_t     *start;
+        q7_t     *end;
+        /* setting the first pixel of the window, clamped to the input start */
+        if (i * stride - padding < 0)
+        {
+            start = in;
+        } else
+        {
+            start = in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping pixel, clamped to the input end */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* copy over the first pixel */
+        arm_q7_to_q15_no_shift(start, buffer, ch_im_in);
+        count = 1;
+
+        /* move over to next pixel */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, start, ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+    }
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    /* the scratch buffer is not needed by the reference implementation */
+    (void) bufferA;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       sum = 0;
+            int       count = 0;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                /* padding positions are skipped and do not count towards the average */
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            /* a window lying entirely in the padding has no samples: emit 0 instead of dividing by zero */
+            Im_out[i_ch_in + ch_im_in * i] = (count > 0) ? (sum / count) : 0;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space (not used by this function)
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * Buffer size:
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented along one axis. Tensors are indexed
+   * as channel + ch_im_in * position.
+   *
+   * The input tensor is only read; results are written to Im_out.
+   *
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in,          // input image
+                           const uint16_t dim_im_in,    // input image dimension
+                           const uint16_t ch_im_in,     // number of input image channels
+                           const uint16_t dim_kernel,   // window kernel size
+                           const uint16_t padding,      // padding sizes
+                           const uint16_t stride,       // stride
+                           const uint16_t dim_im_out,   // output image dimension
+                           q7_t * bufferA,              // a buffer for local storage
+                           q7_t * Im_out) {
+    /* no scratch space is needed by either implementation */
+    (void) bufferA;
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    /* The input is only read here. The explicit cast keeps the helper call
+       free of qualifier diagnostics in case its prototype takes non-const
+       pointers -- TODO confirm against the helper declaration. */
+    q7_t     *in = (q7_t *) Im_in;
+    int16_t   i;
+
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output pixel */
+        q7_t     *target = Im_out + i * ch_im_in;
+        q7_t     *start;
+        q7_t     *end;
+        /* setting the first pixel of the window, clamped to the input start */
+        if (i * stride - padding < 0)
+        {
+            start = in;
+        } else
+        {
+            start = in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping pixel, clamped to the input end */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* seed the output pixel with the first pixel of the window */
+        memmove(target, start, ch_im_in);
+
+        /* move over to next pixel: one pixel is ch_im_in values wide */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, start, ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            /* sentinel below the smallest q7 value so any input sample replaces it */
+            int       max = -129;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                /* padding positions are skipped */
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+
/**
* @} end of Pooling group
*/