diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index c6ec83a453..05c348f117 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,65 @@ extern "C"
                                           q15_t * bufferA,
                                           q7_t * bufferB);
 
+  /**
+   * @brief Basic Q7 convolution function (1D); forwards to arm_convolve_HWC_q7_basic_nonsquare with a y extent of 1
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension (passed on as the x dimension)
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size (passed on as the x kernel size)
+   * @param[in]       padding     padding size (passed on as the x padding)
+   * @param[in]       stride      convolution stride (passed on as the x stride)
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension (passed on as the x dimension)
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The status returned by arm_convolve_HWC_q7_basic_nonsquare
+   */
+
+static inline arm_status arm_convolve_HWC_q7_basic_1d(
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q7_basic_nonsquare(
+        Im_in,          //const q7_t * Im_in
+        dim_im_in,      //const uint16_t dim_im_in_x,
+        1,              //const uint16_t dim_im_in_y,
+        ch_im_in,       //const uint16_t ch_im_in,
+        wt,             //const q7_t * wt,
+        ch_im_out,      //const uint16_t ch_im_out,
+        dim_kernel,     //const uint16_t dim_kernel_x,
+        1,              //const uint16_t dim_kernel_y,
+        padding,        //const uint16_t padding_x,
+        0,              //const uint16_t padding_y, must be 0: with a y extent of 1 any padding moves the only kernel row outside the input
+        stride,         //const uint16_t stride_x,
+        1,              //const uint16_t stride_y,
+        bias,           //const q7_t * bias,
+        bias_shift,     //const uint16_t bias_shift,
+        out_shift,      //const uint16_t out_shift,
+        Im_out,         //q7_t * Im_out,
+        dim_im_out,     //const uint16_t dim_im_out_x,
+        1,              //const uint16_t dim_im_out_y,
+        bufferA,        //q15_t * bufferA,
+        bufferB);       //q7_t * bufferB);
+}
+
   /**
    * @brief Basic Q15 convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -335,6 +394,71 @@ extern "C"
                                           q15_t * bufferA,
                                           q7_t * bufferB);
 
+  /**
+   * @brief Fast Q7 convolution function (1D)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      convolution stride
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+   *
+   * This function is the version with full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is multiple of 4
+   *   ch_im_out is multiple of 2
+   */
+
+static inline arm_status arm_convolve_HWC_q7_fast_1d(
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q7_fast_nonsquare(
+        Im_in,          //const q7_t * Im_in
+        dim_im_in,      //const uint16_t dim_im_in_x,
+        1,              //const uint16_t dim_im_in_y,
+        ch_im_in,       //const uint16_t ch_im_in,
+        wt,             //const q7_t * wt,
+        ch_im_out,      //const uint16_t ch_im_out,
+        dim_kernel,     //const uint16_t dim_kernel_x,
+        1,              //const uint16_t dim_kernel_y,
+        padding,        //const uint16_t padding_x,
+        0,              //const uint16_t padding_y, must be 0: with a y extent of 1 any padding moves the only kernel row outside the input
+        stride,         //const uint16_t stride_x,
+        1,              //const uint16_t stride_y,
+        bias,           //const q7_t * bias,
+        bias_shift,     //const uint16_t bias_shift,
+        out_shift,      //const uint16_t out_shift,
+        Im_out,         //q7_t * Im_out,
+        dim_im_out,     //const uint16_t dim_im_out_x,
+        1,              //const uint16_t dim_im_out_y,
+        bufferA,        //q15_t * bufferA,
+        bufferB);       //q7_t * bufferB);
+}
+
   /**
    * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
    * @param[in]       Im_in        pointer to input tensor
@@ -502,7 +626,7 @@ extern "C"
    *
    * <b>Buffer size:</b>
    *
-   * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+   * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
    *
    * bufferB size: 0
    *
@@ -535,7 +659,83 @@ extern "C"
                                                   const uint16_t dim_im_out_y,
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
-
+
+
+  /**
+   * @brief Fast Q15 convolution function (1D)
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       wt          pointer to kernel weights
+   * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel  filter kernel size (passed on as the x kernel size)
+   * @param[in]       padding     padding size (passed on as the x padding)
+   * @param[in]       stride      convolution stride (passed on as the x stride)
+   * @param[in]       bias        pointer to bias
+   * @param[in]       bias_shift  amount of left-shift for bias
+   * @param[in]       out_shift   amount of right-shift for output
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @param[in]       dim_im_out  output tensor dimension (passed on as the x dimension)
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   bufferB     pointer to buffer space for output
+   * @return     The function returns either
+   * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 2*ch_im_in*dim_kernel (the y kernel size is fixed to 1)
+   *
+   * bufferB size: 0
+   *
+   * <b>Input dimension constraints:</b>
+   *
+   * ch_im_in is multiple of 2
+   *
+   * ch_im_out is multiple of 2
+   *
+   */
+static inline arm_status arm_convolve_HWC_q15_fast_1d(
+    const q15_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q15_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q15_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q15_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q15_fast_nonsquare(
+        Im_in,          //const q15_t * Im_in
+        dim_im_in,      //const uint16_t dim_im_in_x,
+        1,              //const uint16_t dim_im_in_y,
+        ch_im_in,       //const uint16_t ch_im_in,
+        wt,             //const q15_t * wt,
+        ch_im_out,      //const uint16_t ch_im_out,
+        dim_kernel,     //const uint16_t dim_kernel_x,
+        1,              //const uint16_t dim_kernel_y,
+        padding,        //const uint16_t padding_x,
+        0,              //const uint16_t padding_y, must be 0: with a y extent of 1 any padding moves the only kernel row outside the input
+        stride,         //const uint16_t stride_x,
+        1,              //const uint16_t stride_y,
+        bias,           //const q15_t * bias,
+        bias_shift,     //const uint16_t bias_shift,
+        out_shift,      //const uint16_t out_shift,
+        Im_out,         //q15_t * Im_out,
+        dim_im_out,     //const uint16_t dim_im_out_x,
+        1,              //const uint16_t dim_im_out_y,
+        bufferA,        //q15_t * bufferA,
+        bufferB);       //q7_t * bufferB);
+}
+
+
+
   /**
    * @brief Q7 depthwise separable convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -629,7 +829,6 @@ extern "C"
                                                        q15_t * bufferA,
                                                        q7_t * bufferB);
 
-
 /**
  * @defgroup FC Fully-connected Layer Functions
  *
@@ -949,6 +1148,64 @@ extern "C"
                             q7_t * bufferA,
                             q7_t * Im_out);
 
+  /**
+   * @brief Q7 max pooling function (non-square shape)
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                                  const uint16_t dim_im_in_x,
+                                  const uint16_t dim_im_in_y,
+                                  const uint16_t ch_im_in,
+                                  const uint16_t dim_kernel_x,
+                                  const uint16_t dim_kernel_y,
+                                  const uint16_t padding_x,
+                                  const uint16_t padding_y,
+                                  const uint16_t stride_x,
+                                  const uint16_t stride_y,
+                                  const uint16_t dim_im_out_x,
+                                  const uint16_t dim_im_out_y,
+                                  q7_t * bufferA,
+                                  q7_t * Im_out);
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in, out]  Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in,          // input image
+                           const uint16_t dim_im_in,    // input image dimension
+                           const uint16_t ch_im_in,     // number of input image channels
+                           const uint16_t dim_kernel,   // window kernel size
+                           const uint16_t padding,      // padding sizes
+                           const uint16_t stride,       // stride
+                           const uint16_t dim_im_out,   // output image dimension
+                           q7_t * bufferA,              // a buffer for local storage
+                           q7_t * Im_out);
+
   /**
    * @brief Q7 average pooling function
    * @param[in]       Im_in       pointer to input tensor
@@ -974,6 +1231,65 @@ extern "C"
                             q7_t * bufferA,
                             q7_t * Im_out);
 
+  /**
+   * @brief Q7 average pooling function (non-square shape)
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                                  const uint16_t dim_im_in_x,
+                                  const uint16_t dim_im_in_y,
+                                  const uint16_t ch_im_in,
+                                  const uint16_t dim_kernel_x,
+                                  const uint16_t dim_kernel_y,
+                                  const uint16_t padding_x,
+                                  const uint16_t padding_y,
+                                  const uint16_t stride_x,
+                                  const uint16_t stride_y,
+                                  const uint16_t dim_im_out_x,
+                                  const uint16_t dim_im_out_y,
+                                  q7_t * bufferA,
+                                  q7_t * Im_out);
+
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in, out]  Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in,          // input image
+                           const uint16_t dim_im_in,    // input image dimension
+                           const uint16_t ch_im_in,     // number of input image channels
+                           const uint16_t dim_kernel,   // window kernel size
+                           const uint16_t padding,      // padding sizes
+                           const uint16_t stride,       // stride
+                           const uint16_t dim_im_out,   // output image dimension
+                           q7_t * bufferA,              // a buffer for local storage
+                           q7_t * Im_out);
+
+
 /**
  * @defgroup Softmax Softmax Functions
  *
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9a4adb29ab..2292e181ff 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -94,3 +94,167 @@ void arm_maxpool_q7_HWC_ref(const q7_t * Im_in, // input image
         }
     }
 }
+
+/* Reference average pooling over an HWC image with independent x/y sizes; padded positions are excluded from sum and count. */
+void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       sum = 0;
+                int       count = 0;
+                int16_t   y_start = i_y * stride_y - padding_y;
+                int16_t   x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
+                {
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum/count;
+            }
+        }
+    }
+}
+
+
+/* Reference max pooling over an HWC image with independent x/y sizes; padded positions are ignored. */
+void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;
+                int16_t   y_start = i_y * stride_y - padding_y;
+                int16_t   x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
+                {
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+}
+
+
+/* Reference 1-D average pooling over a (position, channel) tensor; padded positions are excluded from sum and count. */
+void
+arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in,          // input image
+                          const uint16_t dim_im_in,    // input image dimension
+                          const uint16_t ch_im_in,     // number of input image channels
+                          const uint16_t dim_kernel,   // window kernel size
+                          const uint16_t padding,      // padding sizes
+                          const uint16_t stride,       // stride
+                          const uint16_t dim_im_out,   // output image dimension
+                          q7_t * bufferA,              // a buffer for local storage
+                          q7_t * Im_out) {
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       sum = 0;
+            int       count = 0;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = sum/count;
+        }
+    }
+}
+
+/* Reference 1-D max pooling over a (position, channel) tensor; padded positions are ignored. */
+void
+arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in,          // input image
+                          const uint16_t dim_im_in,    // input image dimension
+                          const uint16_t ch_im_in,     // number of input image channels
+                          const uint16_t dim_kernel,   // window kernel size
+                          const uint16_t padding,      // padding sizes
+                          const uint16_t stride,       // stride
+                          const uint16_t dim_im_out,   // output image dimension
+                          q7_t * bufferA,              // a buffer for local storage
+                          q7_t * Im_out) {
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index 4a0647a9bd..c1cee95846 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -229,6 +229,59 @@ extern "C"
                                 q7_t * bufferA, // a buffer for local storage
                                 q7_t * Im_out);
 
+    void arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in,          // input image
+                                   const uint16_t dim_im_in,    // input image dimension
+                                   const uint16_t ch_im_in,     // number of input image channels
+                                   const uint16_t dim_kernel,   // window kernel size
+                                   const uint16_t padding,      // padding sizes
+                                   const uint16_t stride,       // stride
+                                   const uint16_t dim_im_out,   // output image dimension
+                                   q7_t * bufferA,              // a buffer for local storage
+                                   q7_t * Im_out);
+
+    void arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in,          // input image
+                                   const uint16_t dim_im_in,    // input image dimension
+                                   const uint16_t ch_im_in,     // number of input image channels
+                                   const uint16_t dim_kernel,   // window kernel size
+                                   const uint16_t padding,      // padding sizes
+                                   const uint16_t stride,       // stride
+                                   const uint16_t dim_im_out,   // output image dimension
+                                   q7_t * bufferA,              // a buffer for local storage
+                                   q7_t * Im_out);
+
+    void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out);
+
+
+    void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+
const uint16_t stride_y, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q7_t * bufferA, + q7_t * Im_out); + /* * * Other reference implemenation diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp index 5cf72a2a05..efc1fbf571 100644 --- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp +++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp @@ -38,9 +38,11 @@ #include "arm_nnexamples_nn_test.h" -//#define TEST_SIGMOID -//#define TEST_TANH +#define TEST_SIGMOID +#define TEST_TANH #define TEST_POOL +#define TEST_POOL_NS +#define TEST_POOL_1D #define TEST_RELU #define TEST_IP #define TEST_CONV @@ -256,29 +258,205 @@ int main() img_in[i] = test1[i]; } + printf("Start avepool ref implementation\n"); + + arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref); + // copy over the img input for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++) { img_in[i] = test1[i]; } + printf("Start avepool opt implementation\n"); + + arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt); + + // special check here + bool if_ave_pool_match = true; + for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++) + { + // we tolerate at most difference of 1 here because of rounding errors + if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2) + { + printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]); + if_ave_pool_match = false; + } + } + if (if_ave_pool_match == true) + { + printf("Outputs match.\n"); + } + + delete[]test1; + delete[]test2; + delete[]test3; + +#endif//TEST_POOL + +#ifdef TEST_POOL_NS + +#define POOL_NS_IM_DIM1 64 +#define POOL_NS_IM_DIM2 16 +#define POOL_NS_IM_CH 4 + + test1 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH * 2]; + test2 = new 
q15_t[(POOL_NS_IM_DIM1 + POOL_NS_IM_DIM2) * POOL_NS_IM_CH]; + test3 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH]; + + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) + { + test1[i] = (rand() % 256 - 128); + } + + q7_t *img_in = test1 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; + q7_t *pool_out_ref = test3; + q7_t *pool_out_opt = test3 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH / 2; + + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) + { + test3[i] = 0; + } + + // copy over the img input + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) + { + img_in[i] = test1[i]; + } + + initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH); + + printf("Start maxpool reference implementation\n"); + + arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref) + + // copy over the img input + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) + { + img_in[i] = test1[i]; + } + + printf("Start maxpool opt implementation\n"); + + arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, 1, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt) + + verify_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH); + + // copy over the img input + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) + { + img_in[i] = test1[i]; + } + + initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1/2) * (POOL_NS_IM_DIM2/2) * POOL_NS_IM_CH); + + printf("Start maxpool reference implementation\n"); + + arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref) + + // 
copy over the img input + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) + { + img_in[i] = test1[i]; + } + + printf("Start maxpool opt implementation\n"); + + arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt) + + // special check here + bool if_ave_pool_match = true; + for (int i = 0; i < (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH; i++) + { + // we tolerate at most difference of 1 here because of rounding errors + if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2) + { + printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]); + if_ave_pool_match = false; + } + } + if (if_ave_pool_match == true) + { + printf("Outputs match.\n"); + } + + delete[]test1; + delete[]test2; + delete[]test3; + +#endif//TEST_POOL_NS + + +#ifdef TEST_POOL_1D + +#define POOL_IM_DIM_1D 32 +#define POOL_IM_CH_1D 8 + + test1 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D * 2]; + test2 = new q15_t[POOL_IM_CH_1D]; + test3 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D]; + + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + test1[i] = (rand() % 256 - 128); + } + + q7_t *img_in = test1 + POOL_IM_DIM_1D * POOL_IM_CH_1D; + q7_t *pool_out_ref = test3; + q7_t *pool_out_opt = test3 + POOL_IM_DIM_1D * POOL_IM_CH_1D / 2; + + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + test3[i] = 0; + } + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D); + + printf("Start maxpool reference implementation\n"); + + arm_maxpool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref); + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * 
POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + printf("Start maxpool opt implementation\n"); + + arm_maxpool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt); + + verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D); + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + printf("Start avepool ref implementation\n"); - arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref); + arm_avepool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref); // copy over the img input - for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++) + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) { img_in[i] = test1[i]; } printf("Start avepool opt implementation\n"); - arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt); + arm_avepool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt); // special check here bool if_ave_pool_match = true; - for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++) + for (int i = 0; i < POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D; i++) { // we tolerate at most difference of 1 here because of rounding errors if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2) @@ -296,7 +474,7 @@ int main() delete[]test2; delete[]test3; -#endif +#endif//TEST_POOL_1D #ifdef TEST_RELU diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c index b451f5e695..83bff56939 100644 --- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c +++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -286,6 +286,179 @@ arm_maxpool_q7_HWC(q7_t * Im_in, } + /** + * @brief Q7 max pooling 
function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+
+    /* first does the pooling along x axis; results are written in place, rows keep the input stride dim_im_in_x */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            /* arm_copy_q7(win_start, target, ch_im_in); */
+            memmove(target, win_start, ch_im_in);
+
+            /* start the max operation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row: one output row holds dim_im_out_x pixels */
+        /* arm_copy_q7(row_start, target, dim_im_out_x * ch_im_in); */
+        memmove(target, row_start, dim_im_out_x * ch_im_in);
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out_x * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
   /**
    * @brief Q7 average pooling function
    * @param[in,out]   Im_in       pointer to input tensor
@@ -443,6 +616,390 @@ arm_avepool_q7_HWC(q7_t * Im_in,
 
 }
 
+  /**
+   * @brief Q7 average pooling function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor
dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  2*dim_im_out_x*ch_im_in (one output row of q15 accumulators)
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+    int16_t   count = 0;
+    q15_t    *buffer = (q15_t *) bufferA;   /* bufferA is used as q15 accumulator scratch */
+    /* first does the pooling along x axis; results are written in place, rows keep the input stride dim_im_in_x */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* start the accumulation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in);
+        count = 1;
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out_x * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       sum = 0;
+                int       count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum/count;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in, out]  Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimention
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]
dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to scratch buffer (used by the ARM_MATH_DSP build only)
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA is accessed as q15_t and must hold ch_im_in q15_t
+   * elements, i.e. 2*ch_im_in bytes.
+   *
+   * Computes, for every output position and channel, the average of
+   * the input samples inside a window along a single axis. Tensors
+   * are indexed as channel + ch_im_in * position.
+   *
+   * Only samples inside the input contribute to an average; padded
+   * positions are skipped and do not count towards the divisor.
+   *
+   * Im_in is const-qualified; this function only reads from it.
+   *
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in,           // input image
+        const uint16_t dim_im_in,   // input image dimension
+        const uint16_t ch_im_in,    // number of input image channels
+        const uint16_t dim_kernel,  // window kernel size
+        const uint16_t padding,     // padding sizes
+        const uint16_t stride,      // stride
+        const uint16_t dim_im_out,  // output image dimension
+        q7_t * bufferA,             // a buffer for local storage
+        q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    /* bufferA is viewed as q15_t accumulators for the helper calls below */
+    /* NOTE(review): assumes bufferA is suitably aligned for q15_t access - confirm with callers */
+    q15_t    *buffer = (q15_t *) bufferA;
+    int16_t   i;
+    int16_t   count;
+
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output position */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+        /* setting the starting position, clamped to the first input sample */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping position, clamped to the end of the input */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* copy over the first position */
+        /* the casts below only adapt to helper prototypes that may take a non-const
+         * source pointer - TODO confirm the helpers do not write through it */
+        arm_q7_to_q15_no_shift((q7_t *) start, buffer, ch_im_in);
+        count = 1;
+
+        /* move over to next position (one position is ch_im_in elements) */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, (q7_t *) start, ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+    }
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    (void) bufferA;             /* scratch buffer is not needed by the reference path */
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       sum = 0;
+            int       count = 0;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                /* padded positions are skipped and do not count towards the divisor */
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            /* a window lying entirely in the padding has no samples: avoid dividing by zero */
+            Im_out[i_ch_in + ch_im_in * i] = (count > 0) ? (q7_t) (sum / count) : 0;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space (not referenced by this function)
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 0
+   *
+   * The pooling is performed along a single axis.
+   *
+   * Im_in is const-qualified, so the input tensor is expected to be
+   * left unmodified by this function.
+   *
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in,           // input image
+        const uint16_t dim_im_in,   // input image dimension
+        const uint16_t ch_im_in,    // number of input image channels
+        const uint16_t dim_kernel,  // window kernel size
+        const uint16_t padding,     // padding sizes
+        const uint16_t stride,      // stride
+        const uint16_t dim_im_out,  // output image dimension
+        q7_t * bufferA,             // a buffer for local storage
+        q7_t * Im_out) {
+    /*
+     * For every output position and channel, take the maximum of the input
+     * samples inside a window along a single axis. Tensors are indexed as
+     * channel + ch_im_in * position. Padded positions are skipped.
+     */
+#if defined (ARM_MATH_DSP)
+    int16_t   i;
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output position */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+        /* setting the starting position, clamped to the first input sample */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping position, clamped to the end of the input */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* seed the output with the first position of the window */
+        memmove(target, start, ch_im_in);
+
+        /* move over to next position: one position is ch_im_in elements */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            /* the cast only adapts to a helper prototype that may take a non-const
+             * pointer - TODO confirm the helper does not write through its second argument */
+            compare_and_replace_if_larger_q7(target, (q7_t *) start, ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            /* start below the smallest q7 value so that any in-range sample replaces it */
+            int       max = -129;
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+    (void) bufferA;             /* no scratch space is needed by either path */
+}
+
+
   /**
    * @} end of Pooling group
    */