From ac045f077325810fab4cd80460a7cf9b88fa28cf Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn
Date: Thu, 9 May 2019 12:53:21 -0700
Subject: [PATCH 1/7] created first pass at nonsquare pooling functions

---
 .../Source/PoolingFunctions/arm_pool_q7_HWC.c | 350 ++++++++++++++++++
 1 file changed, 350 insertions(+)

diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index b451f5e695..96afe033fc 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -286,6 +286,179 @@ arm_maxpool_q7_HWC(q7_t * Im_in,

 }

+  /**
+   * @brief Q7 max pooling function (independent X/Y sizes, padding and strides)
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input (not accessed by this function)
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * Buffer size:
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * With ARM_MATH_DSP the x-pooling pass overwrites Im_in, so input data is
+   * undefined after calling this function.
+   *
+   */
+
+void
+arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+
+    /* first does the pooling along x axis */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            /* memmove, not memcpy: win_start and target lie in the same buffer and can coincide */
+            memmove(target, win_start, ch_im_in);
+
+            /* start the max operation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* an x-pooled row holds dim_im_out_x pixels of ch_im_in channels each */
+        memmove(target, row_start, dim_im_out_x * ch_im_in);
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out_x * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
   /**
    * @brief Q7 average pooling function
    * @param[in,out]   Im_in       pointer to input tensor
@@ -443,6 +616,183 @@ arm_avepool_q7_HWC(q7_t * Im_in,

 }

+  /**
+   * @brief Q7 average pooling function (independent X/Y sizes, padding and strides)
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input (q15 accumulator with ARM_MATH_DSP)
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * Buffer size:
+   *
+   * bufferA size:  2*dim_im_out_x*ch_im_in
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * With ARM_MATH_DSP the x-pooling pass overwrites Im_in, so input data is
+   * undefined after calling this function.
+   *
+   */
+
+void
+arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                             const uint16_t dim_im_in_x,
+                             const uint16_t dim_im_in_y,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel_x,
+                             const uint16_t dim_kernel_y,
+                             const uint16_t padding_x,
+                             const uint16_t padding_y,
+                             const uint16_t stride_x,
+                             const uint16_t stride_y,
+                             const uint16_t dim_im_out_x,
+                             const uint16_t dim_im_out_y,
+                             q7_t * bufferA,
+                             q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+    int16_t   count = 0;
+    q15_t    *buffer = (q15_t *) bufferA;   /* sums are accumulated in q15 before scaling back to q7 */
+    /* first does the pooling along x axis */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* start the accumulation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* then does the pooling along y axis */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+           row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+           row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+         */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in);
+        count = 1;
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out_x * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       sum = 0;
+                int       count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (count > 0) ? sum / count : 0;    /* empty window: avoid dividing by zero */
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
   /**
    * @} end of Pooling group
    */
From 1d2a75e50c6e8a2cf7bf7f68785845a205deb606 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn
Date: Thu, 9 May 2019 13:57:38 -0700
Subject: [PATCH 2/7] added ref function and
proposed test

---
 CMSIS/NN/Include/arm_nnfunctions.h            | 197 +++++++++++++++++-
 .../Ref_Implementations/arm_pool_ref.c        |  92 ++++++++
 .../Ref_Implementations/ref_functions.h       |  33 +++
 .../nn_test/arm_nnexamples_nn_test.cpp        | 114 ++++++++--
 4 files changed, 423 insertions(+), 13 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index c6ec83a453..9aa35fe03f 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,45 @@ extern "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);

+static inline arm_status arm_convolve_HWC_q7_basic_1d( /* 1-D helper: forwards a dim_im_in x 1 image to the nonsquare kernel */
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q7_basic_nonsquare(
+        Im_in,      //const q7_t * Im_in
+        dim_im_in,  //const uint16_t dim_im_in_x,
+        1,          //const uint16_t dim_im_in_y,
+        ch_im_in,   //const uint16_t ch_im_in,
+        wt,         //const q7_t * wt,
+        ch_im_out,  //const uint16_t ch_im_out,
+        dim_kernel, //const uint16_t dim_kernel_x,
+        1,          //const uint16_t dim_kernel_y,
+        padding,    //const uint16_t padding_x,
+        0,          //const uint16_t padding_y, (0: the single input row must map onto the single output row)
+        stride,     //const uint16_t stride_x,
+        1,          //const uint16_t stride_y,
+        bias,       //const q7_t * bias,
+        bias_shift, //const uint16_t bias_shift,
+        out_shift,  //const uint16_t out_shift,
+        Im_out,     //q7_t * Im_out,
+        dim_im_out, //const uint16_t dim_im_out_x,
+        1,          //const uint16_t dim_im_out_y,
+        bufferA,    //q15_t * bufferA,
+        bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Basic Q15 convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -335,6 +374,45 @@ extern "C"
                                                  q15_t * bufferA,
                                                  q7_t * bufferB);

+static inline arm_status arm_convolve_HWC_q7_fast_1d( /* 1-D helper: forwards a dim_im_in x 1 image to the nonsquare kernel */
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q7_fast_nonsquare(
+        Im_in,      //const q7_t * Im_in
+        dim_im_in,  //const uint16_t dim_im_in_x,
+        1,          //const uint16_t dim_im_in_y,
+        ch_im_in,   //const uint16_t ch_im_in,
+        wt,         //const q7_t * wt,
+        ch_im_out,  //const uint16_t ch_im_out,
+        dim_kernel, //const uint16_t dim_kernel_x,
+        1,          //const uint16_t dim_kernel_y,
+        padding,    //const uint16_t padding_x,
+        0,          //const uint16_t padding_y, (0: the single input row must map onto the single output row)
+        stride,     //const uint16_t stride_x,
+        1,          //const uint16_t stride_y,
+        bias,       //const q7_t * bias,
+        bias_shift, //const uint16_t bias_shift,
+        out_shift,  //const uint16_t out_shift,
+        Im_out,     //q7_t * Im_out,
+        dim_im_out, //const uint16_t dim_im_out_x,
+        1,          //const uint16_t dim_im_out_y,
+        bufferA,    //q15_t * bufferA,
+        bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
    * @param[in]       Im_in        pointer to input tensor
@@ -390,6 +468,45 @@ extern "C"
                                                      q15_t * bufferA,
                                                      q7_t * bufferB);

+static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d( /* 1-D helper: forwards a dim_im_in x 1 image to the nonsquare 1x1 kernel */
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_1x1_HWC_q7_fast_nonsquare(
+        Im_in,      //const q7_t * Im_in
+        dim_im_in,  //const uint16_t dim_im_in_x,
+        1,          //const uint16_t dim_im_in_y,
+        ch_im_in,   //const uint16_t ch_im_in,
+        wt,         //const q7_t * wt,
+        ch_im_out,  //const uint16_t ch_im_out,
+        dim_kernel, //const uint16_t dim_kernel_x,
+        1,          //const uint16_t dim_kernel_y,
+        padding,    //const uint16_t padding_x,
+        0,          //const uint16_t padding_y, (0: the single input row must map onto the single output row)
+        stride,     //const uint16_t stride_x,
+        1,          //const uint16_t stride_y,
+        bias,       //const q7_t * bias,
+        bias_shift, //const uint16_t bias_shift,
+        out_shift,  //const uint16_t out_shift,
+        Im_out,     //q7_t * Im_out,
+        dim_im_out, //const uint16_t dim_im_out_x,
+        1,          //const uint16_t dim_im_out_y,
+        bufferA,    //q15_t * bufferA,
+        bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Q7 version of convolution for RGB image
    * @param[in]       Im_in       pointer to input tensor
@@ -535,7 +652,47 @@ extern "C"
                                               const uint16_t dim_im_out_y,
                                               q15_t * bufferA,
                                               q7_t * bufferB);
-
+static inline arm_status arm_convolve_HWC_q15_fast_1d( /* 1-D helper: forwards a dim_im_in x 1 image to the nonsquare q15 kernel */
+    const q15_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q15_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q15_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q15_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_convolve_HWC_q15_fast_nonsquare(
+        Im_in,      //const q15_t * Im_in
+        dim_im_in,  //const uint16_t dim_im_in_x,
+        1,          //const uint16_t dim_im_in_y,
+        ch_im_in,   //const uint16_t ch_im_in,
+        wt,         //const q15_t * wt,
+        ch_im_out,  //const uint16_t ch_im_out,
+        dim_kernel, //const uint16_t dim_kernel_x,
+        1,          //const uint16_t dim_kernel_y,
+        padding,    //const uint16_t padding_x,
+        0,          //const uint16_t padding_y, (0: the single input row must map onto the single output row)
+        stride,     //const uint16_t stride_x,
+        1,          //const uint16_t stride_y,
+        bias,       //const q15_t * bias,
+        bias_shift, //const uint16_t bias_shift,
+        out_shift,  //const uint16_t out_shift,
+        Im_out,     //q15_t * Im_out,
+        dim_im_out, //const uint16_t dim_im_out_x,
+        1,          //const uint16_t dim_im_out_y,
+        bufferA,    //q15_t * bufferA,
+        bufferB);   //q7_t * bufferB);
+}
+
+
   /**
    * @brief Q7 depthwise separable convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -629,7 +786,45 @@
extern "C"
                                                         q15_t * bufferA,
                                                         q7_t * bufferB);

+static inline arm_status arm_depthwise_separable_conv_HWC_q7_1d( /* 1-D helper: forwards a dim_im_in x 1 image to the nonsquare kernel */
+    const q7_t * Im_in,
+    const uint16_t dim_im_in,
+    const uint16_t ch_im_in,
+    const q7_t * wt,
+    const uint16_t ch_im_out,
+    const uint16_t dim_kernel,
+    const uint16_t padding,
+    const uint16_t stride,
+    const q7_t * bias,
+    const uint16_t bias_shift,
+    const uint16_t out_shift,
+    q7_t * Im_out,
+    const uint16_t dim_im_out,
+    q15_t * bufferA,
+    q7_t * bufferB ) {
+    return arm_depthwise_separable_conv_HWC_q7_nonsquare(
+        Im_in,      //const q7_t * Im_in
+        dim_im_in,  //const uint16_t dim_im_in_x,
+        1,          //const uint16_t dim_im_in_y,
+        ch_im_in,   //const uint16_t ch_im_in,
+        wt,         //const q7_t * wt,
+        ch_im_out,  //const uint16_t ch_im_out,
+        dim_kernel, //const uint16_t dim_kernel_x,
+        1,          //const uint16_t dim_kernel_y,
+        padding,    //const uint16_t padding_x,
+        0,          //const uint16_t padding_y, (0: the single input row must map onto the single output row)
+        stride,     //const uint16_t stride_x,
+        1,          //const uint16_t stride_y,
+        bias,       //const q7_t * bias,
+        bias_shift, //const uint16_t bias_shift,
+        out_shift,  //const uint16_t out_shift,
+        Im_out,     //q7_t * Im_out,
+        dim_im_out, //const uint16_t dim_im_out_x,
+        1,          //const uint16_t dim_im_out_y,
+        bufferA,    //q15_t * bufferA,
+        bufferB);   //q7_t * bufferB);
+}
 /**
  * @defgroup FC Fully-connected Layer Functions
  *
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9a4adb29ab..9c7f2d8155 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -94,3 +94,95 @@ void arm_maxpool_q7_HWC_ref(const q7_t * Im_in, // input image
             }
         }
     }
+
+void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    /* Reference average pooling: per channel and output pixel, average the in-bounds window samples (bufferA is not accessed) */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       sum = 0;
+                int       count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (count > 0) ? sum / count : 0;    /* empty window: avoid dividing by zero */
+            }
+        }
+    }
+}
+
+
+void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out) {
+    /* Reference max pooling: per channel and output pixel, take the largest in-bounds window sample (bufferA is not accessed) */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index 4a0647a9bd..c1cee95846 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -229,6 +229,39 @@ extern "C"
                                 q7_t * bufferA, // a buffer for local storage
                                 q7_t * Im_out);

+    void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out);
+
+
+    void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                                 const uint16_t dim_im_in_x,
+                                 const uint16_t dim_im_in_y,
+                                 const uint16_t ch_im_in,
+                                 const uint16_t dim_kernel_x,
+                                 const uint16_t dim_kernel_y,
+                                 const uint16_t padding_x,
+                                 const uint16_t padding_y,
+                                 const uint16_t stride_x,
+                                 const uint16_t stride_y,
+                                 const uint16_t dim_im_out_x,
+                                 const uint16_t dim_im_out_y,
+                                 q7_t * bufferA,
+                                 q7_t * Im_out);
+
 /*
  *
  * Other reference implemenation
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 5cf72a2a05..be6769d62d 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -41,11 +41,12 @@
 //#define TEST_SIGMOID
 //#define TEST_TANH
 #define TEST_POOL
-#define TEST_RELU
-#define TEST_IP
-#define TEST_CONV
-#define TEST_NONSQUARE
-#define TEST_NNMULT
+#define TEST_POOL_NS
+// #define TEST_RELU
+// #define TEST_IP
+// #define TEST_CONV
+// #define
TEST_NONSQUARE
+// #define TEST_NNMULT

 int       test_index = 0;
 q7_t      test_flags[50];
@@ -256,12 +257,6 @@ int main()
         img_in[i] = test1[i];
     }

-    // copy over the img input
-    for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
-    {
-        img_in[i] = test1[i];
-    }
-
     printf("Start avepool ref implementation\n");

     arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
@@ -296,7 +291,102 @@ int main()
     delete[]test2;
     delete[]test3;

-#endif
+#endif//TEST_POOL
+
+#ifdef TEST_POOL_NS
+
+#define POOL_NS_IM_DIM 64
+#define POOL_NS_IM_CH 4
+
+    test1 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH * 2];
+    test2 = new q15_t[POOL_NS_IM_DIM * POOL_NS_IM_CH];
+    test3 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH];
+
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        test1[i] = (rand() % 256 - 128);
+    }
+    // NOTE(review): locals carry a pool_ns_ prefix because TEST_POOL (also enabled) appears to declare img_in / pool_out_* / if_ave_pool_match in this same scope - confirm
+    q7_t     *pool_ns_img_in = test1 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH;
+    q7_t     *pool_ns_out_ref = test3;
+    q7_t     *pool_ns_out_opt = test3 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH / 2;
+
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        test3[i] = 0;
+    }
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        pool_ns_img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_ns_out_ref, pool_ns_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare_ref(pool_ns_img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 1, 0, 0, 2, 1, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_ns_out_ref);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        pool_ns_img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare(pool_ns_img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 1, 0, 0, 2, 1, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_ns_out_opt);
+
+    verify_results_q7(pool_ns_out_ref, pool_ns_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        pool_ns_img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_ns_out_ref, pool_ns_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    printf("Start avepool reference implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare_ref(pool_ns_img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 1, 0, 0, 2, 1, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_ns_out_ref);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        pool_ns_img_in[i] = test1[i];
+    }
+
+    printf("Start avepool opt implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare(pool_ns_img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 1, 0, 0, 2, 1, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_ns_out_opt);
+
+    // special check here
+    bool      if_ave_pool_ns_match = true;
+    for (int i = 0; i < POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH; i++)
+    {
+        // we tolerate at most difference of 1 here because of rounding errors
+        if (pool_ns_out_ref[i] - pool_ns_out_opt[i] >= 2 || pool_ns_out_opt[i] - pool_ns_out_ref[i] >= 2)
+        {
+            printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_ns_out_ref[i], pool_ns_out_opt[i]);
+            if_ave_pool_ns_match = false;
+        }
+    }
+    if (if_ave_pool_ns_match == true)
+    {
+        printf("Outputs match.\n");
+    }
+
+    delete[]test1;
+    delete[]test2;
+    delete[]test3;
+
+
+
+
+#endif//TEST_POOL_NS
+

 #ifdef TEST_RELU

From f42d943cb813556c8d1888fd774c9d4f189bb504 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn
Date: Thu, 9 May 2019 14:08:24 -0700
Subject: [PATCH 3/7] updated reference and armnn functions

---
 CMSIS/NN/Include/arm_nnfunctions.h            | 69 +++++++++++++++++++
 .../Ref_Implementations/arm_pool_ref.c        | 12 ++--
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 9aa35fe03f..a0ad543b25 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -1144,6 +1144,40 @@ extern "C"
                             q7_t * bufferA,
                             q7_t * Im_out);

+  /**
+   * @brief Q7 max pooling function
+   *
@param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention along X axis + * @param[in] dim_im_in_y input tensor dimention along Y axis + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel_x filter kernel size along X axis + * @param[in] dim_kernel_y filter kernel size along Y axis + * @param[in] padding_x padding sizes along X axis + * @param[in] padding_y padding sizes along Y axis + * @param[in] stride_x convolution stride along X axis + * @param[in] stride_y convolution stride along Y axis + * @param[in] dim_im_out_x output tensor dimension along X axis + * @param[in] dim_im_out_y output tensor dimension along Y axis + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + */ + +void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q7_t * bufferA, + q7_t * Im_out); + /** * @brief Q7 average pooling function * @param[in] Im_in pointer to input tensor @@ -1169,6 +1203,41 @@ extern "C" q7_t * bufferA, q7_t * Im_out); + /** + * @brief Q7 average pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention along X axis + * @param[in] dim_im_in_y input tensor dimention along Y axis + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel_x filter kernel size along X axis + * @param[in] dim_kernel_y filter kernel size along Y axis + * @param[in] padding_x padding sizes along X axis + * @param[in] padding_y padding sizes along Y axis + * @param[in] stride_x convolution stride along X axis + * @param[in] stride_y convolution stride along Y 
axis + * @param[in] dim_im_out_x output tensor dimension along X axis + * @param[in] dim_im_out_y output tensor dimension along Y axis + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + */ + +void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q7_t * bufferA, + q7_t * Im_out); + + /** * @defgroup Softmax Softmax Functions * diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c index 9c7f2d8155..5701edecc6 100644 --- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c +++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c @@ -123,9 +123,11 @@ arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in, { int sum = 0; int count = 0; - for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + int16_t y_start = i_y * stride_y - padding_y; + int16_t x_start = i_x * stride_x - padding_x; + for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++) { - for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++) { if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) { @@ -168,9 +170,11 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in, for (i_x = 0; i_x < dim_im_out_x; i_x++) { int max = -129; - for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) + int16_t y_start = i_y * stride_y - padding_y; + int16_t x_start = i_x * stride_x - padding_x; + for (k_y = y_start; k_y < y_start + 
dim_kernel_y; k_y++) { - for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) + for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++) { if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) { From 4d6b1edbd32eba84efa7ddfc7606f5d3a1b19f37 Mon Sep 17 00:00:00 2001 From: Quinn Abrahams-Vaughn Date: Thu, 9 May 2019 14:18:32 -0700 Subject: [PATCH 4/7] added 1d inline wrappers for pooling --- CMSIS/NN/Include/arm_nnfunctions.h | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h index a0ad543b25..2ad0d60253 100644 --- a/CMSIS/NN/Include/arm_nnfunctions.h +++ b/CMSIS/NN/Include/arm_nnfunctions.h @@ -1178,6 +1178,33 @@ void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in, q7_t * bufferA, q7_t * Im_out); +static inline +void arm_maxpool_q7_HWC_1d(q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, + q7_t * Im_out) { + arm_maxpool_q7_HWC_nonsquare( + Im_in, //q7_t * Im_in, + dim_im_in, //const uint16_t dim_im_in_x, + 1, //const uint16_t dim_im_in_y, + ch_im_in, //const uint16_t ch_im_in, + dim_kernel, //const uint16_t dim_kernel_x, + 1, //const uint16_t dim_kernel_y, + padding, //const uint16_t padding_x, + 0, //const uint16_t padding_y, + stride, //const uint16_t stride_x, + 0, //const uint16_t stride_y, + dim_im_out, //const uint16_t dim_im_out_x, + 1, //const uint16_t dim_im_out_y, + bufferA, //q7_t * bufferA, + Im_out); //q7_t * Im_out); +} + /** * @brief Q7 average pooling function * @param[in] Im_in pointer to input tensor @@ -1237,6 +1264,33 @@ void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in, q7_t * bufferA, q7_t * Im_out); +static inline +void arm_avepool_q7_HWC_1d(q7_t * Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const 
uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t * bufferA, + q7_t * Im_out) { + arm_avepool_q7_HWC_nonsquare( + Im_in, //q7_t * Im_in, + dim_im_in, //const uint16_t dim_im_in_x, + 1, //const uint16_t dim_im_in_y, + ch_im_in, //const uint16_t ch_im_in, + dim_kernel, //const uint16_t dim_kernel_x, + 1, //const uint16_t dim_kernel_y, + padding, //const uint16_t padding_x, + 0, //const uint16_t padding_y, + stride, //const uint16_t stride_x, + 0, //const uint16_t stride_y, + dim_im_out, //const uint16_t dim_im_out_x, + 1, //const uint16_t dim_im_out_y, + bufferA, //q7_t * bufferA, + Im_out); //q7_t * Im_out); +} + /** * @defgroup Softmax Softmax Functions From 07591ee55dae26bab97a56eaf54262292c8bfbd5 Mon Sep 17 00:00:00 2001 From: Quinn Abrahams-Vaughn Date: Thu, 9 May 2019 15:20:21 -0700 Subject: [PATCH 5/7] added 1d pooling functions and cleaned up 1d wrappers, plus built basic test for 1d pooling --- CMSIS/NN/Include/arm_nnfunctions.h | 178 ++++----------- .../Ref_Implementations/arm_pool_ref.c | 72 +++++- .../nn_test/arm_nnexamples_nn_test.cpp | 144 +++++++++--- .../Source/PoolingFunctions/arm_pool_q7_HWC.c | 207 ++++++++++++++++++ 4 files changed, 438 insertions(+), 163 deletions(-) diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h index 2ad0d60253..1294b1998e 100644 --- a/CMSIS/NN/Include/arm_nnfunctions.h +++ b/CMSIS/NN/Include/arm_nnfunctions.h @@ -468,45 +468,6 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d( q15_t * bufferA, q7_t * bufferB); -static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d( - const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, - q7_t * bufferB ) { - return 
arm_convolve_1x1_HWC_q7_fast_nonsquare( - Im_in, //const q7_t * Im_in - dim_im_in, //const uint16_t dim_im_in_x, - 1, //const uint16_t dim_im_in_y, - ch_im_in, //const uint16_t ch_im_in, - wt, //const q7_t * wt, - ch_im_out, //const uint16_t ch_im_out, - dim_kernel, //const uint16_t dim_kernel_x, - 1, //const uint16_t dim_kernel_y, - padding, //const uint16_t padding_x, - 1, //const uint16_t padding_y, - stride, //const uint16_t stride_x, - 1, //const uint16_t stride_y, - bias, //const q7_t * bias, - bias_shift, //const uint16_t bias_shift, - out_shift, //const uint16_t out_shift, - Im_out, //q7_t * Im_out, - dim_im_out, //const uint16_t dim_im_out_x, - 1, //const uint16_t dim_im_out_y, - bufferA, //q15_t * bufferA, - bufferB); //q7_t * bufferB); -} - /** * @brief Q7 version of convolution for RGB image * @param[in] Im_in pointer to input tensor @@ -619,7 +580,7 @@ static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d( * * Buffer size: * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y * * bufferB size: 0 * @@ -786,45 +747,6 @@ static inline arm_status arm_convolve_HWC_q15_fast_1d( q15_t * bufferA, q7_t * bufferB); -static inline arm_status arm_depthwise_separable_conv_HWC_q7_1d( - const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA, - q7_t * bufferB ) { - return arm_depthwise_separable_conv_HWC_q7_nonsquare( - Im_in, //const q7_t * Im_in - dim_im_in, //const uint16_t dim_im_in_x, - 1, //const uint16_t dim_im_in_y, - ch_im_in, //const uint16_t ch_im_in, - wt, //const q7_t * wt, - ch_im_out, //const uint16_t ch_im_out, - dim_kernel, //const uint16_t dim_kernel_x, - 1, //const uint16_t dim_kernel_y, - padding, //const 
uint16_t padding_x, - 1, //const uint16_t padding_y, - stride, //const uint16_t stride_x, - 1, //const uint16_t stride_y, - bias, //const q7_t * bias, - bias_shift, //const uint16_t bias_shift, - out_shift, //const uint16_t out_shift, - Im_out, //q7_t * Im_out, - dim_im_out, //const uint16_t dim_im_out_x, - 1, //const uint16_t dim_im_out_y, - bufferA, //q15_t * bufferA, - bufferB); //q7_t * bufferB); - -} /** * @defgroup FC Fully-connected Layer Functions * @@ -1178,32 +1100,29 @@ void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in, q7_t * bufferA, q7_t * Im_out); -static inline -void arm_maxpool_q7_HWC_1d(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) { - arm_maxpool_q7_HWC_nonsquare( - Im_in, //q7_t * Im_in, - dim_im_in, //const uint16_t dim_im_in_x, - 1, //const uint16_t dim_im_in_y, - ch_im_in, //const uint16_t ch_im_in, - dim_kernel, //const uint16_t dim_kernel_x, - 1, //const uint16_t dim_kernel_y, - padding, //const uint16_t padding_x, - 0, //const uint16_t padding_y, - stride, //const uint16_t stride_x, - 0, //const uint16_t stride_y, - dim_im_out, //const uint16_t dim_im_out_x, - 1, //const uint16_t dim_im_out_y, - bufferA, //q7_t * bufferA, - Im_out); //q7_t * Im_out); -} + /** + * @brief Q7 1-D max pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. 
+ */ + +void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input image + const uint16_t dim_im_in, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel, // window kernel size + const uint16_t padding, // padding sizes + const uint16_t stride, // stride + const uint16_t dim_im_out, // output image dimension + q7_t * bufferA, // a buffer for local storage + q7_t * Im_out); /** * @brief Q7 average pooling function @@ -1264,32 +1183,29 @@ void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in, q7_t * bufferA, q7_t * Im_out); -static inline -void arm_avepool_q7_HWC_1d(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) { - arm_avepool_q7_HWC_nonsquare( - Im_in, //q7_t * Im_in, - dim_im_in, //const uint16_t dim_im_in_x, - 1, //const uint16_t dim_im_in_y, - ch_im_in, //const uint16_t ch_im_in, - dim_kernel, //const uint16_t dim_kernel_x, - 1, //const uint16_t dim_kernel_y, - padding, //const uint16_t padding_x, - 0, //const uint16_t padding_y, - stride, //const uint16_t stride_x, - 0, //const uint16_t stride_y, - dim_im_out, //const uint16_t dim_im_out_x, - 1, //const uint16_t dim_im_out_y, - bufferA, //q7_t * bufferA, - Im_out); //q7_t * Im_out); -} + /** + * @brief Q7 1-D average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. 
+ */ + +void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image + const uint16_t dim_im_in, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel, // window kernel size + const uint16_t padding, // padding sizes + const uint16_t stride, // stride + const uint16_t dim_im_out, // output image dimension + q7_t * bufferA, // a buffer for local storage + q7_t * Im_out); /** diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c index 5701edecc6..2292e181ff 100644 --- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c +++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c @@ -110,8 +110,6 @@ arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in, const uint16_t dim_im_out_y, q7_t * bufferA, q7_t * Im_out) { - /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ - int16_t i_ch_in, i_x, i_y; int16_t k_x, k_y; @@ -158,8 +156,6 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in, const uint16_t dim_im_out_y, q7_t * bufferA, q7_t * Im_out) { - /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ - int16_t i_ch_in, i_x, i_y; int16_t k_x, k_y; @@ -190,3 +186,71 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in, } } } + + +void +arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in, // input image + const uint16_t dim_im_in, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel, // window kernel size + const uint16_t padding, // padding sizes + const uint16_t stride, // stride + const uint16_t dim_im_out, // output image dimension + q7_t * bufferA, // a buffer for local storage + q7_t * Im_out) { + int16_t i_ch_in, i; + int16_t k; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i = 0; i < dim_im_out; i++) + { + int sum = 0; + int count = 0; + int16_t start = i * stride - 
padding; + for (k = start; k < start + dim_kernel; k++) + { + if (k >= 0 && k < dim_im_in) + { + sum += Im_in[i_ch_in + ch_im_in * k]; + count++; + } + } + Im_out[i_ch_in + ch_im_in * i] = sum/count; + } + } +} + +void +arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in, // input image + const uint16_t dim_im_in, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel, // window kernel size + const uint16_t padding, // padding sizes + const uint16_t stride, // stride + const uint16_t dim_im_out, // output image dimension + q7_t * bufferA, // a buffer for local storage + q7_t * Im_out) { + int16_t i_ch_in, i; + int16_t k; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i = 0; i < dim_im_out; i++) + { + int max = -129; + int16_t start = i * stride - padding; + for (k = start; k < start + dim_kernel; k++) + { + if (k >= 0 && k < dim_im_in) + { + if (Im_in[i_ch_in + ch_im_in * k] > max) + { + max = Im_in[i_ch_in + ch_im_in * k]; + } + } + } + Im_out[i_ch_in + ch_im_in * i] = max; + } + } +} diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp index be6769d62d..efc1fbf571 100644 --- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp +++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp @@ -38,15 +38,16 @@ #include "arm_nnexamples_nn_test.h" -//#define TEST_SIGMOID -//#define TEST_TANH +#define TEST_SIGMOID +#define TEST_TANH #define TEST_POOL #define TEST_POOL_NS -// #define TEST_RELU -// #define TEST_IP -// #define TEST_CONV -// #define TEST_NONSQUARE -// #define TEST_NNMULT +#define TEST_POOL_1D +#define TEST_RELU +#define TEST_IP +#define TEST_CONV +#define TEST_NONSQUARE +#define TEST_NNMULT int test_index = 0; q7_t test_flags[50]; @@ -295,76 +296,77 @@ int main() #ifdef TEST_POOL_NS -#define POOL_NS_IM_DIM 64 +#define POOL_NS_IM_DIM1 64 +#define POOL_NS_IM_DIM2 16 #define POOL_NS_IM_CH 4 - test1 = new 
q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH * 2]; - test2 = new q15_t[POOL_NS_IM_DIM * POOL_NS_IM_CH]; - test3 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH]; + test1 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH * 2]; + test2 = new q15_t[(POOL_NS_IM_DIM1 + POOL_NS_IM_DIM2) * POOL_NS_IM_CH]; + test3 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH]; - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) { test1[i] = (rand() % 256 - 128); } - q7_t *img_in = test1 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; + q7_t *img_in = test1 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; q7_t *pool_out_ref = test3; - q7_t *pool_out_opt = test3 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH / 2; + q7_t *pool_out_opt = test3 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH / 2; - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) { test3[i] = 0; } // copy over the img input - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) { img_in[i] = test1[i]; } - initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH); + initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH); printf("Start maxpool reference implementation\n"); - arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref) + arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref) // copy over the img input - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) { img_in[i] = test1[i]; } 
printf("Start maxpool opt implementation\n"); - arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt) + arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, 1, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt) - verify_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH); + verify_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH); // copy over the img input - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) { img_in[i] = test1[i]; } - initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH); + initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1/2) * (POOL_NS_IM_DIM2/2) * POOL_NS_IM_CH); printf("Start maxpool reference implementation\n"); - arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref) + arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref) // copy over the img input - for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++) + for (int i = 0; i < POOL_NS_IM_DIM1 * 1 * POOL_NS_IM_CH; i++) { img_in[i] = test1[i]; } printf("Start maxpool opt implementation\n"); - arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt) + arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt) // special check here bool if_ave_pool_match = true; - for (int i = 0; i < POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH; i++) + for 
(int i = 0; i < (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH; i++) { // we tolerate at most difference of 1 here because of rounding errors if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2) @@ -382,11 +384,97 @@ int main() delete[]test2; delete[]test3; +#endif//TEST_POOL_NS +#ifdef TEST_POOL_1D -#endif//TEST_POOL_NS +#define POOL_IM_DIM_1D 32 +#define POOL_IM_CH_1D 8 + + test1 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D * 2]; + test2 = new q15_t[POOL_IM_CH_1D]; + test3 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D]; + + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + test1[i] = (rand() % 256 - 128); + } + + q7_t *img_in = test1 + POOL_IM_DIM_1D * POOL_IM_CH_1D; + q7_t *pool_out_ref = test3; + q7_t *pool_out_opt = test3 + POOL_IM_DIM_1D * POOL_IM_CH_1D / 2; + + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + test3[i] = 0; + } + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D); + + printf("Start maxpool reference implementation\n"); + + arm_maxpool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref); + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + printf("Start maxpool opt implementation\n"); + + arm_maxpool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt); + + verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D); + + // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + printf("Start avepool ref implementation\n"); + + arm_avepool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref); + 
+ // copy over the img input + for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++) + { + img_in[i] = test1[i]; + } + + printf("Start avepool opt implementation\n"); + + arm_avepool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt); + + // special check here + bool if_ave_pool_match = true; + for (int i = 0; i < POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D; i++) + { + // we tolerate at most difference of 1 here because of rounding errors + if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2) + { + printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]); + if_ave_pool_match = false; + } + } + if (if_ave_pool_match == true) + { + printf("Outputs match.\n"); + } + + delete[]test1; + delete[]test2; + delete[]test3; +#endif//TEST_POOL_1D #ifdef TEST_RELU diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c index 96afe033fc..83bff56939 100644 --- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c +++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -790,6 +790,213 @@ arm_avepool_q7_HWC_nonsquare(q7_t * Im_in, #endif /* ARM_MATH_DSP */ +} + + /** + * @brief Q7 1-D average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in (ch_im_in q15_t values) + * + * The pooling function is implemented along one axis + * + * Im_in is declared const and is only read by this function; the result + * is written to Im_out. 
+ * + */ + +void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image + const uint16_t dim_im_in, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel, // window kernel size + const uint16_t padding, // padding sizes + const uint16_t stride, // stride + const uint16_t dim_im_out, // output image dimension + q7_t * bufferA, // a buffer for local storage + q7_t * Im_out) { +#if defined (ARM_MATH_DSP) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + int16_t i, count; + q15_t *buffer = (q15_t *) bufferA; /* bufferA is used as ch_im_in q15_t accumulators, one per channel */ + for (i = 0; i < dim_im_out; i++) + { + + /* for each output pixel */ + q7_t *target = Im_out + i * ch_im_in; + const q7_t *start; + const q7_t *end; + /* setting the starting pixel, clamped to the start of the input */ + if (i * stride - padding < 0) + { + start = Im_in; + } else + { + start = Im_in + (i * stride - padding) * ch_im_in; + } + /* setting the stopping pixel, clamped to the end of the input */ + if (i * stride - padding + dim_kernel >= dim_im_in) + { + end = Im_in + dim_im_in * ch_im_in; + } else + { + end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in; + } + + /* load the first pixel of the window into the accumulators */ + arm_q7_to_q15_no_shift(start, buffer, ch_im_in); + count = 1; + + /* move over to the next pixel */ + start += ch_im_in; + + for (; start < end; start += ch_im_in) + { + accumulate_q7_to_q15(buffer, (q7_t *) start, ch_im_in); /* cast only drops const; helper is assumed to just read start - TODO confirm */ + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); /* count = number of pixels accumulated for this window */ + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + + int16_t i_ch_in, i; + int16_t k; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i = 0; i < dim_im_out; i++) + { + int sum = 0; + int count = 0; + int16_t start = i * stride - padding; + for (k = start; k < start + dim_kernel; k++) + { + if (k >= 0 && k < dim_im_in) /* positions that fall in the padding are skipped and not counted */ + { + sum += Im_in[i_ch_in + ch_im_in * k]; + count++; + } + } + Im_out[i_ch_in + ch_im_in * i] = sum/count; + } + } +#endif /* ARM_MATH_DSP */ +} + + /** + * @brief Q7 1-D max pooling 
function
+   * @param[in]       Im_in       pointer to input tensor (read-only)
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0 (bufferA is not referenced by this function)
+   *
+   * The pooling is performed along a single axis
+   *
+   * This function writes its results to Im_out; Im_in is const-qualified
+   * and is not written by the code in this function.
+   *
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in,           // input image
+                           const uint16_t dim_im_in,     // input image dimension
+                           const uint16_t ch_im_in,      // number of input image channels
+                           const uint16_t dim_kernel,    // window kernel size
+                           const uint16_t padding,       // padding sizes
+                           const uint16_t stride,        // stride
+                           const uint16_t dim_im_out,    // output image dimension
+                           q7_t * bufferA,               // a buffer for local storage
+                           q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    int16_t   i;
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    for (i = 0; i < dim_im_out; i++)
+    {
+        /* take the element-wise maximum over the input rows of window i, clipped to [0, dim_im_in) */
+        /* for each output row */
+        q7_t     *target = Im_out + i * ch_im_in;
+        q7_t     *start;
+        q7_t     *end;
+        /* setting the starting row; Im_in is const, the casts only keep the q7_t * cursors free of qualifier warnings */
+        if (i * stride - padding < 0)
+        {
+            start = (q7_t *) Im_in;
+        } else
+        {
+            start = (q7_t *) Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping row (one past the last row of the window) */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = (q7_t *) Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = (q7_t *) Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* it seeds the running maximum held in the output row */
+        memmove(target, start, ch_im_in);
+
+        /* move over to next row; consecutive rows are ch_im_in elements apart */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, start, ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129;   /* one below -128, so the first in-range sample always replaces it */
+            int16_t   start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+#endif /* ARM_MATH_DSP */
 }

From 6d2e9b9059918616fed87399a68cc088c2d74951 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn
Date: Thu, 9 May 2019 15:24:29 -0700
Subject: [PATCH 6/7] cleaned up comments on 1d functions

---
 CMSIS/NN/Include/arm_nnfunctions.h | 60 ++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 1294b1998e..471c7a1448 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,26 @@ extern "C"
                                          q15_t * bufferA,
                                          q7_t * bufferB);
 
+  /**
+   * @brief Basic Q7 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimention y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size y
+   * @param[in]       padding      padding size y
+   * @param[in]       stride.      convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out.
output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + */ + static inline arm_status arm_convolve_HWC_q7_basic_nonsquare( const q7_t * Im_in, const uint16_t dim_im_in, @@ -374,6 +394,26 @@ static inline arm_status arm_convolve_HWC_q7_basic_nonsquare( q15_t * bufferA, q7_t * bufferB); + /** + * @brief Fast Q7 convolution function (1D) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size y + * @param[in] padding padding size y + * @param[in] stride. convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out. output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + */ + static inline arm_status arm_convolve_HWC_q7_fast_1d( const q7_t * Im_in, const uint16_t dim_im_in, @@ -614,6 +654,26 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d( q15_t * bufferA, q7_t * bufferB); + + /** + * @brief Fast Q15 convolution function (1D) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size y + * @param[in] padding padding size y + * @param[in] stride. 
convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out. output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns ARM_MATH_SUCCESS + */ static inline arm_status arm_convolve_HWC_q15_fast_1d( const q7_t * Im_in, const uint16_t dim_im_in, From cdfce23bb2fd8c95b16a801ccc36ee694d4af9aa Mon Sep 17 00:00:00 2001 From: Quinn Abrahams-Vaughn Date: Thu, 9 May 2019 15:33:27 -0700 Subject: [PATCH 7/7] kept warnings from fast implementations --- CMSIS/NN/Include/arm_nnfunctions.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h index 471c7a1448..05c348f117 100644 --- a/CMSIS/NN/Include/arm_nnfunctions.h +++ b/CMSIS/NN/Include/arm_nnfunctions.h @@ -224,7 +224,7 @@ extern "C" * @return The function returns ARM_MATH_SUCCESS */ -static inline arm_status arm_convolve_HWC_q7_basic_nonsquare( +static inline arm_status arm_convolve_HWC_q7_basic_1d( const q7_t * Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, @@ -412,6 +412,12 @@ static inline arm_status arm_convolve_HWC_q7_basic_nonsquare( * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns ARM_MATH_SUCCESS + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 */ static inline arm_status arm_convolve_HWC_q7_fast_1d( @@ -673,6 +679,22 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d( * @param[in,out] bufferA pointer to buffer space for input * @param[in,out] bufferB pointer to buffer space for output * @return The function returns ARM_MATH_SUCCESS + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y + * + * bufferB size: 0 + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * */ static inline arm_status arm_convolve_HWC_q15_fast_1d( const q7_t * Im_in,