From ac045f077325810fab4cd80460a7cf9b88fa28cf Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 12:53:21 -0700
Subject: [PATCH 1/7] created first pass at nonsquare pooling functions

---
 .../Source/PoolingFunctions/arm_pool_q7_HWC.c | 350 ++++++++++++++++++
 1 file changed, 350 insertions(+)
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index b451f5e695..96afe033fc 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -286,6 +286,179 @@ arm_maxpool_q7_HWC(q7_t * Im_in,
 
 }
 
+  /**
+   * @brief Q7 max pooling function (non-square shape)
+   * @param[in,out]   Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding size along X axis
+   * @param[in]       padding_y     padding size along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space (not used by this function)
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+
+    /* first does the pooling along x axis, in place inside Im_in */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            /* memmove rather than arm_copy_q7: target and win_start may overlap */
+            memmove(target, win_start, ch_im_in);
+
+            /* start the max operation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* then does the pooling along y axis, writing the result to Im_out */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+        row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+        row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* only the first dim_im_out_x pixels of each input row hold x-pooled data */
+        memmove(target, row_start, dim_im_out_x * ch_im_in);
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out_x * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
   /**
    * @brief Q7 average pooling function
    * @param[in,out]   Im_in       pointer to input tensor
@@ -443,6 +616,183 @@ arm_avepool_q7_HWC(q7_t * Im_in,
 
 }
 
+  /**
+   * @brief Q7 average pooling function (non-square shape)
+   * @param[in,out]   Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding size along X axis
+   * @param[in]       padding_y     padding size along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to scratch space (q15 accumulators on the ARM_MATH_DSP path)
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  2*dim_im_out_x*ch_im_in
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+    int16_t   count = 0;
+    q15_t    *buffer = (q15_t *) bufferA;   /* q15 accumulators live in the caller's scratch space */
+    /* first does the pooling along x axis, in place inside Im_in */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* for each output pixel */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* start the accumulation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* then does the pooling along y axis, writing the result to Im_out */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+        row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+        row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* copy over the first row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in);
+        count = 1;
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out_x * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int sum = 0;
+                int count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum/count;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
 /**
  * @} end of Pooling group
  */

From 1d2a75e50c6e8a2cf7bf7f68785845a205deb606 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 13:57:38 -0700
Subject: [PATCH 2/7] added ref function and proposed test

---
 CMSIS/NN/Include/arm_nnfunctions.h            | 197 +++++++++++++++++-
 .../Ref_Implementations/arm_pool_ref.c        |  92 ++++++++
 .../Ref_Implementations/ref_functions.h       |  33 +++
 .../nn_test/arm_nnexamples_nn_test.cpp        | 114 ++++++++--
 4 files changed, 423 insertions(+), 13 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index c6ec83a453..9aa35fe03f 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,45 @@ extern    "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+static inline arm_status arm_convolve_HWC_q7_basic_1d( // 1-D wrapper: treats the input as a dim_im_in x 1 image
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q7_basic_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y (unit-height axis must not be padded),
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Basic Q15 convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -335,6 +374,45 @@ extern    "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+static inline arm_status arm_convolve_HWC_q7_fast_1d( // 1-D wrapper: treats the input as a dim_im_in x 1 image
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q7_fast_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y (unit-height axis must not be padded),
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
    * @param[in]       Im_in        pointer to input tensor
@@ -390,6 +468,45 @@ extern    "C"
                                                       q15_t * bufferA,
                                                       q7_t * bufferB);
 
+static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d( // 1-D wrapper: treats the input as a dim_im_in x 1 image
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_1x1_HWC_q7_fast_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y (unit-height axis must not be padded),
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Q7 version of convolution for RGB image
    * @param[in]       Im_in       pointer to input tensor
@@ -535,7 +652,47 @@ extern    "C"
                               const uint16_t dim_im_out_y, 
                               q15_t * bufferA, 
                               q7_t * bufferB);
-										 
+			
+static inline arm_status arm_convolve_HWC_q15_fast_1d( // 1-D wrapper: treats the input as a dim_im_in x 1 image
+  const q15_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q15_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q15_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q15_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q15_fast_nonsquare(
+          Im_in,      //const q15_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q15_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y (unit-height axis must not be padded),
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q15_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q15_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
+
   /**
    * @brief Q7 depthwise separable convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -629,7 +786,45 @@ extern    "C"
                                                              q15_t * bufferA,
                                                              q7_t * bufferB);
 
+static inline arm_status arm_depthwise_separable_conv_HWC_q7_1d( // 1-D wrapper: treats the input as a dim_im_in x 1 image
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_depthwise_separable_conv_HWC_q7_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y (unit-height axis must not be padded),
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
 
+}
 /**
  * @defgroup FC Fully-connected Layer Functions
  *
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9a4adb29ab..9c7f2d8155 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -94,3 +94,95 @@ void arm_maxpool_q7_HWC_ref(const q7_t * Im_in, // input image
         }
     }
 }
+
+void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out) {
+    /* Reference average pooling: averages only in-bounds samples; bufferA is not used */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int sum = 0;
+                int count = 0;
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum/count;
+            }
+        }
+    }
+}
+
+
+void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out) {
+    /* Reference max pooling: per channel, take the maximum over the in-bounds part of each window */
+    int16_t   ch, out_x, out_y, win_x, win_y;
+
+    for (ch = 0; ch < ch_im_in; ch++)
+    {
+        for (out_y = 0; out_y < dim_im_out_y; out_y++)
+        {
+            for (out_x = 0; out_x < dim_im_out_x; out_x++)
+            {
+                int       best = -129;   /* starting value compared against every in-bounds sample */
+                for (win_y = out_y * stride_y - padding_y; win_y < out_y * stride_y - padding_y + dim_kernel_y; win_y++)
+                {
+                    for (win_x = out_x * stride_x - padding_x; win_x < out_x * stride_x - padding_x + dim_kernel_x; win_x++)
+                    {
+                        /* window positions outside the input do not contribute */
+                        if (win_y < 0 || win_x < 0 || win_y >= dim_im_in_y || win_x >= dim_im_in_x)
+                        {
+                            continue;
+                        }
+                        if (Im_in[ch + ch_im_in * (win_x + win_y * dim_im_in_x)] > best)
+                        {
+                            best = Im_in[ch + ch_im_in * (win_x + win_y * dim_im_in_x)];
+                        }
+                    }
+                }
+                Im_out[ch + ch_im_in * (out_x + out_y * dim_im_out_x)] = best;
+            }
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index 4a0647a9bd..c1cee95846 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -229,6 +229,39 @@ extern    "C"
                                      q7_t * bufferA,    // a buffer for local storage
                                      q7_t * Im_out);
 
+    void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,     // input image
+                   const uint16_t dim_im_in_x,     // input image dimension along X
+                   const uint16_t dim_im_in_y,     // input image dimension along Y
+                   const uint16_t ch_im_in,        // number of input image channels
+                   const uint16_t dim_kernel_x,    // pooling window size along X
+                   const uint16_t dim_kernel_y,    // pooling window size along Y
+                   const uint16_t padding_x,       // padding size along X
+                   const uint16_t padding_y,       // padding size along Y
+                   const uint16_t stride_x,        // stride along X
+                   const uint16_t stride_y,        // stride along Y
+                   const uint16_t dim_im_out_x,    // output image dimension along X
+                   const uint16_t dim_im_out_y,    // output image dimension along Y
+                   q7_t * bufferA,                 // not referenced by the reference implementation
+                   q7_t * Im_out);                 // output image
+
+
+    void
+arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,     // input image
+                   const uint16_t dim_im_in_x,     // input image dimension along X
+                   const uint16_t dim_im_in_y,     // input image dimension along Y
+                   const uint16_t ch_im_in,        // number of input image channels
+                   const uint16_t dim_kernel_x,    // pooling window size along X
+                   const uint16_t dim_kernel_y,    // pooling window size along Y
+                   const uint16_t padding_x,       // padding size along X
+                   const uint16_t padding_y,       // padding size along Y
+                   const uint16_t stride_x,        // stride along X
+                   const uint16_t stride_y,        // stride along Y
+                   const uint16_t dim_im_out_x,    // output image dimension along X
+                   const uint16_t dim_im_out_y,    // output image dimension along Y
+                   q7_t * bufferA,                 // not referenced by the reference implementation
+                   q7_t * Im_out);                 // output image
+
 /*
  *
  * Other reference implemenation
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 5cf72a2a05..be6769d62d 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -41,11 +41,12 @@
 //#define TEST_SIGMOID
 //#define TEST_TANH
 #define TEST_POOL
-#define TEST_RELU
-#define TEST_IP
-#define TEST_CONV
-#define TEST_NONSQUARE
-#define TEST_NNMULT
+#define TEST_POOL_NS
+// #define TEST_RELU
+// #define TEST_IP
+// #define TEST_CONV
+// #define TEST_NONSQUARE
+// #define TEST_NNMULT
 
 int test_index = 0;
 q7_t test_flags[50];
@@ -256,12 +257,6 @@ int main()
         img_in[i] = test1[i];
     }
 
-    // copy over the img input
-    for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
-    {
-        img_in[i] = test1[i];
-    }
-
     printf("Start avepool ref implementation\n");
 
     arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
@@ -296,7 +291,102 @@ int main()
     delete[]test2;
     delete[]test3;
 
-#endif
+#endif//TEST_POOL
+
+#ifdef TEST_POOL_NS
+
+#define POOL_NS_IM_DIM 64
+#define POOL_NS_IM_CH 4
+
+    test1 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH * 2];
+    test2 = new q15_t[POOL_NS_IM_DIM * POOL_NS_IM_CH];
+    test3 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH];
+
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        test1[i] = (rand() % 256 - 128);
+    }
+
+    q7_t     *img_in = test1 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH;
+    q7_t     *pool_out_ref = test3;
+    q7_t     *pool_out_opt = test3 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH / 2;
+
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        test3[i] = 0;
+    }
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref)
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt)
+
+    verify_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref)
+
+    // copy over the img input
+    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt)
+
+    // special check here
+    bool      if_ave_pool_match = true;
+    for (int i = 0; i < POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH; i++)
+    {
+        // we tolerate at most difference of 1 here because of rounding errors
+        if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+        {
+            printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+            if_ave_pool_match = false;
+        }
+    }
+    if (if_ave_pool_match == true)
+    {
+        printf("Outputs match.\n");
+    }
+
+    delete[]test1;
+    delete[]test2;
+    delete[]test3;
+
+
+
+
+#endif//TEST_POOL_NS
+
 
 #ifdef TEST_RELU
 

From f42d943cb813556c8d1888fd774c9d4f189bb504 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 14:08:24 -0700
Subject: [PATCH 3/7] updated reference and armnn functions

---
 CMSIS/NN/Include/arm_nnfunctions.h            | 69 +++++++++++++++++++
 .../Ref_Implementations/arm_pool_ref.c        | 12 ++--
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 9aa35fe03f..a0ad543b25 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -1144,6 +1144,40 @@ extern    "C"
                                  q7_t * bufferA, 
                                  q7_t * Im_out);
 
+    /**
+   * @brief Q7 max pooling function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      convolution stride along X axis
+   * @param[in]       stride_y      convolution stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA, 
+                   q7_t * Im_out);
+
   /**
    * @brief Q7 average pooling function
    * @param[in]       Im_in       pointer to input tensor
@@ -1169,6 +1203,41 @@ extern    "C"
                                  q7_t * bufferA, 
                                  q7_t * Im_out);
 
+  /**
+   * @brief Q7 average pooling function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      convolution stride along X axis
+   * @param[in]       stride_y      convolution stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA, 
+                   q7_t * Im_out);
+
+
 /**
  * @defgroup Softmax Softmax Functions
  *
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9c7f2d8155..5701edecc6 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -123,9 +123,11 @@ arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
             {
                 int sum = 0;
                 int count = 0;
-                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                int16_t y_start = i_y * stride_y - padding_y;
+                int16_t x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
                 {
-                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
                     {
                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
                         {
@@ -168,9 +170,11 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
             for (i_x = 0; i_x < dim_im_out_x; i_x++)
             {
                 int       max = -129;
-                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                int16_t y_start = i_y * stride_y - padding_y;
+                int16_t x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
                 {
-                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
                     {
                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
                         {

From 4d6b1edbd32eba84efa7ddfc7606f5d3a1b19f37 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 14:18:32 -0700
Subject: [PATCH 4/7] added 1d inline wrappers for pooling

---
 CMSIS/NN/Include/arm_nnfunctions.h | 54 ++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index a0ad543b25..2ad0d60253 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -1178,6 +1178,33 @@ void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
                    q7_t * bufferA, 
                    q7_t * Im_out);
 
+static inline
+void      arm_maxpool_q7_HWC_1d(q7_t * Im_in,
+                             const uint16_t dim_im_in,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel,
+                             const uint16_t padding,
+                             const uint16_t stride, 
+                             const uint16_t dim_im_out, 
+                             q7_t * bufferA, 
+                             q7_t * Im_out) {
+  arm_maxpool_q7_HWC_nonsquare(
+    Im_in,        //q7_t * Im_in,
+    dim_im_in,    //const uint16_t dim_im_in_x,
+    1,            //const uint16_t dim_im_in_y,
+    ch_im_in,     //const uint16_t ch_im_in,
+    dim_kernel,   //const uint16_t dim_kernel_x,
+    1,            //const uint16_t dim_kernel_y,
+    padding,      //const uint16_t padding_x,
+    0,            //const uint16_t padding_y,
+    stride,       //const uint16_t stride_x,
+    0,            //const uint16_t stride_y,
+    dim_im_out,   //const uint16_t dim_im_out_x,
+    1,            //const uint16_t dim_im_out_y,
+    bufferA,      //q7_t * bufferA, 
+    Im_out);      //q7_t * Im_out);
+}
+
   /**
    * @brief Q7 average pooling function
    * @param[in]       Im_in       pointer to input tensor
@@ -1237,6 +1264,33 @@ void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
                    q7_t * bufferA, 
                    q7_t * Im_out);
 
+static inline
+void      arm_avepool_q7_HWC_1d(q7_t * Im_in,
+                             const uint16_t dim_im_in,
+                             const uint16_t ch_im_in,
+                             const uint16_t dim_kernel,
+                             const uint16_t padding,
+                             const uint16_t stride, 
+                             const uint16_t dim_im_out, 
+                             q7_t * bufferA, 
+                             q7_t * Im_out) {
+  arm_avepool_q7_HWC_nonsquare(
+    Im_in,        //q7_t * Im_in,
+    dim_im_in,    //const uint16_t dim_im_in_x,
+    1,            //const uint16_t dim_im_in_y,
+    ch_im_in,     //const uint16_t ch_im_in,
+    dim_kernel,   //const uint16_t dim_kernel_x,
+    1,            //const uint16_t dim_kernel_y,
+    padding,      //const uint16_t padding_x,
+    0,            //const uint16_t padding_y,
+    stride,       //const uint16_t stride_x,
+    0,            //const uint16_t stride_y,
+    dim_im_out,   //const uint16_t dim_im_out_x,
+    1,            //const uint16_t dim_im_out_y,
+    bufferA,      //q7_t * bufferA, 
+    Im_out);      //q7_t * Im_out);
+}
+
 
 /**
  * @defgroup Softmax Softmax Functions

From 07591ee55dae26bab97a56eaf54262292c8bfbd5 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 15:20:21 -0700
Subject: [PATCH 5/7] added 1d pooling functions and cleaned up 1d wrappers,
 plus built basic test for 1d pooling

---
 CMSIS/NN/Include/arm_nnfunctions.h            | 178 ++++-----------
 .../Ref_Implementations/arm_pool_ref.c        |  72 +++++-
 .../nn_test/arm_nnexamples_nn_test.cpp        | 144 +++++++++---
 .../Source/PoolingFunctions/arm_pool_q7_HWC.c | 207 ++++++++++++++++++
 4 files changed, 438 insertions(+), 163 deletions(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 2ad0d60253..1294b1998e 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -468,45 +468,6 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d(
                                                       q15_t * bufferA,
                                                       q7_t * bufferB);
 
-static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d(
-  const q7_t * Im_in,
-  const uint16_t dim_im_in,
-  const uint16_t ch_im_in,
-  const q7_t * wt,
-  const uint16_t ch_im_out,
-  const uint16_t dim_kernel,
-  const uint16_t padding,
-  const uint16_t stride,
-  const q7_t * bias,
-  const uint16_t bias_shift,
-  const uint16_t out_shift,
-  q7_t * Im_out,
-  const uint16_t dim_im_out,
-  q15_t * bufferA,
-  q7_t * bufferB ) {
-        return arm_convolve_1x1_HWC_q7_fast_nonsquare(
-          Im_in,      //const q7_t * Im_in
-          dim_im_in,  //const uint16_t dim_im_in_x,
-          1,          //const uint16_t dim_im_in_y,
-          ch_im_in,   //const uint16_t ch_im_in,
-          wt,         //const q7_t * wt,
-          ch_im_out,  //const uint16_t ch_im_out,
-          dim_kernel, //const uint16_t dim_kernel_x,
-          1,          //const uint16_t dim_kernel_y,
-          padding,    //const uint16_t padding_x,
-          1,          //const uint16_t padding_y,
-          stride,     //const uint16_t stride_x,
-          1,          //const uint16_t stride_y,
-          bias,       //const q7_t * bias,
-          bias_shift, //const uint16_t bias_shift,
-          out_shift,  //const uint16_t out_shift,
-          Im_out,     //q7_t * Im_out,
-          dim_im_out, //const uint16_t dim_im_out_x,
-          1,          //const uint16_t dim_im_out_y,
-          bufferA,    //q15_t * bufferA,
-          bufferB);   //q7_t * bufferB);
-}
-
   /**
    * @brief Q7 version of convolution for RGB image
    * @param[in]       Im_in       pointer to input tensor
@@ -619,7 +580,7 @@ static inline arm_status arm_convolve_1x1_HWC_q7_fast_1d(
    *
    * <b>Buffer size:</b>
    *
-   * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+   * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
    *
    * bufferB size: 0
    *
@@ -786,45 +747,6 @@ static inline arm_status arm_convolve_HWC_q15_fast_1d(
                                                              q15_t * bufferA,
                                                              q7_t * bufferB);
 
-static inline arm_status arm_depthwise_separable_conv_HWC_q7_1d(
-  const q7_t * Im_in,
-  const uint16_t dim_im_in,
-  const uint16_t ch_im_in,
-  const q7_t * wt,
-  const uint16_t ch_im_out,
-  const uint16_t dim_kernel,
-  const uint16_t padding,
-  const uint16_t stride,
-  const q7_t * bias,
-  const uint16_t bias_shift,
-  const uint16_t out_shift,
-  q7_t * Im_out,
-  const uint16_t dim_im_out,
-  q15_t * bufferA,
-  q7_t * bufferB ) {
-        return arm_depthwise_separable_conv_HWC_q7_nonsquare(
-          Im_in,      //const q7_t * Im_in
-          dim_im_in,  //const uint16_t dim_im_in_x,
-          1,          //const uint16_t dim_im_in_y,
-          ch_im_in,   //const uint16_t ch_im_in,
-          wt,         //const q7_t * wt,
-          ch_im_out,  //const uint16_t ch_im_out,
-          dim_kernel, //const uint16_t dim_kernel_x,
-          1,          //const uint16_t dim_kernel_y,
-          padding,    //const uint16_t padding_x,
-          1,          //const uint16_t padding_y,
-          stride,     //const uint16_t stride_x,
-          1,          //const uint16_t stride_y,
-          bias,       //const q7_t * bias,
-          bias_shift, //const uint16_t bias_shift,
-          out_shift,  //const uint16_t out_shift,
-          Im_out,     //q7_t * Im_out,
-          dim_im_out, //const uint16_t dim_im_out_x,
-          1,          //const uint16_t dim_im_out_y,
-          bufferA,    //q15_t * bufferA,
-          bufferB);   //q7_t * bufferB);
-
-}
 /**
  * @defgroup FC Fully-connected Layer Functions
  *
@@ -1178,32 +1100,29 @@ void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
                    q7_t * bufferA, 
                    q7_t * Im_out);
 
-static inline
-void      arm_maxpool_q7_HWC_1d(q7_t * Im_in,
-                             const uint16_t dim_im_in,
-                             const uint16_t ch_im_in,
-                             const uint16_t dim_kernel,
-                             const uint16_t padding,
-                             const uint16_t stride, 
-                             const uint16_t dim_im_out, 
-                             q7_t * bufferA, 
-                             q7_t * Im_out) {
-  arm_maxpool_q7_HWC_nonsquare(
-    Im_in,        //q7_t * Im_in,
-    dim_im_in,    //const uint16_t dim_im_in_x,
-    1,            //const uint16_t dim_im_in_y,
-    ch_im_in,     //const uint16_t ch_im_in,
-    dim_kernel,   //const uint16_t dim_kernel_x,
-    1,            //const uint16_t dim_kernel_y,
-    padding,      //const uint16_t padding_x,
-    0,            //const uint16_t padding_y,
-    stride,       //const uint16_t stride_x,
-    0,            //const uint16_t stride_y,
-    dim_im_out,   //const uint16_t dim_im_out_x,
-    1,            //const uint16_t dim_im_out_y,
-    bufferA,      //q7_t * bufferA, 
-    Im_out);      //q7_t * Im_out);
-}
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in, out]  Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  pooling window size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding size
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out); // output image
 
   /**
    * @brief Q7 average pooling function
@@ -1264,32 +1183,29 @@ void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
                    q7_t * bufferA, 
                    q7_t * Im_out);
 
-static inline
-void      arm_avepool_q7_HWC_1d(q7_t * Im_in,
-                             const uint16_t dim_im_in,
-                             const uint16_t ch_im_in,
-                             const uint16_t dim_kernel,
-                             const uint16_t padding,
-                             const uint16_t stride, 
-                             const uint16_t dim_im_out, 
-                             q7_t * bufferA, 
-                             q7_t * Im_out) {
-  arm_avepool_q7_HWC_nonsquare(
-    Im_in,        //q7_t * Im_in,
-    dim_im_in,    //const uint16_t dim_im_in_x,
-    1,            //const uint16_t dim_im_in_y,
-    ch_im_in,     //const uint16_t ch_im_in,
-    dim_kernel,   //const uint16_t dim_kernel_x,
-    1,            //const uint16_t dim_kernel_y,
-    padding,      //const uint16_t padding_x,
-    0,            //const uint16_t padding_y,
-    stride,       //const uint16_t stride_x,
-    0,            //const uint16_t stride_y,
-    dim_im_out,   //const uint16_t dim_im_out_x,
-    1,            //const uint16_t dim_im_out_y,
-    bufferA,      //q7_t * bufferA, 
-    Im_out);      //q7_t * Im_out);
-}
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in, out]  Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  pooling window size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding size
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out); // output image
 
 
 /**
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 5701edecc6..2292e181ff 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -110,8 +110,6 @@ arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
                    const uint16_t dim_im_out_y,
                    q7_t * bufferA, 
                    q7_t * Im_out) {
-    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
-
     int16_t   i_ch_in, i_x, i_y;
     int16_t   k_x, k_y;
 
@@ -158,8 +156,6 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
                    const uint16_t dim_im_out_y,
                    q7_t * bufferA, 
                    q7_t * Im_out) {
-   /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
-
     int16_t   i_ch_in, i_x, i_y;
     int16_t   k_x, k_y;
 
@@ -190,3 +186,112 @@ arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
         }
     }
 }
+
+
+/**
+ * @brief Reference Q7 1-D average pooling
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  pooling window size
+ * @param[in]       padding     padding size
+ * @param[in]       stride      pooling stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in]       bufferA     not referenced by this implementation
+ * @param[out]      Im_out      pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * Sample (position p, channel c) is addressed as c + ch_im_in * p in both
+ * Im_in and Im_out. Window positions outside [0, dim_im_in) are skipped
+ * and are not counted in the divisor. A window with no in-range position
+ * writes 0 rather than dividing by zero.
+ */
+void
+arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int sum = 0;
+            int count = 0;  // in-range samples seen in this window
+            int16_t start = i * stride - padding;   // negative while inside the leading padding
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            // count stays 0 when the whole window lies outside the input
+            Im_out[i_ch_in + ch_im_in * i] = (count > 0) ? sum / count : 0;
+        }
+    }
+}
+
+/**
+ * @brief Reference Q7 1-D max pooling
+ * @param[in]       Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  pooling window size
+ * @param[in]       padding     padding size
+ * @param[in]       stride      pooling stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in]       bufferA     not referenced by this implementation
+ * @param[out]      Im_out      pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * Sample (position p, channel c) is addressed as c + ch_im_in * p in both
+ * Im_in and Im_out. Window positions outside [0, dim_im_in) are skipped.
+ * The running maximum starts at -129 and is stored to Im_out unchanged if
+ * no position in the window is in range.
+ */
+void
+arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129;   // sentinel; assumes every q7_t sample is >= -128
+            int16_t start = i * stride - padding;   // negative while inside the leading padding
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index be6769d62d..efc1fbf571 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -38,15 +38,16 @@
 
 #include "arm_nnexamples_nn_test.h"
 
-//#define TEST_SIGMOID
-//#define TEST_TANH
+#define TEST_SIGMOID
+#define TEST_TANH
 #define TEST_POOL
 #define TEST_POOL_NS
-// #define TEST_RELU
-// #define TEST_IP
-// #define TEST_CONV
-// #define TEST_NONSQUARE
-// #define TEST_NNMULT
+#define TEST_POOL_1D
+#define TEST_RELU
+#define TEST_IP
+#define TEST_CONV
+#define TEST_NONSQUARE
+#define TEST_NNMULT
 
 int test_index = 0;
 q7_t test_flags[50];
@@ -295,76 +296,77 @@ int main()
 
 #ifdef TEST_POOL_NS
 
-#define POOL_NS_IM_DIM 64
+#define POOL_NS_IM_DIM1 64
+#define POOL_NS_IM_DIM2 16
 #define POOL_NS_IM_CH 4
 
-    test1 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH * 2];
-    test2 = new q15_t[POOL_NS_IM_DIM * POOL_NS_IM_CH];
-    test3 = new q7_t[POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH];
+    test1 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH * 2];
+    test2 = new q15_t[(POOL_NS_IM_DIM1 + POOL_NS_IM_DIM2) * POOL_NS_IM_CH];
+    test3 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH];
 
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
     {
         test1[i] = (rand() % 256 - 128);
     }
 
-    q7_t     *img_in = test1 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH;
+    q7_t     *img_in = test1 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH;
     q7_t     *pool_out_ref = test3;
-    q7_t     *pool_out_opt = test3 + POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH / 2;
+    q7_t     *pool_out_opt = test3 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH / 2;
 
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
     {
         test3[i] = 0;
     }
 
     // copy over the img input
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
     {
         img_in[i] = test1[i];
     }
 
-    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+    initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
 
     printf("Start maxpool reference implementation\n");
 
-    arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref)
+    arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
 
     // copy over the img input
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) // restore the whole 2-D image, not just the first row
     {
         img_in[i] = test1[i];
     }
 
     printf("Start maxpool opt implementation\n");
 
-    arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt)
+    arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt); // same geometry as the reference call above
 
-    verify_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+    verify_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
 
     // copy over the img input
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) // restore the whole 2-D image, not just the first row
     {
         img_in[i] = test1[i];
     }
 
-    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH);
+    initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1/2) * (POOL_NS_IM_DIM2/2) * POOL_NS_IM_CH);
 
     printf("Start maxpool reference implementation\n");
 
-    arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_ref)
+    arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
 
     // copy over the img input
-    for (int i = 0; i < POOL_NS_IM_DIM * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++) // restore the whole 2-D image, not just the first row
     {
         img_in[i] = test1[i];
     }
 
     printf("Start maxpool opt implementation\n");
 
-    arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM, 1, POOL_NS_IM_CH, 3, 0, 0, 0, 2, 0, POOL_NS_IM_DIM / 2, 1, (q7_t *) test2, pool_out_opt)
+    arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt);
 
     // special check here
     bool      if_ave_pool_match = true;
-    for (int i = 0; i < POOL_NS_IM_DIM / 2 * 1 * POOL_NS_IM_CH; i++)
+    for (int i = 0; i < (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH; i++)
     {
         // we tolerate at most difference of 1 here because of rounding errors
         if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
@@ -382,11 +384,97 @@ int main()
     delete[]test2;
     delete[]test3;
 
+#endif//TEST_POOL_NS
 
 
+#ifdef TEST_POOL_1D
 
-#endif//TEST_POOL_NS
+#define POOL_IM_DIM_1D 32
+#define POOL_IM_CH_1D 8
+
+    test1 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D * 2];
+    test2 = new q15_t[POOL_IM_CH_1D];
+    test3 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D];
+
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        test1[i] = (rand() % 256 - 128);
+    }
+
+    q7_t     *img_in = test1 + POOL_IM_DIM_1D * POOL_IM_CH_1D;
+    q7_t     *pool_out_ref = test3;
+    q7_t     *pool_out_opt = test3 + POOL_IM_DIM_1D * POOL_IM_CH_1D / 2;
+
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        test3[i] = 0;
+    }
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_maxpool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_maxpool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
+
+    verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start avepool ref implementation\n");
+
+    arm_avepool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start avepool opt implementation\n");
+
+    arm_avepool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
+
+    // special check here
+    bool      if_ave_pool_match = true;
+    for (int i = 0; i < POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D; i++)
+    {
+        // we tolerate at most difference of 1 here because of rounding errors
+        if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+        {
+            printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+            if_ave_pool_match = false;
+        }
+    }
+    if (if_ave_pool_match == true)
+    {
+        printf("Outputs match.\n");
+    }
+
+    delete[]test1;
+    delete[]test2;
+    delete[]test3;
 
+#endif//TEST_POOL_1D
 
 #ifdef TEST_RELU
 
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index 96afe033fc..83bff56939 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -790,6 +790,213 @@ arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
 
 #endif                          /* ARM_MATH_DSP */
 
+}
+
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  2*ch_im_in (ch_im_in q15 accumulators, DSP build only)
+   *
+   * The pooling function is implemented along one axis.
+   *
+   * Im_in is only read; all writes go to Im_out and, in the DSP build,
+   * to bufferA.
+   *
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    int16_t   i;
+    q15_t    *buffer = (q15_t *) bufferA;   /* per-channel q15 running sums */
+    for (i = 0; i < dim_im_out; i++)
+    {
+        /* for each output row */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+        int16_t   count;    /* number of input rows summed into buffer */
+        /* setting the starting row (clamped to the first input row) */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping row (clamped to one past the last input row) */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* copy over the first row, widened to q15 */
+        arm_q7_to_q15_no_shift(start, buffer, ch_im_in);
+        count = 1;
+
+        /* move over to next row */
+        start += ch_im_in;
+        /* NOTE(review): cast assumes the helper takes a non-const q7_t *; drop it if not */
+        for (; start < end; start += ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, (q7_t *) start, ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+    }
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int sum = 0;
+            int count = 0;      /* taps that actually fall inside the input */
+            int16_t start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)    /* skip taps in the padding */
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = sum/count;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented along one axis.
+   *
+   * Im_in is only read; all writes go to Im_out. bufferA is not used
+   * by this function.
+   *
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    int16_t   i;
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    for (i = 0; i < dim_im_out; i++)
+    {
+        /* for each output row */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+
+        /* setting the starting row (clamped to the first input row) */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* setting the stopping row (clamped to one past the last input row) */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* target then holds the running per-channel maximum */
+        memmove(target, start,  ch_im_in);
+
+        /* move over to next row: rows are ch_im_in elements apart */
+        start += ch_im_in;
+        /* NOTE(review): cast assumes the helper takes a non-const q7_t *; drop it if not */
+        for (; start < end; start += ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, (q7_t *) start, ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129;   /* below the q7 minimum, so any real tap replaces it */
+            int16_t start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)    /* skip taps in the padding */
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
 }
 
 

From 6d2e9b9059918616fed87399a68cc088c2d74951 Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 15:24:29 -0700
Subject: [PATCH 6/7] cleaned up comments on 1d functions

---
 CMSIS/NN/Include/arm_nnfunctions.h | 60 ++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 1294b1998e..471c7a1448 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,26 @@ extern    "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+  /**
+   * @brief Basic Q7 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size y
+   * @param[in]       padding      padding size y
+   * @param[in]       stride       convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   */
+
 static inline arm_status arm_convolve_HWC_q7_basic_nonsquare(
   const q7_t * Im_in,
   const uint16_t dim_im_in,
@@ -374,6 +394,26 @@ static inline arm_status arm_convolve_HWC_q7_basic_nonsquare(
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+  /**
+   * @brief Fast Q7 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size y
+   * @param[in]       padding      padding size y
+   * @param[in]       stride       convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   */
+
 static inline arm_status arm_convolve_HWC_q7_fast_1d(
   const q7_t * Im_in,
   const uint16_t dim_im_in,
@@ -614,6 +654,26 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d(
                               q15_t * bufferA, 
                               q7_t * bufferB);
 			
+
+  /**
+   * @brief Fast Q15 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size y
+   * @param[in]       padding      padding size y
+   * @param[in]       stride       convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   */
 static inline arm_status arm_convolve_HWC_q15_fast_1d(
   const q7_t * Im_in,
   const uint16_t dim_im_in,

From cdfce23bb2fd8c95b16a801ccc36ee694d4af9aa Mon Sep 17 00:00:00 2001
From: Quinn Abrahams-Vaughn <quinna@wearstrive.com>
Date: Thu, 9 May 2019 15:33:27 -0700
Subject: [PATCH 7/7] kept warnings from fast implementations

---
 CMSIS/NN/Include/arm_nnfunctions.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 471c7a1448..05c348f117 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -224,7 +224,7 @@ extern    "C"
    * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
    */
 
-static inline arm_status arm_convolve_HWC_q7_basic_nonsquare(
+static inline arm_status arm_convolve_HWC_q7_basic_1d(
   const q7_t * Im_in,
   const uint16_t dim_im_in,
   const uint16_t ch_im_in,
@@ -412,6 +412,12 @@ static inline arm_status arm_convolve_HWC_q7_basic_nonsquare(
    * @param[in,out]   bufferA      pointer to buffer space for input
    * @param[in,out]   bufferB      pointer to buffer space for output
    * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   * or <code>ARM_MATH_SIZE_MISMATCH</code>, based on the outcome of size checking.
+   *
+   * This function is the version with the full list of optimization tricks, but with
+   * some constraints:
+   *   ch_im_in is a multiple of 4
+   *   ch_im_out is a multiple of 2
    */
 
 static inline arm_status arm_convolve_HWC_q7_fast_1d(
@@ -673,6 +679,22 @@ static inline arm_status arm_convolve_HWC_q7_fast_1d(
    * @param[in,out]   bufferA      pointer to buffer space for input
    * @param[in,out]   bufferB      pointer to buffer space for output
    * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   * or <code>ARM_MATH_SIZE_MISMATCH</code>, based on the outcome of size checking.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
+   *
+   * bufferB size: 0
+   *
+   * <b>Input dimension constraints:</b>
+   *
+   * ch_im_in is a multiple of 2
+   *
+   * ch_im_out is a multiple of 2
+   *
    */
 static inline arm_status arm_convolve_HWC_q15_fast_1d(
   const q7_t * Im_in,