diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index c6ec83a453..05c348f117 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -204,6 +204,65 @@ extern    "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+  /**
+   * @brief Basic Q7 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension (length of the 1D input)
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size
+   * @param[in]       padding      padding size
+   * @param[in]       stride       convolution stride
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension (length of the 1D output)
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The status returned by arm_convolve_HWC_q7_basic_nonsquare
+   * @details    Wrapper: the input is forwarded as a dim_im_in x 1 image (length on X).
+   */
+static inline arm_status arm_convolve_HWC_q7_basic_1d(
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q7_basic_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y, must be 0: padding the unit axis would skip the only input row
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Basic Q15 convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -335,6 +394,71 @@ extern    "C"
                                                   q15_t * bufferA,
                                                   q7_t * bufferB);
 
+  /**
+   * @brief Fast Q7 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension (length of the 1D input)
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size
+   * @param[in]       padding      padding size
+   * @param[in]       stride       convolution stride
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension (length of the 1D output)
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * Wrapper around arm_convolve_HWC_q7_fast_nonsquare (input forwarded as a
+   * dim_im_in x 1 image), the fully optimized version, which has some constraints:
+   *   ch_im_in is multiple of 4
+   *   ch_im_out is multiple of 2
+   */
+
+static inline arm_status arm_convolve_HWC_q7_fast_1d(
+  const q7_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q7_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q7_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q7_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q7_fast_nonsquare(
+          Im_in,      //const q7_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q7_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y, must be 0: padding the unit axis would skip the only input row
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q7_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q7_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
   /**
    * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
    * @param[in]       Im_in        pointer to input tensor
@@ -502,7 +626,7 @@ extern    "C"
    *
    * <b>Buffer size:</b>
    *
-   * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+   * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
    *
    * bufferB size: 0
    *
@@ -535,7 +659,83 @@ extern    "C"
                               const uint16_t dim_im_out_y, 
                               q15_t * bufferA, 
                               q7_t * bufferB);
-										 
+			
+
+  /**
+   * @brief Fast Q15 convolution function (1D)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in    input tensor dimension (length of the 1D input)
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel   filter kernel size
+   * @param[in]       padding      padding size
+   * @param[in]       stride       convolution stride
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out   output tensor dimension (length of the 1D output)
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   *
+   * @details Wrapper around arm_convolve_HWC_q15_fast_nonsquare with every Y dimension fixed to 1.
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size: 2*ch_im_in*dim_kernel (dim_kernel_y is fixed to 1)
+   *
+   * bufferB size: 0
+   *
+   * <b>Input dimension constraints:</b>
+   *
+   * ch_im_in is multiple of 2
+   *
+   * ch_im_out is multiple of 2
+   *
+   */
+static inline arm_status arm_convolve_HWC_q15_fast_1d(
+  const q15_t * Im_in,
+  const uint16_t dim_im_in,
+  const uint16_t ch_im_in,
+  const q15_t * wt,
+  const uint16_t ch_im_out,
+  const uint16_t dim_kernel,
+  const uint16_t padding,
+  const uint16_t stride,
+  const q15_t * bias,
+  const uint16_t bias_shift,
+  const uint16_t out_shift,
+  q15_t * Im_out,
+  const uint16_t dim_im_out,
+  q15_t * bufferA,
+  q7_t * bufferB ) {
+        return arm_convolve_HWC_q15_fast_nonsquare(
+          Im_in,      //const q15_t * Im_in
+          dim_im_in,  //const uint16_t dim_im_in_x,
+          1,          //const uint16_t dim_im_in_y,
+          ch_im_in,   //const uint16_t ch_im_in,
+          wt,         //const q15_t * wt,
+          ch_im_out,  //const uint16_t ch_im_out,
+          dim_kernel, //const uint16_t dim_kernel_x,
+          1,          //const uint16_t dim_kernel_y,
+          padding,    //const uint16_t padding_x,
+          0,          //const uint16_t padding_y, must be 0: padding the unit axis would skip the only input row
+          stride,     //const uint16_t stride_x,
+          1,          //const uint16_t stride_y,
+          bias,       //const q15_t * bias,
+          bias_shift, //const uint16_t bias_shift,
+          out_shift,  //const uint16_t out_shift,
+          Im_out,     //q15_t * Im_out,
+          dim_im_out, //const uint16_t dim_im_out_x,
+          1,          //const uint16_t dim_im_out_y,
+          bufferA,    //q15_t * bufferA,
+          bufferB);   //q7_t * bufferB);
+}
+
+
   /**
    * @brief Q7 depthwise separable convolution function
    * @param[in]       Im_in       pointer to input tensor
@@ -629,7 +829,6 @@ extern    "C"
                                                              q15_t * bufferA,
                                                              q7_t * bufferB);
 
-
 /**
  * @defgroup FC Fully-connected Layer Functions
  *
@@ -949,6 +1148,64 @@ extern    "C"
                                  q7_t * bufferA, 
                                  q7_t * Im_out);
 
+  /**
+   * @brief Q7 max pooling function (non-square shape)
+   * @param[in,out]   Im_in         pointer to input tensor (contents are undefined after the call)
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  pooling window size along X axis
+   * @param[in]       dim_kernel_y  pooling window size along Y axis
+   * @param[in]       padding_x     padding size along X axis
+   * @param[in]       padding_y     padding size along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA, 
+                   q7_t * Im_out);
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in]       Im_in       pointer to input tensor (declared const)
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  pooling window size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out);
+
   /**
    * @brief Q7 average pooling function
    * @param[in]       Im_in       pointer to input tensor
@@ -974,6 +1231,65 @@ extern    "C"
                                  q7_t * bufferA, 
                                  q7_t * Im_out);
 
+  /**
+   * @brief Q7 average pooling function (non-square shape)
+   * @param[in,out]   Im_in         pointer to input tensor (non-const; NOTE(review): confirm whether it is modified)
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  pooling window size along X axis
+   * @param[in]       dim_kernel_y  pooling window size along Y axis
+   * @param[in]       padding_x     padding size along X axis
+   * @param[in]       padding_y     padding size along Y axis
+   * @param[in]       stride_x      pooling stride along X axis
+   * @param[in]       stride_y      pooling stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA, 
+                   q7_t * Im_out);
+
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in]       Im_in       pointer to input tensor (declared const)
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  pooling window size
+   * @param[in]       padding     padding size
+   * @param[in]       stride      pooling stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out);
+
+
 /**
  * @defgroup Softmax Softmax Functions
  *
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
index 9a4adb29ab..2292e181ff 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_pool_ref.c
@@ -94,3 +94,163 @@ void arm_maxpool_q7_HWC_ref(const q7_t * Im_in, // input image
         }
     }
 }
+
+/* Reference non-square Q7 average pooling on an HWC tensor: every output is the
+ * truncated mean of the in-bounds samples of its window. bufferA is not used. */
+void arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out) {
+    int16_t   i_ch_in, i_x, i_y, k_x, k_y;
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int sum = 0;
+                int count = 0;   // number of in-bounds samples in this window
+                int16_t y_start = i_y * stride_y - padding_y;
+                int16_t x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
+                {
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                // a window lying entirely in the padding has no samples: write 0 instead of dividing by zero
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = (count > 0) ? (q7_t) (sum / count) : 0;
+            }
+        }
+    }
+}
+
+/* Reference non-square Q7 max pooling on an HWC tensor (channel index varies fastest). */
+void
+arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA, // not referenced by this implementation
+                   q7_t * Im_out) {
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+    /* One output per (channel, y, x): the maximum over the in-bounds samples of its window. */
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129; // -129 is below -128, presumably the q7_t minimum, so any sample replaces it
+                int16_t y_start = i_y * stride_y - padding_y; // window origin; negative inside the padding
+                int16_t x_start = i_x * stride_x - padding_x;
+                for (k_y = y_start; k_y < y_start + dim_kernel_y; k_y++)
+                {
+                    for (k_x = x_start; k_x < x_start + dim_kernel_x; k_x++)
+                    {
+                        // padding positions are skipped rather than treated as zeros
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+}
+
+/* Reference 1-D Q7 average pooling: truncated mean of the in-bounds window samples. bufferA is not used. */
+void
+arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+    int16_t   i_ch_in, i, k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int sum = 0;
+            int count = 0;   // number of in-bounds samples in this window
+            int16_t start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            // a window lying entirely in the padding has no samples: write 0 instead of dividing by zero
+            Im_out[i_ch_in + ch_im_in * i] = (count > 0) ? (q7_t) (sum / count) : 0;
+        }
+    }
+}
+/* Reference 1-D Q7 max pooling on a (position, channel) tensor; bufferA is not referenced. */
+void
+arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+    int16_t   i_ch_in, i;
+    int16_t   k;
+    /* One output per (channel, position): the maximum over the in-bounds samples of its window. */
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129; // -129 is below -128, presumably the q7_t minimum, so any sample replaces it
+            int16_t start = i * stride - padding; // window origin; negative inside the padding
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in) // padding positions are skipped
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index 4a0647a9bd..c1cee95846 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -229,6 +229,39 @@ extern    "C"
                                      q7_t * bufferA,    // a buffer for local storage
                                      q7_t * Im_out);
 
+    /* Reference non-square max pooling; defined in arm_pool_ref.c */
+    void arm_maxpool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x, const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x, const uint16_t dim_kernel_y,
+                   const uint16_t padding_x, const uint16_t padding_y,
+                   const uint16_t stride_x, const uint16_t stride_y,
+                   const uint16_t dim_im_out_x, const uint16_t dim_im_out_y,
+                   q7_t * bufferA, q7_t * Im_out);
+
+    /* Reference non-square average pooling; defined in arm_pool_ref.c */
+    void arm_avepool_q7_HWC_nonsquare_ref(q7_t * Im_in,
+                   const uint16_t dim_im_in_x, const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x, const uint16_t dim_kernel_y,
+                   const uint16_t padding_x, const uint16_t padding_y,
+                   const uint16_t stride_x, const uint16_t stride_y,
+                   const uint16_t dim_im_out_x, const uint16_t dim_im_out_y,
+                   q7_t * bufferA, q7_t * Im_out);
+
+
+    /* Reference 1-D max pooling; defined in arm_pool_ref.c and called by the pooling test */
+    void arm_maxpool_q7_HWC_1d_ref(const q7_t * Im_in,
+                   const uint16_t dim_im_in, const uint16_t ch_im_in,
+                   const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride,
+                   const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out);
+
+    /* Reference 1-D average pooling; defined in arm_pool_ref.c and called by the pooling test */
+    void arm_avepool_q7_HWC_1d_ref(const q7_t * Im_in,
+                   const uint16_t dim_im_in, const uint16_t ch_im_in,
+                   const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride,
+                   const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out);
+
 /*
  *
  * Other reference implemenation
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 5cf72a2a05..efc1fbf571 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -38,9 +38,11 @@
 
 #include "arm_nnexamples_nn_test.h"
 
-//#define TEST_SIGMOID
-//#define TEST_TANH
+#define TEST_SIGMOID
+#define TEST_TANH
 #define TEST_POOL
+#define TEST_POOL_NS
+#define TEST_POOL_1D
 #define TEST_RELU
 #define TEST_IP
 #define TEST_CONV
@@ -256,29 +258,205 @@ int main()
         img_in[i] = test1[i];
     }
 
+    printf("Start avepool ref implementation\n");
+
+    arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
+
     // copy over the img input
     for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
     {
         img_in[i] = test1[i];
     }
 
+    printf("Start avepool opt implementation\n");
+
+    arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
+
+    // special check here
+    bool      if_ave_pool_match = true;
+    for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++)
+    {
+        // we tolerate at most difference of 1 here because of rounding errors
+        if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+        {
+            printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+            if_ave_pool_match = false;
+        }
+    }
+    if (if_ave_pool_match == true)
+    {
+        printf("Outputs match.\n");
+    }
+
+    delete[]test1;
+    delete[]test2;
+    delete[]test3;
+
+#endif//TEST_POOL
+
+#ifdef TEST_POOL_NS
+
+#define POOL_NS_IM_DIM1 64
+#define POOL_NS_IM_DIM2 16
+#define POOL_NS_IM_CH 4
+    {   // own scope: img_in, pool_out_* and if_ave_pool_match are declared again by other test sections
+        test1 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH * 2];
+        test2 = new q15_t[(POOL_NS_IM_DIM1 + POOL_NS_IM_DIM2) * POOL_NS_IM_CH];
+        test3 = new q7_t[POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH];
+
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            test1[i] = (rand() % 256 - 128);
+        }
+
+        q7_t     *img_in = test1 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH;
+        q7_t     *pool_out_ref = test3;
+        q7_t     *pool_out_opt = test3 + POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH / 2;
+
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            test3[i] = 0;
+        }
+
+        // copy over the img input
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            img_in[i] = test1[i];
+        }
+
+        initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
+
+        printf("Start maxpool reference implementation\n");
+
+        arm_maxpool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
+
+        // restore the whole image: the optimized max pooling is documented as input-destructive
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            img_in[i] = test1[i];
+        }
+
+        printf("Start maxpool opt implementation\n");
+
+        arm_maxpool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt);
+
+        verify_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH);
+
+        // restore the whole image before the average pooling runs
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            img_in[i] = test1[i];
+        }
+
+        initialize_results_q7(pool_out_ref, pool_out_opt, (POOL_NS_IM_DIM1/2) * (POOL_NS_IM_DIM2/2) * POOL_NS_IM_CH);
+
+        printf("Start avepool ref implementation\n");
+
+        arm_avepool_q7_HWC_nonsquare_ref(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_ref);
+
+        // restore the whole image so the optimized run sees the same input as the reference
+        for (int i = 0; i < POOL_NS_IM_DIM1 * POOL_NS_IM_DIM2 * POOL_NS_IM_CH; i++)
+        {
+            img_in[i] = test1[i];
+        }
+
+        printf("Start avepool opt implementation\n");
+
+        arm_avepool_q7_HWC_nonsquare(img_in, POOL_NS_IM_DIM1, POOL_NS_IM_DIM2, POOL_NS_IM_CH, 3, 2, 0, 0, 2, 2, POOL_NS_IM_DIM1 / 2, POOL_NS_IM_DIM2 / 2, (q7_t *) test2, pool_out_opt);
+
+        // special check here
+        bool      if_ave_pool_match = true;
+        for (int i = 0; i < (POOL_NS_IM_DIM1 / 2) * (POOL_NS_IM_DIM2 / 2) * POOL_NS_IM_CH; i++)
+        {
+            // we tolerate at most difference of 1 here because of rounding errors
+            if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
+            {
+                printf("Output mismatch at %d, expected %d, actual %d\n", i, pool_out_ref[i], pool_out_opt[i]);
+                if_ave_pool_match = false;
+            }
+        }
+        if (if_ave_pool_match == true)
+        {
+            printf("Outputs match.\n");
+        }
+
+        delete[]test1;
+        delete[]test2;
+        delete[]test3;
+    }
+#endif//TEST_POOL_NS
+
+
+#ifdef TEST_POOL_1D
+
+#define POOL_IM_DIM_1D 32
+#define POOL_IM_CH_1D 8
+
+    test1 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D * 2];
+    test2 = new q15_t[POOL_IM_CH_1D];
+    test3 = new q7_t[POOL_IM_DIM_1D * POOL_IM_CH_1D];
+
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        test1[i] = (rand() % 256 - 128);
+    }
+
+    q7_t     *img_in = test1 + POOL_IM_DIM_1D * POOL_IM_CH_1D;
+    q7_t     *pool_out_ref = test3;
+    q7_t     *pool_out_opt = test3 + POOL_IM_DIM_1D * POOL_IM_CH_1D / 2;
+
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        test3[i] = 0;
+    }
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    initialize_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+    printf("Start maxpool reference implementation\n");
+
+    arm_maxpool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
+    printf("Start maxpool opt implementation\n");
+
+    arm_maxpool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
+
+    verify_results_q7(pool_out_ref, pool_out_opt, POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D);
+
+    // copy over the img input
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
+    {
+        img_in[i] = test1[i];
+    }
+
     printf("Start avepool ref implementation\n");
 
-    arm_avepool_q7_HWC_ref(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_ref);
+    arm_avepool_q7_HWC_1d_ref(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_ref);
 
     // copy over the img input
-    for (int i = 0; i < POOL_IM_DIM * POOL_IM_DIM * POOL_IM_CH; i++)
+    for (int i = 0; i < POOL_IM_DIM_1D * POOL_IM_CH_1D; i++)
     {
         img_in[i] = test1[i];
     }
 
     printf("Start avepool opt implementation\n");
 
-    arm_avepool_q7_HWC(img_in, POOL_IM_DIM, POOL_IM_CH, 3, 0, 2, POOL_IM_DIM / 2, (q7_t *) test2, pool_out_opt);
+    arm_avepool_q7_HWC_1d(img_in, POOL_IM_DIM_1D, POOL_IM_CH_1D, 3, 0, 2, POOL_IM_DIM_1D / 2, (q7_t *) test2, pool_out_opt);
 
     // special check here
     bool      if_ave_pool_match = true;
-    for (int i = 0; i < POOL_IM_DIM / 2 * POOL_IM_DIM / 2 * POOL_IM_CH; i++)
+    for (int i = 0; i < POOL_IM_DIM_1D / 2 * POOL_IM_CH_1D; i++)
     {
         // we tolerate at most difference of 1 here because of rounding errors
         if (pool_out_ref[i] - pool_out_opt[i] >= 2 || pool_out_opt[i] - pool_out_ref[i] >= 2)
@@ -296,7 +474,7 @@ int main()
     delete[]test2;
     delete[]test3;
 
-#endif
+#endif//TEST_POOL_1D
 
 #ifdef TEST_RELU
 
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
index b451f5e695..83bff56939 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -286,6 +286,179 @@ arm_maxpool_q7_HWC(q7_t * Im_in,
 
 }
 
+  /**
+   * @brief Q7 max pooling function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      convolution stride along X axis
+   * @param[in]       stride_y      convolution stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_maxpool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out)
+{
+    /* bufferA is not referenced by either implementation below */
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_x, i_y;
+
+    /* pass 1: pool every input row along the x axis, in place inside Im_in */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* output pixel i_x overwrites input pixel i_x of the same row */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* seed the running maximum with the first pixel of the window; */
+            /* memmove because target and win_start can be the same pixel */
+            memmove(target, win_start, ch_im_in);
+
+            /* fold the remaining pixels of the window into the maximum */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* pass 2: pool the x-pooled rows along the y axis, writing into Im_out */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+        row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+        row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* seed the output row with the first input row of the window; only */
+        /* the first dim_im_out_x pixels of an input row hold x-pooled data */
+        memmove(target, row_start, dim_im_out_x * ch_im_in);
+
+        /* input rows are dim_im_in_x pixels apart, not dim_im_out_x */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out_x * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int       max = -129;   /* below the q7 minimum, so any sample replaces it */
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+
   /**
    * @brief Q7 average pooling function
    * @param[in,out]   Im_in       pointer to input tensor
@@ -443,6 +616,390 @@ arm_avepool_q7_HWC(q7_t * Im_in,
 
 }
 
+  /**
+   * @brief Q7 average pooling function
+   * @param[in, out]  Im_in         pointer to input tensor
+   * @param[in]       dim_im_in_x   input tensor dimension along X axis
+   * @param[in]       dim_im_in_y   input tensor dimension along Y axis
+   * @param[in]       ch_im_in      number of input tensor channels
+   * @param[in]       dim_kernel_x  filter kernel size along X axis
+   * @param[in]       dim_kernel_y  filter kernel size along Y axis
+   * @param[in]       padding_x     padding sizes along X axis
+   * @param[in]       padding_y     padding sizes along Y axis
+   * @param[in]       stride_x      convolution stride along X axis
+   * @param[in]       stride_y      convolution stride along Y axis
+   * @param[in]       dim_im_out_x  output tensor dimension along X axis
+   * @param[in]       dim_im_out_y  output tensor dimension along Y axis
+   * @param[in,out]   bufferA       pointer to buffer space for input
+   * @param[in,out]   Im_out        pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  2*dim_im_out_x*ch_im_in
+   *
+   * The pooling function is implemented as split x-pooling then
+   * y-pooling.
+   *
+   * This pooling function is input-destructive. Input data is undefined
+   * after calling this function.
+   *
+   */
+
+void
+arm_avepool_q7_HWC_nonsquare(q7_t * Im_in,
+                   const uint16_t dim_im_in_x,
+                   const uint16_t dim_im_in_y,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel_x,
+                   const uint16_t dim_kernel_y,
+                   const uint16_t padding_x,
+                   const uint16_t padding_y,
+                   const uint16_t stride_x,
+                   const uint16_t stride_y,
+                   const uint16_t dim_im_out_x,
+                   const uint16_t dim_im_out_y,
+                   q7_t * bufferA,
+                   q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    q15_t    *buffer = (q15_t *) bufferA;       /* q15 accumulators are kept in bufferA */
+    int16_t   i_x, i_y;
+    int16_t   count = 0;
+
+    /* pass 1: average every input row along the x axis, in place inside Im_in */
+    for (i_y = 0; i_y < dim_im_in_y; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out_x; i_x++)
+        {
+            /* output pixel i_x overwrites input pixel i_x of the same row */
+            q7_t     *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in;
+            q7_t     *win_start;
+            q7_t     *win_stop;
+            if (i_x * stride_x - padding_x < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x) * ch_im_in;
+            }
+
+            if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x)
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x + dim_kernel_x) * ch_im_in;
+            }
+
+            /* seed the q15 accumulator with the first pixel of the window */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* add the remaining pixels, counting how many were summed */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* pass 2: average the x-pooled rows along the y axis, writing into Im_out */
+    for (i_y = 0; i_y < dim_im_out_y; i_y++)
+    {
+
+        /* for each output row */
+        q7_t     *target = Im_out + i_y * dim_im_out_x * ch_im_in;
+        q7_t     *row_start;
+        q7_t     *row_end;
+        /* setting the starting row */
+        /* EQUIVALENT :
+        row_start = Im_in + MAX(0, (i_y * stride_y - padding_y)) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in;
+        }
+        /* setting the stopping row */
+        /* EQUIVALENT :
+        row_end = Im_in + MIN(dim_im_in_y, i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        */
+        if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y)
+        {
+            row_end = Im_in + dim_im_in_y * dim_im_in_x * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) * dim_im_in_x * ch_im_in;
+        }
+
+        /* seed the accumulators with the x-pooled part of the first row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in);
+        count = 1;
+
+        /* input rows are dim_im_in_x pixels apart, not dim_im_out_x */
+        row_start += ch_im_in * dim_im_in_x;
+
+        for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out_x * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i_x, i_y;
+    int16_t   k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out_y; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out_x; i_x++)
+            {
+                int sum = 0;
+                int count = 0;  /* samples of the window that fall inside the input */
+                for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / count;
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+}
+
+  /**
+   * @brief Q7 1-D average pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  2*ch_im_in
+   *
+   * The pooling function is implemented along one axis.
+   *
+   * The input tensor is declared const and is only read; the result is
+   * written to Im_out.
+   *
+   */
+
+void arm_avepool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    int16_t   i, count;
+    q15_t    *buffer = (q15_t *) bufferA;       /* one q15 accumulator per channel */
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output pixel */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+        /* first pixel of the window, clamped to the start of the input */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* one past the last pixel of the window, clamped to the end of the input */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* seed the q15 accumulator with the first pixel of the window */
+        arm_q7_to_q15_no_shift(start, buffer, ch_im_in);
+        count = 1;
+
+        /* add the remaining pixels; the cast below only drops const for the helper call */
+        start += ch_im_in;
+
+        for (; start < end; start += ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, (q7_t *) start, ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+    }
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int sum = 0;
+            int count = 0;      /* samples of the window that fall inside the input */
+            int16_t start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    sum += Im_in[i_ch_in + ch_im_in * k];
+                    count++;
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = sum / count;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+  /**
+   * @brief Q7 1-D max pooling function
+   * @param[in]       Im_in       pointer to input tensor
+   * @param[in]       dim_im_in   input tensor dimension
+   * @param[in]       ch_im_in    number of input tensor channels
+   * @param[in]       dim_kernel  filter kernel size
+   * @param[in]       padding     padding sizes
+   * @param[in]       stride      convolution stride
+   * @param[in]       dim_im_out  output tensor dimension
+   * @param[in,out]   bufferA     pointer to buffer space for input
+   * @param[in,out]   Im_out      pointer to output tensor
+   * @return none.
+   *
+   * @details
+   *
+   * <b>Buffer size:</b>
+   *
+   * bufferA size:  0
+   *
+   * The pooling function is implemented along one axis.
+   *
+   * The input tensor is declared const and is only read; the result is
+   * written to Im_out.
+   *
+   */
+
+void arm_maxpool_q7_HWC_1d(const q7_t * Im_in, // input image
+                            const uint16_t dim_im_in,   // input image dimension
+                            const uint16_t ch_im_in,    // number of input image channels
+                            const uint16_t dim_kernel,  // window kernel size
+                            const uint16_t padding, // padding sizes
+                            const uint16_t stride,  // stride
+                            const uint16_t dim_im_out,  // output image dimension
+                            q7_t * bufferA, // a buffer for local storage
+                            q7_t * Im_out) {
+#if defined (ARM_MATH_DSP)
+    int16_t   i;
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+    for (i = 0; i < dim_im_out; i++)
+    {
+
+        /* for each output pixel */
+        q7_t     *target = Im_out + i * ch_im_in;
+        const q7_t *start;
+        const q7_t *end;
+        /* first pixel of the window, clamped to the start of the input */
+        if (i * stride - padding < 0)
+        {
+            start = Im_in;
+        } else
+        {
+            start = Im_in + (i * stride - padding) * ch_im_in;
+        }
+        /* one past the last pixel of the window, clamped to the end of the input */
+        if (i * stride - padding + dim_kernel >= dim_im_in)
+        {
+            end = Im_in + dim_im_in * ch_im_in;
+        } else
+        {
+            end = Im_in + (i * stride - padding + dim_kernel) * ch_im_in;
+        }
+
+        /* seed the running maximum with the first pixel of the window, */
+        /* i.e. ch_im_in bytes (one q7 value per channel) */
+        memmove(target, start,  ch_im_in);
+
+        /* step to the next pixel: consecutive pixels are ch_im_in values apart */
+        start += ch_im_in;
+        /* the cast below only drops const for the helper call; the input is not meant to be written */
+        for (; start < end; start += ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, (q7_t *) start, ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t   i_ch_in, i;
+    int16_t   k;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i = 0; i < dim_im_out; i++)
+        {
+            int       max = -129;       /* below the q7 minimum, so any sample replaces it */
+            int16_t start = i * stride - padding;
+            for (k = start; k < start + dim_kernel; k++)
+            {
+                if (k >= 0 && k < dim_im_in)
+                {
+                    if (Im_in[i_ch_in + ch_im_in * k] > max)
+                    {
+                        max = Im_in[i_ch_in + ch_im_in * k];
+                    }
+                }
+            }
+            Im_out[i_ch_in + ch_im_in * i] = max;
+        }
+    }
+#endif                          /* ARM_MATH_DSP */
+}
+
+
 /**
  * @} end of Pooling group
  */