Math: Library: Add the hifi4 exponential function

ShriramShastry · kv2019i · commit c069197b63f4 · 2023-06-16T12:17:09.000+03:00
The 32-bit HiFi4 exponential library function has an accuracy of 1e-4,
a unit in last place error of 5.60032793, and output ranges from
0.0067379470 to 148.4131591026 for inputs from -5 to +5 (Q4.28) (Q9.23).

Signed-off-by: ShriramShastry &lt;malladi.sastry@intel.com&gt;
diff --git a/src/include/sof/math/exp_fcn.h b/src/include/sof/math/exp_fcn.h
@@ -10,6 +10,18 @@
 
 #include <stdint.h>
 
+#if defined(__XCC__)
+/* HiFi */
+#include <xtensa/config/core-isa.h>
+#if XCHAL_HAVE_HIFI4 == 1
+#define SOFM_EXPONENTIAL_HIFI4 1
+#endif
+#else
+/* !XCC */
+#define EXPONENTIAL_GENERIC 1
+
+#endif
+
 int32_t sofm_exp_int32(int32_t x);
 
 #endif
diff --git a/src/math/CMakeLists.txt b/src/math/CMakeLists.txt
@@ -14,8 +14,8 @@ if(CONFIG_SQRT_FIXED)
         add_local_sources(sof sqrt_int16.c)
 endif()
 
-if(CONFIG_EXP_FIXED)
-        add_local_sources(sof exp_fcn.c)
+if(CONFIG_MATH_EXP)
+        add_local_sources(sof exp_fcn.c exp_fcn_hifi4.c)
 endif()
 
 if(CONFIG_MATH_DECIBELS)
diff --git a/src/math/Kconfig b/src/math/Kconfig
@@ -38,13 +38,13 @@ config SQRT_FIXED
 	  to  calculate square root.square function having positive number
 	  y as input and return the positive number x multiplied by itself (squared)
 
-config EXP_FIXED
+config MATH_EXP
 	bool "Exponential functions"
 	default n
 	help
 	  By selecting this, the 32-bit sofm_exp_int32() function can be used to calculate
 	  exponential values. With a maximum ulp of 5, an exponential function with
-	  an input range of -5 to +5 y gives positive numbers between 0.00673794699908547 and
+	  an input range of -5 to +5 gives positive numbers between 0.00673794699908547 and
 	  148.413159102577. The precision of this function is 1e-4.
 
 config NATURAL_LOGARITHM_FIXED
diff --git a/src/math/exp_fcn.c b/src/math/exp_fcn.c
@@ -6,20 +6,21 @@
  *
  */
 
-/* Include Files */
 #include <sof/math/exp_fcn.h>
 #include <sof/math/numbers.h>
 #include <sof/common.h>
 #include <rtos/bit.h>
 #include <stdbool.h>
 #include <stdint.h>
 
-#define BIT_MASK_Q62P2 0x4000000000000000LL
-#define CONVERG_ERROR 28823037607936LL	// error smaller than 1e-4,1/2 ^ -44.7122876200884
-#define BIT_MASK_LOW_Q27P5 0x8000000
-#define QUOTIENT_SCALE BIT(30)
-#define TERMS_Q23P9 8388608
-#define LSHIFT_BITS 8192
+#if defined(EXPONENTIAL_GENERIC)
+
+#define SOFM_BIT_MASK_Q62P2 0x4000000000000000LL
+#define SOFM_CONVERG_ERROR 28823037607936LL	// error smaller than 1e-4,1/2 ^ -44.7122876200884
+#define SOFM_BIT_MASK_LOW_Q27P5 0x8000000
+#define SOFM_QUOTIENT_SCALE BIT(30)
+#define SOFM_TERMS_Q23P9 8388608
+#define SOFM_LSHIFT_BITS 8192
 
 /* inv multiplication lookup table */
 /* LUT = ceil(1/factorial(b_n) * 2 ^ 63) */
@@ -152,7 +153,7 @@ static inline int64_t lomul_s64_sr_sat_near(int64_t a, int64_t b)
 	uint64_t u64_rlo;
 
 	mul_s64(a, b, &u64_rhi, &u64_rlo);
-	const bool roundup = (u64_rlo & BIT_MASK_LOW_Q27P5) != 0;
+	const bool roundup = (u64_rlo & SOFM_BIT_MASK_LOW_Q27P5) != 0;
 
 	u64_rlo = (u64_rhi << 36 | u64_rlo >> 28) + (roundup ? 1 : 0);
 	return u64_rlo;
@@ -179,8 +180,8 @@ int32_t sofm_exp_int32(int32_t x)
 	uint64_t ou0Lo;
 	int64_t qt;
 	int32_t b_n;
-	int32_t ts =  TERMS_Q23P9; /* Q23.9 */
-	int64_t dividend = (x + LSHIFT_BITS) >> 14; /* x in Q50.14 */
+	int32_t ts =  SOFM_TERMS_Q23P9; /* Q23.9 */
+	int64_t dividend = (x + SOFM_LSHIFT_BITS) >> 14; /* x in Q50.14 */
 	static const int32_t i_emin = -1342177280; /* Q4.28 */
 	static const int32_t o_emin = 56601; /* Q9.23 */
 	static const int32_t i_emax = 1342177280; /* Q4.28 */
@@ -195,22 +196,23 @@ int32_t sofm_exp_int32(int32_t x)
 		return o_emax; /* 148.4131494760513306 in Q9.23 */
 
 	/* pre-computation of 1st & 2nd terms */
-	mul_s64(dividend, BIT_MASK_Q62P2, &ou0Hi, &ou0Lo);
+	mul_s64(dividend, SOFM_BIT_MASK_Q62P2, &ou0Hi, &ou0Lo);
 	qt = (ou0Hi << 46) | (ou0Lo >> 18);/* Q6.26 */
-	ts += (int32_t)((qt >> 35) + ((qt & QUOTIENT_SCALE) >> 18));
+	ts += (int32_t)((qt >> 35) + ((qt & SOFM_QUOTIENT_SCALE) >> 18));
 	dividend = lomul_s64_sr_sat_near(dividend, x);
 	for (b_n = 0; b_n < ARRAY_SIZE(exp_iv_ilookup); b_n++) {
 		mul_s64(dividend, exp_iv_ilookup[b_n], &ou0Hi, &ou0Lo);
 		qt = (ou0Hi << 45) | (ou0Lo >> 19);
 
 		/* sum of the remaining terms */
-		ts += (int32_t)((qt >> 35) + ((qt & QUOTIENT_SCALE) ? 1 : 0));
+		ts += (int32_t)((qt >> 35) + ((qt & SOFM_QUOTIENT_SCALE) ? 1 : 0));
 		dividend = lomul_s64_sr_sat_near(dividend, x);
 
 		qt  = ABS(qt);
-		/* For inputs between -5 and 5, (qt < CONVERG_ERROR) is always true */
-		if (qt < CONVERG_ERROR)
+		/* For inputs between -5 and 5, (qt < SOFM_CONVERG_ERROR) is always true */
+		if (qt < SOFM_CONVERG_ERROR)
 			break;
 	}
 	return ts;
 }
+#endif
diff --git a/src/math/exp_fcn_hifi4.c b/src/math/exp_fcn_hifi4.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ *Copyright(c) 2023 Intel Corporation. All rights reserved.
+ *
+ * Author: Shriram Shastry <malladi.sastry@linux.intel.com>
+ *
+ */
+
+#include <sof/common.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#if defined(SOFM_EXPONENTIAL_HIFI4)
+#include <xtensa/tie/xt_hifi4.h>
+
+#define SOFM_BIT_MASK_LOW_Q27P5 0x0000000008000000
+#define SOFM_BIT_MASK_Q62P2 0x4000000000000000
+#define SOFM_CONVERG_ERROR 0x1A36E2EB4000LL /* error smaller than 1e-4,1/2 ^ -44.7122876209085 */
+#define SOFM_QUOTIENT_SCALE 0x400000000
+#define SOFM_TERMS_Q23P9 0x800000
+#define SOFM_LSHIFT_BITS BIT(13)
+
+/*
+ * Arguments	: int64_t in_0
+ *		  int64_t in_1
+ *		  uint64_t *ptroutbitshi
+ *		  uint64_t *ptroutbitslo
+ * Return Type	: void
+ * Description:Perform element-wise multiplication on in_0 and in_1
+ * while keeping the required product word length and fractional
+ * length in mind. mul_s64 function divide the 64-bit quantities
+ * into two 32-bit words,multiply the low words to produce the
+ * lowest and second-lowest words in the result, then both pairs
+ * of low and high words from different numbers to produce the
+ * second and third lowest words in the result, and finally both
+ * high words to produce the two highest words in the outcome.
+ * Add them all up, taking carry into consideration.
+ *
+ * The 64 x 64 bit multiplication of operands in_0 and in_1 is
+ * shown in the image below. The 64-bit operand in_0,in_1 is
+ * represented by the notation in0_H, in1_H for the top 32 bits
+ * and in0_L, in1_L for the bottom 32 bits.
+ *
+ *				in0_H : in0_L
+ *			x	in1_H : in1_L
+ *			---------------------
+ *	P0			in0_L x in1_L
+ *	P1		in0_H x in1_L		64 bit inner multiplication
+ *	P2		in0_L x in1_H		64 bit inner multiplication
+ *	P3	in0_H x in1_H
+ *			--------------------
+ *			[64 x 64 bit multiplication] sum of inner products
+ * All combinations are multiplied by one another and then added.
+ * Each inner product is moved into its proper power location.given the names
+ * of the inner products, redoing the addition where 000 represents 32 zero
+ * bits.The inner products can be added together in 64 bit addition.The sum
+ * of two 64-bit numbers yields a 65-bit output.
+ *		   (P0H:P0L)
+ *		P1H(P1L:000)
+ *		P2H(P2L:000)
+ *	P3H:P3L(000:000)
+ *	.......(aaa:P0L)
+ * By combining P0H:P0L and P1L:000. This can lead to a carry, denote as CRY0.
+ * The partial result is then multiplied by P2L:000.
+ * We call it CRY1 because it has the potential to carry again.
+ *	(CRY0 + CRY1)P0H:P0L
+ *	(	 P1H)P1L:000
+ *	(	 P2H)P2L:000
+ *	(P3H:	 P3L)000:000
+ *	--------------------
+ *	(ccc:bbb)aaa:P0L
+ * P1H, P2H, and P3H:P3L are added to the carry CRY0 + CRY1.This increase will
+ * not result in an overflow.
+ *
+ */
+static void mul_s64(ae_int64 in_0, ae_int64 in_1, ae_int64 *ptroutbitshi, ae_int64 *ptroutbitslo)
+{
+	ae_int64 producthihi, producthilo, productlolo;
+	ae_int64 producthi, carry, product_hl_lh_h, product_hl_lh_l;
+
+	ae_int32x2 in0_32 = AE_MOVINT32X2_FROMINT64(in_0);
+	ae_int32x2 in1_32 = AE_MOVINT32X2_FROMINT64(in_1);
+
+	ae_ep ep_lolo = AE_MOVEA(0);
+	ae_ep ep_hilo = AE_MOVEA(0);
+	ae_ep ep_HL_LH = AE_MOVEA(0);
+
+	producthihi = AE_MUL32_HH(in0_32, in1_32);
+
+	/* AE_MULZAAD32USEP.HL.LH - Unsigned lower parts and signed higher 32-bit parts dual */
+	/* multiply and accumulate operation on two 32x2 operands with 72-bit output */
+	/* Input-32x32-bit(in1_32xin0_32)into 72-bit multiplication operations */
+	/* Output-lower 64 bits of the result are stored in producthilo */
+	/* Output-upper eight bits are stored in ep_hilo */
+	AE_MULZAAD32USEP_HL_LH(ep_hilo, producthilo, in1_32, in0_32);
+	productlolo = AE_MUL32U_LL(in0_32, in1_32);
+
+	product_hl_lh_h = AE_SRAI72(ep_hilo, producthilo, 32);
+	product_hl_lh_l = AE_SLAI64(producthilo, 32);
+
+	/* The AE_ADD72 procedure adds two 72-bit elements. The first 72-bit value is created */
+	/* by concatenating the MSBs and LSBs of operands ep[7:0] and d[63:0]. Similarly, the */
+	/* second value is created by concatenating bits from operands ep1[7:0] and d1[63:0]. */
+	AE_ADD72(ep_lolo, productlolo, ep_HL_LH, product_hl_lh_l);
+
+	carry = AE_SRAI72(ep_lolo, productlolo, 32);
+
+	carry = AE_SRLI64(carry, 32);
+	producthi = AE_ADD64(producthihi, carry);
+
+	producthi = AE_ADD64(producthi, product_hl_lh_h);
+
+	*ptroutbitslo = productlolo;
+	*ptroutbitshi = producthi;
+}
+
+/*
+ * Arguments	: int64_t a
+ *		  int64_t b
+ * Return Type	: int64_t
+ */
+static int64_t mul_s64_sr_sat_near(int64_t a, int64_t b)
+{
+	ae_int64 result;
+	ae_int64 u64_chi;
+	ae_int64 u64_clo;
+	ae_int64 temp;
+
+	mul_s64(a, b, &u64_chi, &u64_clo);
+
+	ae_int64 roundup = AE_AND64(u64_clo, SOFM_SOFM_BIT_MASK_LOW_Q27P5);
+
+	roundup = AE_SRLI64(roundup, 27);
+	temp = AE_OR64(AE_SLAI64(u64_chi, 36), AE_SRLI64(u64_clo, 28));
+
+	return AE_ADD64(temp, roundup);
+}
+
+static ae_int64 onebyfact_Q63[19] = {
+		4611686018427387904LL,
+		1537228672809129301LL,
+		384307168202282325LL,
+		76861433640456465LL,
+		12810238940076077LL,
+		1830034134296582LL,
+		228754266787072LL,
+		25417140754119LL,
+		2541714075411LL,
+		231064915946LL,
+		19255409662LL,
+		1481185358LL,
+		105798954LL,
+		7053264LL,
+		440829LL,
+		25931LL,
+		1441LL,
+		76LL,
+		4LL
+};
+
+/* f(x) = a^x, x is variable and a is base
+ *
+ * Arguments   : int32_t x(Q4.28)
+ * input range : -5 to 5
+ *
+ * Return Type : int32_t ts(Q9.23)
+ * output range 0.0067465305 to 148.4131488800
+ *+------------------+-----------------+--------+--------+
+ *| x                | ts (returntype) |   x	|  ts	 |
+ *+----+-----+-------+----+----+-------+--------+--------+
+ *|WLen| FLen|Signbit|WLen|FLen|Signbit| Qformat| Qformat|
+ *+----+-----+-------+----+----+-------+--------+--------+
+ *| 32 | 28  |	1    | 32 | 23 |   0   | 4.28	| 9.23	 |
+ *+------------------+-----------------+--------+--------+
+ */
+int32_t sofm_exp_int32(int32_t x)
+{
+	ae_int64 outhi;
+	ae_int64 outlo;
+	ae_int64 qt;
+	ae_int64 onebyfact;
+	ae_int64 temp;
+
+	ae_int64 *ponebyfact_Q63 = &onebyfact_Q63[0];
+	ae_int64 ts = SOFM_TERMS_Q23P9;
+	ae_int64 mp = (x + SOFM_LSHIFT_BITS) >> 14; /* x in Q50.14 */;
+	xtbool flag;
+
+	int64_t qt_temp;
+	int64_t b_n;
+
+	mul_s64(mp, SOFM_BIT_MASK_Q62P2, &outhi, &outlo);
+	qt = AE_OR64(AE_SLAI64(outhi, 46), AE_SRLI64(outlo, 18));
+
+	temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35);
+
+	ts = AE_ADD64(ts, temp);
+
+	mp = mul_s64_sr_sat_near(mp, (int64_t)x);
+
+	for (b_n = 0; b_n < 64;) {
+		AE_L64_IP(onebyfact, ponebyfact_Q63, 8);
+
+		mul_s64(mp, onebyfact, &outhi, &outlo);
+		qt = AE_OR64(AE_SLAI64(outhi, 45), AE_SRLI64(outlo, 19));
+
+		temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35);
+		ts = AE_ADD64(ts, temp);
+
+		mp = mul_s64_sr_sat_near(mp, (int64_t)x);
+
+		const ae_int64 sign = AE_NEG64(qt);
+
+		flag = AE_LT64(qt, 0);
+		AE_MOVT64(qt, sign, flag);
+
+		if (!(qt < (ae_int64)SOFM_CONVERG_ERROR))
+			b_n++;
+		else
+			b_n = 64;
+	}
+
+	return AE_MOVAD32_L(AE_MOVINT32X2_FROMINT64(ts));
+}
+#endif
diff --git a/test/cmocka/src/math/arithmetic/CMakeLists.txt b/test/cmocka/src/math/arithmetic/CMakeLists.txt
@@ -13,6 +13,7 @@ cmocka_test(base2_logarithm
 cmocka_test(exponential
 	exponential.c
 	${PROJECT_SOURCE_DIR}/src/math/exp_fcn.c
+	${PROJECT_SOURCE_DIR}/src/math/exp_fcn_hifi4.c
 )
 cmocka_test(square_root
 	square_root.c
diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt
@@ -700,8 +700,9 @@ zephyr_library_sources_ifdef(CONFIG_SQRT_FIXED
 	${SOF_MATH_PATH}/sqrt_int16.c
 )
 
-zephyr_library_sources_ifdef(CONFIG_EXP_FIXED
+zephyr_library_sources_ifdef(CONFIG_MATH_EXP
 	${SOF_MATH_PATH}/exp_fcn.c
+	${SOF_MATH_PATH}/exp_fcn_hifi4.c
 )
 
 zephyr_library_sources_ifdef(CONFIG_COMP_UP_DOWN_MIXER

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ cmocka_test(base2_logarithm`
`13`	`13`	`cmocka_test(exponential`
`14`	`14`	`exponential.c`
`15`	`15`	`${PROJECT_SOURCE_DIR}/src/math/exp_fcn.c`
	`16`	`+ ${PROJECT_SOURCE_DIR}/src/math/exp_fcn_hifi4.c`
`16`	`17`	`)`
`17`	`18`	`cmocka_test(square_root`
`18`	`19`	`square_root.c`
Original file line number	Diff line number	Diff line change
`@@ -700,8 +700,9 @@ zephyr_library_sources_ifdef(CONFIG_SQRT_FIXED`
`700`	`700`	`${SOF_MATH_PATH}/sqrt_int16.c`
`701`	`701`	`)`
`702`	`702`
`703`		`-zephyr_library_sources_ifdef(CONFIG_EXP_FIXED`
	`703`	`+zephyr_library_sources_ifdef(CONFIG_MATH_EXP`
`704`	`704`	`${SOF_MATH_PATH}/exp_fcn.c`
	`705`	`+ ${SOF_MATH_PATH}/exp_fcn_hifi4.c`
`705`	`706`	`)`
`706`	`707`
`707`	`708`	`zephyr_library_sources_ifdef(CONFIG_COMP_UP_DOWN_MIXER`