|
| 1 | +// SPDX-License-Identifier: BSD-3-Clause |
| 2 | +/* |
| 3 | + *Copyright(c) 2023 Intel Corporation. All rights reserved. |
| 4 | + * |
| 5 | + * Author: Shriram Shastry <malladi.sastry@linux.intel.com> |
| 6 | + * |
| 7 | + */ |
| 8 | + |
| 9 | +#include <sof/common.h> |
| 10 | +#include <stdbool.h> |
| 11 | +#include <stdint.h> |
| 12 | +#include <stddef.h> |
| 13 | + |
| 14 | +#if defined(SOFM_EXPONENTIAL_HIFI4) |
| 15 | +#include <xtensa/tie/xt_hifi4.h> |
| 16 | + |
/* Bit 27 only: the most significant bit dropped by a 28-bit right shift (rounding bit) */
#define SOFM_BIT_MASK_LOW_Q27P5 0x0000000008000000
/* Bit 62 only: multiplier applied to the first-order series term */
#define SOFM_BIT_MASK_Q62P2 0x4000000000000000
/*
 * Convergence threshold: the series stops once the magnitude of a term
 * falls below this value (error smaller than 1e-4, about 2^44.71).
 */
#define SOFM_CONVERG_ERROR 0x1A36E2EB4000LL
/* Bit 34 only: rounding addend applied before the 35-bit right shift of a term */
#define SOFM_QUOTIENT_SCALE 0x400000000
/* Bit 23 only: initial value of the series sum, i.e. 1.0 in Q9.23 */
#define SOFM_TERMS_Q23P9 0x800000
/* Bit 13 only: rounding addend applied before the 14-bit right shift of the input */
#define SOFM_LSHIFT_BITS BIT(13)
| 23 | + |
| 24 | +/* |
| 25 | + * Arguments : int64_t in_0 |
| 26 | + * int64_t in_1 |
| 27 | + * uint64_t *ptroutbitshi |
| 28 | + * uint64_t *ptroutbitslo |
| 29 | + * Return Type : void |
| 30 | + * Description:Perform element-wise multiplication on in_0 and in_1 |
| 31 | + * while keeping the required product word length and fractional |
| 32 | + * length in mind. mul_s64 function divide the 64-bit quantities |
| 33 | + * into two 32-bit words,multiply the low words to produce the |
| 34 | + * lowest and second-lowest words in the result, then both pairs |
| 35 | + * of low and high words from different numbers to produce the |
| 36 | + * second and third lowest words in the result, and finally both |
| 37 | + * high words to produce the two highest words in the outcome. |
| 38 | + * Add them all up, taking carry into consideration. |
| 39 | + * |
| 40 | + * The 64 x 64 bit multiplication of operands in_0 and in_1 is |
| 41 | + * shown in the image below. The 64-bit operand in_0,in_1 is |
| 42 | + * represented by the notation in0_H, in1_H for the top 32 bits |
| 43 | + * and in0_L, in1_L for the bottom 32 bits. |
| 44 | + * |
| 45 | + * in0_H : in0_L |
| 46 | + * x in1_H : in1_L |
| 47 | + * --------------------- |
| 48 | + * P0 in0_L x in1_L |
| 49 | + * P1 in0_H x in1_L 64 bit inner multiplication |
| 50 | + * P2 in0_L x in1_H 64 bit inner multiplication |
| 51 | + * P3 in0_H x in1_H |
| 52 | + * -------------------- |
| 53 | + * [64 x 64 bit multiplication] sum of inner products |
| 54 | + * All combinations are multiplied by one another and then added. |
| 55 | + * Each inner product is moved into its proper power location.given the names |
| 56 | + * of the inner products, redoing the addition where 000 represents 32 zero |
| 57 | + * bits.The inner products can be added together in 64 bit addition.The sum |
| 58 | + * of two 64-bit numbers yields a 65-bit output. |
| 59 | + * (P0H:P0L) |
| 60 | + * P1H(P1L:000) |
| 61 | + * P2H(P2L:000) |
| 62 | + * P3H:P3L(000:000) |
| 63 | + * .......(aaa:P0L) |
| 64 | + * By combining P0H:P0L and P1L:000. This can lead to a carry, denote as CRY0. |
| 65 | + * The partial result is then multiplied by P2L:000. |
| 66 | + * We call it CRY1 because it has the potential to carry again. |
| 67 | + * (CRY0 + CRY1)P0H:P0L |
| 68 | + * ( P1H)P1L:000 |
| 69 | + * ( P2H)P2L:000 |
| 70 | + * (P3H: P3L)000:000 |
| 71 | + * -------------------- |
| 72 | + * (ccc:bbb)aaa:P0L |
| 73 | + * P1H, P2H, and P3H:P3L are added to the carry CRY0 + CRY1.This increase will |
| 74 | + * not result in an overflow. |
| 75 | + * |
| 76 | + */ |
| 77 | +static void mul_s64(ae_int64 in_0, ae_int64 in_1, ae_int64 *ptroutbitshi, ae_int64 *ptroutbitslo) |
| 78 | +{ |
| 79 | + ae_int64 producthihi, producthilo, productlolo; |
| 80 | + ae_int64 producthi, carry, product_hl_lh_h, product_hl_lh_l; |
| 81 | + |
| 82 | + ae_int32x2 in0_32 = AE_MOVINT32X2_FROMINT64(in_0); |
| 83 | + ae_int32x2 in1_32 = AE_MOVINT32X2_FROMINT64(in_1); |
| 84 | + |
| 85 | + ae_ep ep_lolo = AE_MOVEA(0); |
| 86 | + ae_ep ep_hilo = AE_MOVEA(0); |
| 87 | + ae_ep ep_HL_LH = AE_MOVEA(0); |
| 88 | + |
| 89 | + producthihi = AE_MUL32_HH(in0_32, in1_32); |
| 90 | + |
| 91 | + /* AE_MULZAAD32USEP.HL.LH - Unsigned lower parts and signed higher 32-bit parts dual */ |
| 92 | + /* multiply and accumulate operation on two 32x2 operands with 72-bit output */ |
| 93 | + /* Input-32x32-bit(in1_32xin0_32)into 72-bit multiplication operations */ |
| 94 | + /* Output-lower 64 bits of the result are stored in producthilo */ |
| 95 | + /* Output-upper eight bits are stored in ep_hilo */ |
| 96 | + AE_MULZAAD32USEP_HL_LH(ep_hilo, producthilo, in1_32, in0_32); |
| 97 | + productlolo = AE_MUL32U_LL(in0_32, in1_32); |
| 98 | + |
| 99 | + product_hl_lh_h = AE_SRAI72(ep_hilo, producthilo, 32); |
| 100 | + product_hl_lh_l = AE_SLAI64(producthilo, 32); |
| 101 | + |
| 102 | + /* The AE_ADD72 procedure adds two 72-bit elements. The first 72-bit value is created */ |
| 103 | + /* by concatenating the MSBs and LSBs of operands ep[7:0] and d[63:0]. Similarly, the */ |
| 104 | + /* second value is created by concatenating bits from operands ep1[7:0] and d1[63:0]. */ |
| 105 | + AE_ADD72(ep_lolo, productlolo, ep_HL_LH, product_hl_lh_l); |
| 106 | + |
| 107 | + carry = AE_SRAI72(ep_lolo, productlolo, 32); |
| 108 | + |
| 109 | + carry = AE_SRLI64(carry, 32); |
| 110 | + producthi = AE_ADD64(producthihi, carry); |
| 111 | + |
| 112 | + producthi = AE_ADD64(producthi, product_hl_lh_h); |
| 113 | + |
| 114 | + *ptroutbitslo = productlolo; |
| 115 | + *ptroutbitshi = producthi; |
| 116 | +} |
| 117 | + |
| 118 | +/* |
| 119 | + * Arguments : int64_t a |
| 120 | + * int64_t b |
| 121 | + * Return Type : int64_t |
| 122 | + */ |
| 123 | +static int64_t mul_s64_sr_sat_near(int64_t a, int64_t b) |
| 124 | +{ |
| 125 | + ae_int64 result; |
| 126 | + ae_int64 u64_chi; |
| 127 | + ae_int64 u64_clo; |
| 128 | + ae_int64 temp; |
| 129 | + |
| 130 | + mul_s64(a, b, &u64_chi, &u64_clo); |
| 131 | + |
| 132 | + ae_int64 roundup = AE_AND64(u64_clo, SOFM_SOFM_BIT_MASK_LOW_Q27P5); |
| 133 | + |
| 134 | + roundup = AE_SRLI64(roundup, 27); |
| 135 | + temp = AE_OR64(AE_SLAI64(u64_chi, 36), AE_SRLI64(u64_clo, 28)); |
| 136 | + |
| 137 | + return AE_ADD64(temp, roundup); |
| 138 | +} |
| 139 | + |
| 140 | +static ae_int64 onebyfact_Q63[19] = { |
| 141 | + 4611686018427387904LL, |
| 142 | + 1537228672809129301LL, |
| 143 | + 384307168202282325LL, |
| 144 | + 76861433640456465LL, |
| 145 | + 12810238940076077LL, |
| 146 | + 1830034134296582LL, |
| 147 | + 228754266787072LL, |
| 148 | + 25417140754119LL, |
| 149 | + 2541714075411LL, |
| 150 | + 231064915946LL, |
| 151 | + 19255409662LL, |
| 152 | + 1481185358LL, |
| 153 | + 105798954LL, |
| 154 | + 7053264LL, |
| 155 | + 440829LL, |
| 156 | + 25931LL, |
| 157 | + 1441LL, |
| 158 | + 76LL, |
| 159 | + 4LL |
| 160 | +}; |
| 161 | + |
| 162 | +/* f(x) = a^x, x is variable and a is base |
| 163 | + * |
| 164 | + * Arguments : int32_t x(Q4.28) |
| 165 | + * input range : -5 to 5 |
| 166 | + * |
| 167 | + * Return Type : int32_t ts(Q9.23) |
| 168 | + * output range 0.0067465305 to 148.4131488800 |
| 169 | + *+------------------+-----------------+--------+--------+ |
| 170 | + *| x | ts (returntype) | x | ts | |
| 171 | + *+----+-----+-------+----+----+-------+--------+--------+ |
| 172 | + *|WLen| FLen|Signbit|WLen|FLen|Signbit| Qformat| Qformat| |
| 173 | + *+----+-----+-------+----+----+-------+--------+--------+ |
| 174 | + *| 32 | 28 | 1 | 32 | 23 | 0 | 4.28 | 9.23 | |
| 175 | + *+------------------+-----------------+--------+--------+ |
| 176 | + */ |
| 177 | +int32_t sofm_exp_int32(int32_t x) |
| 178 | +{ |
| 179 | + ae_int64 outhi; |
| 180 | + ae_int64 outlo; |
| 181 | + ae_int64 qt; |
| 182 | + ae_int64 onebyfact; |
| 183 | + ae_int64 temp; |
| 184 | + |
| 185 | + ae_int64 *ponebyfact_Q63 = &onebyfact_Q63[0]; |
| 186 | + ae_int64 ts = SOFM_TERMS_Q23P9; |
| 187 | + ae_int64 mp = (x + SOFM_LSHIFT_BITS) >> 14; /* x in Q50.14 */; |
| 188 | + xtbool flag; |
| 189 | + |
| 190 | + int64_t qt_temp; |
| 191 | + int64_t b_n; |
| 192 | + |
| 193 | + mul_s64(mp, SOFM_BIT_MASK_Q62P2, &outhi, &outlo); |
| 194 | + qt = AE_OR64(AE_SLAI64(outhi, 46), AE_SRLI64(outlo, 18)); |
| 195 | + |
| 196 | + temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35); |
| 197 | + |
| 198 | + ts = AE_ADD64(ts, temp); |
| 199 | + |
| 200 | + mp = mul_s64_sr_sat_near(mp, (int64_t)x); |
| 201 | + |
| 202 | + for (b_n = 0; b_n < 64;) { |
| 203 | + AE_L64_IP(onebyfact, ponebyfact_Q63, 8); |
| 204 | + |
| 205 | + mul_s64(mp, onebyfact, &outhi, &outlo); |
| 206 | + qt = AE_OR64(AE_SLAI64(outhi, 45), AE_SRLI64(outlo, 19)); |
| 207 | + |
| 208 | + temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35); |
| 209 | + ts = AE_ADD64(ts, temp); |
| 210 | + |
| 211 | + mp = mul_s64_sr_sat_near(mp, (int64_t)x); |
| 212 | + |
| 213 | + const ae_int64 sign = AE_NEG64(qt); |
| 214 | + |
| 215 | + flag = AE_LT64(qt, 0); |
| 216 | + AE_MOVT64(qt, sign, flag); |
| 217 | + |
| 218 | + if (!(qt < (ae_int64)SOFM_CONVERG_ERROR)) |
| 219 | + b_n++; |
| 220 | + else |
| 221 | + b_n = 64; |
| 222 | + } |
| 223 | + |
| 224 | + return AE_MOVAD32_L(AE_MOVINT32X2_FROMINT64(ts)); |
| 225 | +} |
| 226 | +#endif |
0 commit comments