/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
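
/*
 * Each kernel below computes c[i] = powf(a[i], b[i]) as exp(b[i] * ln(a[i])):
 * ln(a) is recovered from the IEEE-754 exponent plus a polynomial fit of the
 * mantissa, and the exponential is evaluated with a 2^n range reduction.
 * POW_POLY_DEGREE (3..6 are supported) trades accuracy against speed. Note
 * the argument order: output vector first, then the exponent vector, then
 * the base vector.
 *
 * A minimal usage sketch, assuming the usual VOLK dispatcher and allocation
 * helpers (volk_malloc / volk_get_alignment / volk_free):
 *
 * \code
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 1024;
 *   size_t align = volk_get_alignment();
 *   float* base = (float*)volk_malloc(sizeof(float) * n, align);
 *   float* expo = (float*)volk_malloc(sizeof(float) * n, align);
 *   float* out = (float*)volk_malloc(sizeof(float) * n, align);
 *   // ... fill base and expo ...
 *   volk_32f_x2_pow_32f(out, expo, base, n); // out[i] = powf(base[i], expo[i])
 *   volk_free(base);
 *   volk_free(expo);
 *   volk_free(out);
 * \endcode
 */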

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
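
// Each POLYn_AVX2_FMA macro expands to a Horner-scheme evaluation of an
// n-th degree polynomial in x, using one fused multiply-add per coefficient.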

static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

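    // log2(a) for a = 2^e * m with m in [1, 2) is e + log2(m): e is read
    // straight from the float's exponent bits minus the bias (127), and the
    // POLYn polynomial below approximates log2(m) / (m - 1), so multiplying
    // it by (m - 1) yields log2(m). The final multiply by ln2 converts
    // log2(a) into ln(a).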
    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

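        // exp(x) via range reduction: choose n = round(x * log2(e)) so that
        // exp(x) = 2^n * exp(x - n * ln(2)). exp_C1 + exp_C2 is a two-part
        // (Cody-Waite style) split of ln(2) that keeps the reduction accurate
        // in single precision; the residual feeds a degree-5 polynomial.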
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

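        // Reconstruct 2^n by writing (n + 127) into the exponent field of an
        // IEEE-754 float, then scale the polynomial result by it.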
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

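    // Handle any leftover points (num_points not a multiple of 8) in scalar code.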
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
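
// Same Horner-scheme polynomial macros as above, composed from a separate
// multiply and add for targets without FMA.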

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
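
// The unaligned (_u_) kernels below implement the same algorithms as the
// aligned ones above; the only difference is that they use unaligned loads
// and stores (loadu/storeu), so buffers need not be SIMD-aligned.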

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
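
// The generic version doubles as the portable fallback and, in VOLK's test
// harness, as the reference the SIMD protokernels are checked against.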

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */