58 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
59 #define INCLUDED_volk_32f_x2_pow_32f_a_H
66 #define POW_POLY_DEGREE 3
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme helpers (FMA form): POLYn evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n with fused multiply-adds. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration. All three buffers must be 32-byte aligned.
 * ln(a) is derived from the IEEE-754 exponent field plus a polynomial in the
 * mantissa; exp() follows the Cephes-style range-reduced polynomial.
 */
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): the degree-6 coefficients were lost in extraction; these
         * are the standard minimax log2 set used by the sibling VOLK log kernels
         * — confirm against upstream. */
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme helpers (mul/add form): POLYn evaluates the degree-n
 * polynomial c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration, without FMA. Buffers must be 32-byte aligned.
 */
static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): the degree-6 coefficients were lost in extraction; these
         * are the standard minimax log2 set used by the sibling VOLK log kernels
         * — confirm against upstream. */
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 */
364 #ifdef LV_HAVE_SSE4_1
365 #include <smmintrin.h>
367 #define POLY0(x, c0) _mm_set1_ps(c0)
368 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
369 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
370 #define POLY3(x, c0, c1, c2, c3) \
371 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
372 #define POLY4(x, c0, c1, c2, c3, c4) \
373 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
374 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
375 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
377 static inline void volk_32f_x2_pow_32f_a_sse4_1(
float* cVector,
378 const float* bVector,
379 const float* aVector,
380 unsigned int num_points)
382 float* cPtr = cVector;
383 const float* bPtr = bVector;
384 const float* aPtr = aVector;
386 unsigned int number = 0;
387 const unsigned int quarterPoints = num_points / 4;
389 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
390 __m128 tmp, fx, mask, pow2n, z, y;
391 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
392 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
393 __m128i bias, exp, emm0, pi32_0x7f;
412 for (; number < quarterPoints; number++) {
426 #if POW_POLY_DEGREE == 6
427 mantissa = POLY5(frac,
434 #elif POW_POLY_DEGREE == 5
435 mantissa = POLY4(frac,
436 2.8882704548164776201f,
437 -2.52074962577807006663f,
438 1.48116647521213171641f,
439 -0.465725644288844778798f,
440 0.0596515482674574969533f);
441 #elif POW_POLY_DEGREE == 4
442 mantissa = POLY3(frac,
443 2.61761038894603480148f,
444 -1.75647175389045657003f,
445 0.688243882994381274313f,
446 -0.107254423828329604454f);
447 #elif POW_POLY_DEGREE == 3
448 mantissa = POLY2(frac,
449 2.28330284476918490682f,
450 -1.04913055217340124191f,
451 0.204446009836232697516f);
499 number = quarterPoints * 4;
500 for (; number < num_points; number++) {
501 *cPtr++ = powf(*aPtr++, *bPtr++);
509 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
510 #define INCLUDED_volk_32f_x2_pow_32f_u_H
512 #include <inttypes.h>
517 #define POW_POLY_DEGREE 3
519 #ifdef LV_HAVE_GENERIC
522 const float* bVector,
523 const float* aVector,
524 unsigned int num_points)
526 float* cPtr = cVector;
527 const float* bPtr = bVector;
528 const float* aPtr = aVector;
529 unsigned int number = 0;
531 for (number = 0; number < num_points; number++) {
532 *cPtr++ = powf(*aPtr++, *bPtr++);
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme helpers: POLYn evaluates c0 + c1*x + ... + cn*x^n. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * four floats per iteration. Unaligned buffers are permitted.
 */
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): degree-6 coefficients lost in extraction; standard minimax
         * log2 set from the sibling VOLK log kernels — confirm against upstream. */
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm =
            _mm_add_ps(_mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm_sub_ps(bVal, _mm_mul_ps(fx, exp_C1));
        bVal = _mm_sub_ps(tmp, _mm_mul_ps(fx, exp_C2));
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p2);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p3);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p4);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p5);
        y = _mm_add_ps(_mm_mul_ps(y, z), bVal);
        y = _mm_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme helpers (FMA form): POLYn evaluates c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration. Unaligned buffers are permitted.
 */
static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): degree-6 coefficients lost in extraction; standard minimax
         * log2 set from the sibling VOLK log kernels — confirm against upstream. */
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
830 #include <immintrin.h>
832 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
833 #define POLY1_AVX2(x, c0, c1) \
834 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
835 #define POLY2_AVX2(x, c0, c1, c2) \
836 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
837 #define POLY3_AVX2(x, c0, c1, c2, c3) \
838 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
839 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
840 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
841 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
842 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
844 static inline void volk_32f_x2_pow_32f_u_avx2(
float* cVector,
845 const float* bVector,
846 const float* aVector,
847 unsigned int num_points)
849 float* cPtr = cVector;
850 const float* bPtr = bVector;
851 const float* aPtr = aVector;
853 unsigned int number = 0;
854 const unsigned int eighthPoints = num_points / 8;
856 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
857 __m256 tmp, fx, mask, pow2n, z, y;
858 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
859 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
860 __m256i bias, exp, emm0, pi32_0x7f;
862 one = _mm256_set1_ps(1.0);
863 exp_hi = _mm256_set1_ps(88.3762626647949);
864 exp_lo = _mm256_set1_ps(-88.3762626647949);
865 ln2 = _mm256_set1_ps(0.6931471805);
866 log2EF = _mm256_set1_ps(1.44269504088896341);
867 half = _mm256_set1_ps(0.5);
868 exp_C1 = _mm256_set1_ps(0.693359375);
869 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
870 pi32_0x7f = _mm256_set1_epi32(0x7f);
872 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
873 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
874 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
875 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
876 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
877 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
879 for (; number < eighthPoints; number++) {
881 aVal = _mm256_loadu_ps(aPtr);
882 bias = _mm256_set1_epi32(127);
883 leadingOne = _mm256_set1_ps(1.0f);
884 exp = _mm256_sub_epi32(
885 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
886 _mm256_set1_epi32(0x7f800000)),
889 logarithm = _mm256_cvtepi32_ps(exp);
893 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
895 #if POW_POLY_DEGREE == 6
896 mantissa = POLY5_AVX2(frac,
903 #elif POW_POLY_DEGREE == 5
904 mantissa = POLY4_AVX2(frac,
905 2.8882704548164776201f,
906 -2.52074962577807006663f,
907 1.48116647521213171641f,
908 -0.465725644288844778798f,
909 0.0596515482674574969533f);
910 #elif POW_POLY_DEGREE == 4
911 mantissa = POLY3_AVX2(frac,
912 2.61761038894603480148f,
913 -1.75647175389045657003f,
914 0.688243882994381274313f,
915 -0.107254423828329604454f);
916 #elif POW_POLY_DEGREE == 3
917 mantissa = POLY2_AVX2(frac,
918 2.28330284476918490682f,
919 -1.04913055217340124191f,
920 0.204446009836232697516f);
925 logarithm = _mm256_add_ps(
926 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
927 logarithm = _mm256_mul_ps(logarithm, ln2);
930 bVal = _mm256_loadu_ps(bPtr);
931 bVal = _mm256_mul_ps(bVal, logarithm);
934 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
936 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
938 emm0 = _mm256_cvttps_epi32(fx);
939 tmp = _mm256_cvtepi32_ps(emm0);
941 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
942 fx = _mm256_sub_ps(tmp, mask);
944 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
945 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
946 z = _mm256_mul_ps(bVal, bVal);
948 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
949 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
950 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
951 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
952 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
953 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
954 y = _mm256_add_ps(y, one);
957 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
959 pow2n = _mm256_castsi256_ps(emm0);
960 cVal = _mm256_mul_ps(y, pow2n);
962 _mm256_storeu_ps(cPtr, cVal);
969 number = eighthPoints * 8;
970 for (; number < num_points; number++) {
971 *cPtr++ = pow(*aPtr++, *bPtr++);
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6087
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3230
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:521