58 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
59 #define INCLUDED_volk_32f_x2_pow_32f_a_H
66 #define POW_POLY_DEGREE 3
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme helpers (FMA form): POLYn evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n with fused multiply-adds. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration. All three buffers must be 32-byte aligned.
 * ln(a) is derived from the IEEE-754 exponent field plus a polynomial in the
 * mantissa; exp() follows the Cephes-style range-reduced polynomial.
 */
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): the degree-6 coefficients were lost in extraction; these
         * are the standard minimax log2 set used by the sibling VOLK log kernels
         * — confirm against upstream. */
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme helpers (mul/add form): POLYn evaluates the degree-n
 * polynomial c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration, without FMA. Buffers must be 32-byte aligned.
 */
static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): the degree-6 coefficients were lost in extraction; these
         * are the standard minimax log2 set used by the sibling VOLK log kernels
         * — confirm against upstream. */
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 */
364 #ifdef LV_HAVE_SSE4_1
365 #include <smmintrin.h>
367 #define POLY0(x, c0) _mm_set1_ps(c0)
368 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
369 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
370 #define POLY3(x, c0, c1, c2, c3) \
371 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
372 #define POLY4(x, c0, c1, c2, c3, c4) \
373 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
374 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
375 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
377 static inline void volk_32f_x2_pow_32f_a_sse4_1(
float* cVector,
378 const float* bVector,
379 const float* aVector,
380 unsigned int num_points)
382 float* cPtr = cVector;
383 const float* bPtr = bVector;
384 const float* aPtr = aVector;
386 unsigned int number = 0;
387 const unsigned int quarterPoints = num_points / 4;
389 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
390 __m128 tmp, fx, mask, pow2n, z, y;
391 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
392 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
393 __m128i bias, exp, emm0, pi32_0x7f;
412 for (; number < quarterPoints; number++) {
426 #if POW_POLY_DEGREE == 6
427 mantissa = POLY5(frac,
434 #elif POW_POLY_DEGREE == 5
435 mantissa = POLY4(frac,
436 2.8882704548164776201f,
437 -2.52074962577807006663f,
438 1.48116647521213171641f,
439 -0.465725644288844778798f,
440 0.0596515482674574969533f);
441 #elif POW_POLY_DEGREE == 4
442 mantissa = POLY3(frac,
443 2.61761038894603480148f,
444 -1.75647175389045657003f,
445 0.688243882994381274313f,
446 -0.107254423828329604454f);
447 #elif POW_POLY_DEGREE == 3
448 mantissa = POLY2(frac,
449 2.28330284476918490682f,
450 -1.04913055217340124191f,
451 0.204446009836232697516f);
499 number = quarterPoints * 4;
500 for (; number < num_points; number++) {
501 *cPtr++ = powf(*aPtr++, *bPtr++);
509 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
510 #define INCLUDED_volk_32f_x2_pow_32f_u_H
512 #include <inttypes.h>
517 #define POW_POLY_DEGREE 3
519 #ifdef LV_HAVE_GENERIC
522 const float* bVector,
523 const float* aVector,
524 unsigned int num_points)
526 float* cPtr = cVector;
527 const float* bPtr = bVector;
528 const float* aPtr = aVector;
529 unsigned int number = 0;
531 for (number = 0; number < num_points; number++) {
532 *cPtr++ = powf(*aPtr++, *bPtr++);
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme helpers: POLYn evaluates c0 + c1*x + ... + cn*x^n. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * four floats per iteration. Unaligned buffers are permitted.
 */
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): degree-6 coefficients lost in extraction; standard minimax
         * log2 set from the sibling VOLK log kernels — confirm against upstream. */
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm =
            _mm_add_ps(_mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm_sub_ps(bVal, _mm_mul_ps(fx, exp_C1));
        bVal = _mm_sub_ps(tmp, _mm_mul_ps(fx, exp_C2));
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p2);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p3);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p4);
        y = _mm_add_ps(_mm_mul_ps(y, bVal), exp_p5);
        y = _mm_add_ps(_mm_mul_ps(y, z), bVal);
        y = _mm_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme helpers (FMA form): POLYn evaluates c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = aVector[i] ^ bVector[i] as exp(b * ln(a)),
 * eight floats per iteration. Unaligned buffers are permitted.
 */
static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* clamp bounds so exp() stays finite */
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);   /* exp_C1 + exp_C2 is a split of ln(2) */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- logarithm: exponent bits give floor(log2(a)) ---- */
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1, 2): keep fraction bits, OR in the implicit leading one */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        /* NOTE(review): degree-6 coefficients lost in extraction; standard minimax
         * log2 set from the sibling VOLK log kernels — confirm against upstream. */
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); multiply by ln(2) -> ln(a) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- b * ln(a) ---- */
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* ---- exp(b * ln(a)), Cephes-style ---- */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-toward-minus-infinity of x*log2(e) */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* range-reduce: x -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* 2^fx by building the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++); /* powf: float tail, matches generic kernel */
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
830 #include <immintrin.h>
832 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
833 #define POLY1_AVX2(x, c0, c1) \
834 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
835 #define POLY2_AVX2(x, c0, c1, c2) \
836 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
837 #define POLY3_AVX2(x, c0, c1, c2, c3) \
838 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
839 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
840 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
841 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
842 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
844 static inline void volk_32f_x2_pow_32f_u_avx2(
float* cVector,
845 const float* bVector,
846 const float* aVector,
847 unsigned int num_points)
849 float* cPtr = cVector;
850 const float* bPtr = bVector;
851 const float* aPtr = aVector;
853 unsigned int number = 0;
854 const unsigned int eighthPoints = num_points / 8;
856 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
857 __m256 tmp, fx, mask, pow2n, z, y;
858 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
859 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
860 __m256i bias, exp, emm0, pi32_0x7f;
862 one = _mm256_set1_ps(1.0);
863 exp_hi = _mm256_set1_ps(88.3762626647949);
864 exp_lo = _mm256_set1_ps(-88.3762626647949);
865 ln2 = _mm256_set1_ps(0.6931471805);
866 log2EF = _mm256_set1_ps(1.44269504088896341);
867 half = _mm256_set1_ps(0.5);
868 exp_C1 = _mm256_set1_ps(0.693359375);
869 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
870 pi32_0x7f = _mm256_set1_epi32(0x7f);
872 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
873 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
874 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
875 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
876 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
877 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
879 for (; number < eighthPoints; number++) {
881 aVal = _mm256_loadu_ps(aPtr);
882 bias = _mm256_set1_epi32(127);
883 leadingOne = _mm256_set1_ps(1.0f);
884 exp = _mm256_sub_epi32(
885 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
886 _mm256_set1_epi32(0x7f800000)),
889 logarithm = _mm256_cvtepi32_ps(exp);
893 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
895 #if POW_POLY_DEGREE == 6
896 mantissa = POLY5_AVX2(frac,
903 #elif POW_POLY_DEGREE == 5
904 mantissa = POLY4_AVX2(frac,
905 2.8882704548164776201f,
906 -2.52074962577807006663f,
907 1.48116647521213171641f,
908 -0.465725644288844778798f,
909 0.0596515482674574969533f);
910 #elif POW_POLY_DEGREE == 4
911 mantissa = POLY3_AVX2(frac,
912 2.61761038894603480148f,
913 -1.75647175389045657003f,
914 0.688243882994381274313f,
915 -0.107254423828329604454f);
916 #elif POW_POLY_DEGREE == 3
917 mantissa = POLY2_AVX2(frac,
918 2.28330284476918490682f,
919 -1.04913055217340124191f,
920 0.204446009836232697516f);
925 logarithm = _mm256_add_ps(
926 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
927 logarithm = _mm256_mul_ps(logarithm, ln2);
930 bVal = _mm256_loadu_ps(bPtr);
931 bVal = _mm256_mul_ps(bVal, logarithm);
934 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
936 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
938 emm0 = _mm256_cvttps_epi32(fx);
939 tmp = _mm256_cvtepi32_ps(emm0);
941 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
942 fx = _mm256_sub_ps(tmp, mask);
944 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
945 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
946 z = _mm256_mul_ps(bVal, bVal);
948 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
949 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
950 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
951 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
952 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
953 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
954 y = _mm256_add_ps(y, one);
957 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
959 pow2n = _mm256_castsi256_ps(emm0);
960 cVal = _mm256_mul_ps(y, pow2n);
962 _mm256_storeu_ps(cPtr, cVal);
969 number = eighthPoints * 8;
970 for (; number < num_points; number++) {
971 *cPtr++ = pow(*aPtr++, *bPtr++);
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6087
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3230
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:521