doxygen/volk__32f__expfast__32f_8h_source.html

 /* -*- c++ -*- */

 /*

  * Copyright 2014 Free Software Foundation, Inc.

  *

  * This file is part of VOLK

  *

  * SPDX-License-Identifier: LGPL-3.0-or-later

  */


 #include <inttypes.h>

 #include <math.h>

 #include <stdio.h>


 /* Approximation of ln(2); the SIMD kernels below scale their input by A / Mln2. */
 #define Mln2 0.6931471805f

 /* 2^23. Multiplying by A moves a value up past the 23 fraction bits of an
  * IEEE-754 single, so that after the int32 conversion and the bit-cast back
  * to float done by the kernels it sits in the exponent field. */
 #define A 8388608.0f

 /* 127 * 2^23 (0x3F800000), i.e. the bit pattern of 1.0f read as an integer. */
 #define B 1065353216.0f

 /* Subtracted from B to form the additive term (B - C) used by the kernels.
  * NOTE(review): presumably a tuning offset that reduces the approximation
  * error - confirm against the derivation of the algorithm. */
 #define C 60801.0f


 #ifndef INCLUDED_volk_32f_expfast_32f_a_H

 #define INCLUDED_volk_32f_expfast_32f_a_H


 #if LV_HAVE_AVX && LV_HAVE_FMA


 #include <immintrin.h>


 /*
  * Fast approximate exp() over a float buffer: AVX + FMA kernel, aligned buffers.
  *
  * Eight inputs at a time are mapped through (A / Mln2) * x + (B - C) with one
  * fused multiply-add, converted to 32-bit integers, and those integer bit
  * patterns are stored unchanged as the float results. Elements left over
  * after the last full group of eight are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with
  *              _mm256_store_ps, which requires 32-byte alignment.
  * aVector    - input buffer of num_points floats; read with
  *              _mm256_load_ps, which requires 32-byte alignment.
  * num_points - number of elements to process.
  */
 static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
                                                   const float* aVector,
                                                   unsigned int num_points)
 {
     /* Largest multiple of 8 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 8) * 8;
     const __m256 scale = _mm256_set1_ps(A / Mln2);
     const __m256 bias = _mm256_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 8) {
         const __m256 x = _mm256_load_ps(aVector + idx);
         const __m256i bits = _mm256_cvtps_epi32(_mm256_fmadd_ps(scale, x, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm256_store_ps(bVector + idx, _mm256_castsi256_ps(bits));
     }

     /* Tail: fewer than eight elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */


 #ifdef LV_HAVE_AVX


 #include <immintrin.h>


 /*
  * Fast approximate exp() over a float buffer: AVX kernel, aligned buffers.
  *
  * Eight inputs at a time are mapped through (A / Mln2) * x + (B - C) using a
  * separate multiply and add, converted to 32-bit integers, and those integer
  * bit patterns are stored unchanged as the float results. Elements left over
  * after the last full group of eight are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with
  *              _mm256_store_ps, which requires 32-byte alignment.
  * aVector    - input buffer of num_points floats; read with
  *              _mm256_load_ps, which requires 32-byte alignment.
  * num_points - number of elements to process.
  */
 static inline void
 volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
 {
     /* Largest multiple of 8 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 8) * 8;
     const __m256 scale = _mm256_set1_ps(A / Mln2);
     const __m256 bias = _mm256_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 8) {
         const __m256 x = _mm256_load_ps(aVector + idx);
         const __m256 scaled = _mm256_mul_ps(scale, x);
         const __m256i bits = _mm256_cvtps_epi32(_mm256_add_ps(scaled, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm256_store_ps(bVector + idx, _mm256_castsi256_ps(bits));
     }

     /* Tail: fewer than eight elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_AVX for aligned */


 #ifdef LV_HAVE_SSE4_1

 #include <smmintrin.h>


 /*
  * Fast approximate exp() over a float buffer: SSE4.1 kernel, aligned buffers.
  *
  * Four inputs at a time are mapped through (A / Mln2) * x + (B - C),
  * converted to 32-bit integers, and those integer bit patterns are stored
  * unchanged as the float results. Elements left over after the last full
  * group of four are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with
  *              _mm_store_ps, which requires 16-byte alignment.
  * aVector    - input buffer of num_points floats; read with
  *              _mm_load_ps, which requires 16-byte alignment.
  * num_points - number of elements to process.
  */
 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
 {
     /* Largest multiple of 4 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 4) * 4;
     const __m128 scale = _mm_set1_ps(A / Mln2);
     const __m128 bias = _mm_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 4) {
         const __m128 x = _mm_load_ps(aVector + idx);
         const __m128 scaled = _mm_mul_ps(scale, x);
         const __m128i bits = _mm_cvtps_epi32(_mm_add_ps(scaled, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm_store_ps(bVector + idx, _mm_castsi128_ps(bits));
     }

     /* Tail: fewer than four elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_SSE4_1 for aligned */


 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */


 #ifndef INCLUDED_volk_32f_expfast_32f_u_H

 #define INCLUDED_volk_32f_expfast_32f_u_H


 #if LV_HAVE_AVX && LV_HAVE_FMA

 #include <immintrin.h>


 /*
  * Fast approximate exp() over a float buffer: AVX + FMA kernel, unaligned
  * buffers.
  *
  * Eight inputs at a time are mapped through (A / Mln2) * x + (B - C) with one
  * fused multiply-add, converted to 32-bit integers, and those integer bit
  * patterns are stored unchanged as the float results. Elements left over
  * after the last full group of eight are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with the
  *              unaligned store _mm256_storeu_ps.
  * aVector    - input buffer of num_points floats; read with the unaligned
  *              load _mm256_loadu_ps.
  * num_points - number of elements to process.
  */
 static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
                                                   const float* aVector,
                                                   unsigned int num_points)
 {
     /* Largest multiple of 8 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 8) * 8;
     const __m256 scale = _mm256_set1_ps(A / Mln2);
     const __m256 bias = _mm256_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 8) {
         const __m256 x = _mm256_loadu_ps(aVector + idx);
         const __m256i bits = _mm256_cvtps_epi32(_mm256_fmadd_ps(scale, x, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm256_storeu_ps(bVector + idx, _mm256_castsi256_ps(bits));
     }

     /* Tail: fewer than eight elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */


 #ifdef LV_HAVE_AVX

 #include <immintrin.h>


 /*
  * Fast approximate exp() over a float buffer: AVX kernel, unaligned buffers.
  *
  * Eight inputs at a time are mapped through (A / Mln2) * x + (B - C) using a
  * separate multiply and add, converted to 32-bit integers, and those integer
  * bit patterns are stored unchanged as the float results. Elements left over
  * after the last full group of eight are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with the
  *              unaligned store _mm256_storeu_ps.
  * aVector    - input buffer of num_points floats; read with the unaligned
  *              load _mm256_loadu_ps.
  * num_points - number of elements to process.
  */
 static inline void
 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
 {
     /* Largest multiple of 8 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 8) * 8;
     const __m256 scale = _mm256_set1_ps(A / Mln2);
     const __m256 bias = _mm256_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 8) {
         const __m256 x = _mm256_loadu_ps(aVector + idx);
         const __m256 scaled = _mm256_mul_ps(scale, x);
         const __m256i bits = _mm256_cvtps_epi32(_mm256_add_ps(scaled, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm256_storeu_ps(bVector + idx, _mm256_castsi256_ps(bits));
     }

     /* Tail: fewer than eight elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_AVX for unaligned */


 #ifdef LV_HAVE_SSE4_1

 #include <smmintrin.h>


 /*
  * Fast approximate exp() over a float buffer: SSE4.1 kernel, unaligned
  * buffers.
  *
  * Four inputs at a time are mapped through (A / Mln2) * x + (B - C),
  * converted to 32-bit integers, and those integer bit patterns are stored
  * unchanged as the float results. Elements left over after the last full
  * group of four are computed with expf() instead.
  *
  * bVector    - output buffer of num_points floats; written with the
  *              unaligned store _mm_storeu_ps.
  * aVector    - input buffer of num_points floats; read with the unaligned
  *              load _mm_loadu_ps.
  * num_points - number of elements to process.
  */
 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
 {
     /* Largest multiple of 4 that does not exceed num_points. */
     const unsigned int simdEnd = (num_points / 4) * 4;
     const __m128 scale = _mm_set1_ps(A / Mln2);
     const __m128 bias = _mm_set1_ps(B - C);
     unsigned int idx;

     for (idx = 0; idx < simdEnd; idx += 4) {
         const __m128 x = _mm_loadu_ps(aVector + idx);
         const __m128 scaled = _mm_mul_ps(scale, x);
         const __m128i bits = _mm_cvtps_epi32(_mm_add_ps(scaled, bias));

         /* Bit-cast, not a numeric conversion: the integer is the float's encoding. */
         _mm_storeu_ps(bVector + idx, _mm_castsi128_ps(bits));
     }

     /* Tail: fewer than four elements remain. */
     for (; idx < num_points; idx++) {
         bVector[idx] = expf(aVector[idx]);
     }
 }


 #endif /* LV_HAVE_SSE4_1 for unaligned */


 #ifdef LV_HAVE_GENERIC


 static inline void volk_32f_expfast_32f_generic(float* bVector,

                                                 const float* aVector,

                                                 unsigned int num_points)

 {

     float* bPtr = bVector;

     const float* aPtr = aVector;

     unsigned int number = 0;


     for (number = 0; number < num_points; number++) {

         *bPtr++ = expf(*aPtr++);

     }

 }

 #endif /* LV_HAVE_GENERIC */


 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */

__m128
float32x4_t __m128
Definition: sse2neon.h:235

_mm_cvtps_epi32
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036

_mm_storeu_ps
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787

_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205

_mm_set1_ps
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503

_mm_loadu_ps
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941

_mm_castsi128_ps
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250

_mm_add_ps
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039

_mm_load_ps
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858

__m128i
int64x2_t __m128i
Definition: sse2neon.h:244

_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704

Mln2
#define Mln2
Definition: volk_32f_expfast_32f.h:56

B
#define B
Definition: volk_32f_expfast_32f.h:58

A
#define A
Definition: volk_32f_expfast_32f.h:57

volk_32f_expfast_32f_u_avx
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:219

volk_32f_expfast_32f_generic
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:290

C
#define C
Definition: volk_32f_expfast_32f.h:59

volk_32f_expfast_32f_a_avx
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:107