56 #define Mln2 0.6931471805f
58 #define B 1065353216.0f
62 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
63 #define INCLUDED_volk_32f_expfast_32f_a_H
65 #if LV_HAVE_AVX && LV_HAVE_FMA
67 #include <immintrin.h>
69 static inline void volk_32f_expfast_32f_a_avx_fma(
float* bVector,
71 unsigned int num_points)
73 float* bPtr = bVector;
74 const float* aPtr = aVector;
76 unsigned int number = 0;
77 const unsigned int eighthPoints = num_points / 8;
79 __m256 aVal, bVal, a, b;
81 a = _mm256_set1_ps(
A /
Mln2);
82 b = _mm256_set1_ps(
B -
C);
84 for (; number < eighthPoints; number++) {
85 aVal = _mm256_load_ps(aPtr);
86 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87 bVal = _mm256_castsi256_ps(exp);
89 _mm256_store_ps(bPtr, bVal);
94 number = eighthPoints * 8;
95 for (; number < num_points; number++) {
96 *bPtr++ = expf(*aPtr++);
104 #include <immintrin.h>
109 float* bPtr = bVector;
110 const float* aPtr = aVector;
112 unsigned int number = 0;
113 const unsigned int eighthPoints = num_points / 8;
115 __m256 aVal, bVal, a, b;
117 a = _mm256_set1_ps(
A /
Mln2);
118 b = _mm256_set1_ps(
B -
C);
120 for (; number < eighthPoints; number++) {
121 aVal = _mm256_load_ps(aPtr);
122 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123 bVal = _mm256_castsi256_ps(exp);
125 _mm256_store_ps(bPtr, bVal);
130 number = eighthPoints * 8;
131 for (; number < num_points; number++) {
132 *bPtr++ = expf(*aPtr++);
138 #ifdef LV_HAVE_SSE4_1
139 #include <smmintrin.h>
141 static inline void volk_32f_expfast_32f_a_sse4_1(
float* bVector,
142 const float* aVector,
143 unsigned int num_points)
145 float* bPtr = bVector;
146 const float* aPtr = aVector;
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
156 for (; number < quarterPoints; number++) {
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 *bPtr++ = expf(*aPtr++);
176 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
177 #define INCLUDED_volk_32f_expfast_32f_u_H
179 #if LV_HAVE_AVX && LV_HAVE_FMA
180 #include <immintrin.h>
182 static inline void volk_32f_expfast_32f_u_avx_fma(
float* bVector,
183 const float* aVector,
184 unsigned int num_points)
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
189 unsigned int number = 0;
190 const unsigned int eighthPoints = num_points / 8;
192 __m256 aVal, bVal, a, b;
194 a = _mm256_set1_ps(
A /
Mln2);
195 b = _mm256_set1_ps(
B -
C);
197 for (; number < eighthPoints; number++) {
198 aVal = _mm256_loadu_ps(aPtr);
199 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200 bVal = _mm256_castsi256_ps(exp);
202 _mm256_storeu_ps(bPtr, bVal);
207 number = eighthPoints * 8;
208 for (; number < num_points; number++) {
209 *bPtr++ = expf(*aPtr++);
216 #include <immintrin.h>
221 float* bPtr = bVector;
222 const float* aPtr = aVector;
224 unsigned int number = 0;
225 const unsigned int eighthPoints = num_points / 8;
227 __m256 aVal, bVal, a, b;
229 a = _mm256_set1_ps(
A /
Mln2);
230 b = _mm256_set1_ps(
B -
C);
232 for (; number < eighthPoints; number++) {
233 aVal = _mm256_loadu_ps(aPtr);
234 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235 bVal = _mm256_castsi256_ps(exp);
237 _mm256_storeu_ps(bPtr, bVal);
242 number = eighthPoints * 8;
243 for (; number < num_points; number++) {
244 *bPtr++ = expf(*aPtr++);
251 #ifdef LV_HAVE_SSE4_1
252 #include <smmintrin.h>
254 static inline void volk_32f_expfast_32f_u_sse4_1(
float* bVector,
255 const float* aVector,
256 unsigned int num_points)
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
261 unsigned int number = 0;
262 const unsigned int quarterPoints = num_points / 4;
269 for (; number < quarterPoints; number++) {
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 *bPtr++ = expf(*aPtr++);
288 #ifdef LV_HAVE_GENERIC
291 const float* aVector,
292 unsigned int num_points)
294 float* bPtr = bVector;
295 const float* aPtr = aVector;
296 unsigned int number = 0;
298 for (number = 0; number < num_points; number++) {
299 *bPtr++ = expf(*aPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
#define Mln2
Definition: volk_32f_expfast_32f.h:56
#define B
Definition: volk_32f_expfast_32f.h:58
#define A
Definition: volk_32f_expfast_32f.h:57
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:219
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:290
#define C
Definition: volk_32f_expfast_32f.h:59
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:107