Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #include <inttypes.h>
53 #include <math.h>
54 #include <stdio.h>
55 
56 #define Mln2 0.6931471805f
57 #define A 8388608.0f
58 #define B 1065353216.0f
59 #define C 60801.0f
60 
61 
62 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
63 #define INCLUDED_volk_32f_expfast_32f_a_H
64 
65 #if LV_HAVE_AVX && LV_HAVE_FMA
66 
67 #include <immintrin.h>
68 
69 static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
70  const float* aVector,
71  unsigned int num_points)
72 {
73  float* bPtr = bVector;
74  const float* aPtr = aVector;
75 
76  unsigned int number = 0;
77  const unsigned int eighthPoints = num_points / 8;
78 
79  __m256 aVal, bVal, a, b;
80  __m256i exp;
81  a = _mm256_set1_ps(A / Mln2);
82  b = _mm256_set1_ps(B - C);
83 
84  for (; number < eighthPoints; number++) {
85  aVal = _mm256_load_ps(aPtr);
86  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87  bVal = _mm256_castsi256_ps(exp);
88 
89  _mm256_store_ps(bPtr, bVal);
90  aPtr += 8;
91  bPtr += 8;
92  }
93 
94  number = eighthPoints * 8;
95  for (; number < num_points; number++) {
96  *bPtr++ = expf(*aPtr++);
97  }
98 }
99 
100 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101 
102 #ifdef LV_HAVE_AVX
103 
104 #include <immintrin.h>
105 
106 static inline void
107 volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108 {
109  float* bPtr = bVector;
110  const float* aPtr = aVector;
111 
112  unsigned int number = 0;
113  const unsigned int eighthPoints = num_points / 8;
114 
115  __m256 aVal, bVal, a, b;
116  __m256i exp;
117  a = _mm256_set1_ps(A / Mln2);
118  b = _mm256_set1_ps(B - C);
119 
120  for (; number < eighthPoints; number++) {
121  aVal = _mm256_load_ps(aPtr);
122  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123  bVal = _mm256_castsi256_ps(exp);
124 
125  _mm256_store_ps(bPtr, bVal);
126  aPtr += 8;
127  bPtr += 8;
128  }
129 
130  number = eighthPoints * 8;
131  for (; number < num_points; number++) {
132  *bPtr++ = expf(*aPtr++);
133  }
134 }
135 
136 #endif /* LV_HAVE_AVX for aligned */
137 
138 #ifdef LV_HAVE_SSE4_1
139 #include <smmintrin.h>
140 
141 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142  const float* aVector,
143  unsigned int num_points)
144 {
145  float* bPtr = bVector;
146  const float* aPtr = aVector;
147 
148  unsigned int number = 0;
149  const unsigned int quarterPoints = num_points / 4;
150 
151  __m128 aVal, bVal, a, b;
152  __m128i exp;
153  a = _mm_set1_ps(A / Mln2);
154  b = _mm_set1_ps(B - C);
155 
156  for (; number < quarterPoints; number++) {
157  aVal = _mm_load_ps(aPtr);
158  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159  bVal = _mm_castsi128_ps(exp);
160 
161  _mm_store_ps(bPtr, bVal);
162  aPtr += 4;
163  bPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  for (; number < num_points; number++) {
168  *bPtr++ = expf(*aPtr++);
169  }
170 }
171 
172 #endif /* LV_HAVE_SSE4_1 for aligned */
173 
174 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175 
176 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
177 #define INCLUDED_volk_32f_expfast_32f_u_H
178 
179 #if LV_HAVE_AVX && LV_HAVE_FMA
180 #include <immintrin.h>
181 
182 static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183  const float* aVector,
184  unsigned int num_points)
185 {
186  float* bPtr = bVector;
187  const float* aPtr = aVector;
188 
189  unsigned int number = 0;
190  const unsigned int eighthPoints = num_points / 8;
191 
192  __m256 aVal, bVal, a, b;
193  __m256i exp;
194  a = _mm256_set1_ps(A / Mln2);
195  b = _mm256_set1_ps(B - C);
196 
197  for (; number < eighthPoints; number++) {
198  aVal = _mm256_loadu_ps(aPtr);
199  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200  bVal = _mm256_castsi256_ps(exp);
201 
202  _mm256_storeu_ps(bPtr, bVal);
203  aPtr += 8;
204  bPtr += 8;
205  }
206 
207  number = eighthPoints * 8;
208  for (; number < num_points; number++) {
209  *bPtr++ = expf(*aPtr++);
210  }
211 }
212 
213 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214 
215 #ifdef LV_HAVE_AVX
216 #include <immintrin.h>
217 
218 static inline void
219 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
220 {
221  float* bPtr = bVector;
222  const float* aPtr = aVector;
223 
224  unsigned int number = 0;
225  const unsigned int eighthPoints = num_points / 8;
226 
227  __m256 aVal, bVal, a, b;
228  __m256i exp;
229  a = _mm256_set1_ps(A / Mln2);
230  b = _mm256_set1_ps(B - C);
231 
232  for (; number < eighthPoints; number++) {
233  aVal = _mm256_loadu_ps(aPtr);
234  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235  bVal = _mm256_castsi256_ps(exp);
236 
237  _mm256_storeu_ps(bPtr, bVal);
238  aPtr += 8;
239  bPtr += 8;
240  }
241 
242  number = eighthPoints * 8;
243  for (; number < num_points; number++) {
244  *bPtr++ = expf(*aPtr++);
245  }
246 }
247 
248 #endif /* LV_HAVE_AVX for unaligned */
249 
250 
251 #ifdef LV_HAVE_SSE4_1
252 #include <smmintrin.h>
253 
254 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255  const float* aVector,
256  unsigned int num_points)
257 {
258  float* bPtr = bVector;
259  const float* aPtr = aVector;
260 
261  unsigned int number = 0;
262  const unsigned int quarterPoints = num_points / 4;
263 
264  __m128 aVal, bVal, a, b;
265  __m128i exp;
266  a = _mm_set1_ps(A / Mln2);
267  b = _mm_set1_ps(B - C);
268 
269  for (; number < quarterPoints; number++) {
270  aVal = _mm_loadu_ps(aPtr);
271  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272  bVal = _mm_castsi128_ps(exp);
273 
274  _mm_storeu_ps(bPtr, bVal);
275  aPtr += 4;
276  bPtr += 4;
277  }
278 
279  number = quarterPoints * 4;
280  for (; number < num_points; number++) {
281  *bPtr++ = expf(*aPtr++);
282  }
283 }
284 
285 #endif /* LV_HAVE_SSE4_1 for unaligned */
286 
287 
288 #ifdef LV_HAVE_GENERIC
289 
290 static inline void volk_32f_expfast_32f_generic(float* bVector,
291  const float* aVector,
292  unsigned int num_points)
293 {
294  float* bPtr = bVector;
295  const float* aPtr = aVector;
296  unsigned int number = 0;
297 
298  for (number = 0; number < num_points; number++) {
299  *bPtr++ = expf(*aPtr++);
300  }
301 }
302 #endif /* LV_HAVE_GENERIC */
303 
304 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
#define Mln2
Definition: volk_32f_expfast_32f.h:56
#define B
Definition: volk_32f_expfast_32f.h:58
#define A
Definition: volk_32f_expfast_32f.h:57
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:219
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:290
#define C
Definition: volk_32f_expfast_32f.h:59
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:107