Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels

volk_32f_atan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* Number of terms of the Taylor series to evaluate; increase this for more
 * accuracy. */
#define TERMS 2
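
For reference, the SIMD kernels below combine a short Taylor expansion with three standard argument-reduction identities. This derivation is editorial, inferred from the code rather than stated in the source:

\[
\arctan(x) = \sum_{j=0}^{\infty} \frac{(-1)^j\, x^{2j+1}}{2j+1} \;\; (|x| \le 1), \qquad
\arctan(x) = \frac{\pi}{2} - \arctan\frac{1}{x} \;\; (x > 0), \qquad
\cot\frac{\theta}{2} = \cot\theta + \sqrt{1 + \cot^2\theta}.
\]

Each x = x + sqrt(1 + x*x) step below is the cotangent half-angle identity: it halves the angle whose cotangent is x. Two steps quarter the angle, the Horner loop evaluates the TERMS-term Taylor series at the quartered angle's tangent (for TERMS = 2 that is x - x^3/3), and the final multiply by 4 restores the original angle.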

#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* z = |a| */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = 1/z for z < 1, else x = z, so that x >= 1 (x = cot(phi)) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two cotangent half-angle steps: x = cot(phi) -> cot(phi/4) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x); /* x = tan(phi/4) */
        y = fzeroes;
        /* Horner evaluation of the Taylor series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); /* y ~= 4 * (phi/4) = phi */
        /* Undo the reciprocal reduction: atan(z) = pi/2 - atan(1/z) for z > 1 */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        /* Restore the sign: atan(-a) = -atan(a) */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
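
The lane-wise logic above is easier to follow as scalar code. The following is an editorial sketch of the same algorithm; the name atan_approx_scalar is illustrative and not part of VOLK:

/* Editorial sketch: the per-lane algorithm of the kernels in this file,
 * written as plain scalar C. Illustrative only; not part of VOLK. */
static inline float atan_approx_scalar(float a)
{
    const float pio2 = 3.14159265358979323846f / 2.0f;
    float z = fabsf(a);
    float x = (z < 1.0f) ? 1.0f / z : z; /* x >= 1, i.e. x = cot(phi) */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++) {
        x += sqrtf(1.0f + x * x); /* half-angle step: phi -> phi/2 */
    }
    x = 1.0f / x; /* x = tan(phi/4) */
    for (j = TERMS - 1; j >= 0; j--) {
        /* Horner evaluation of the Taylor series in x^2 */
        y = y * (x * x) + powf(-1.0f, (float)j) / (2.0f * (float)j + 1.0f);
    }
    y *= x * 4.0f; /* y ~= phi */
    if (z > 1.0f) {
        y = pio2 - y; /* atan(z) = pi/2 - atan(1/z) */
    }
    return (a < 0.0f) ? -y : y;
}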


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Same algorithm as the FMA variant above, with separate multiply/add. */
static inline void
volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_store_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_atan_32f_a_H */
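
In practice callers go through the VOLK dispatcher rather than a specific kernel. A minimal usage sketch, assuming the public VOLK API (volk_malloc, volk_get_alignment, volk_free, and the volk_32f_atan_32f dispatch symbol from volk/volk.h):

/* Editorial usage sketch; assumes the public VOLK API. */
#include <volk/volk.h>

void example_atan(void)
{
    unsigned int n = 1000;
    unsigned int k;
    /* volk_malloc returns memory aligned for the widest enabled kernel,
     * so the dispatcher can select an aligned (_a_) implementation. */
    float* in = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
    float* out = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());

    for (k = 0; k < n; k++) {
        in[k] = -5.0f + 10.0f * (float)k / (float)n;
    }

    volk_32f_atan_32f(out, in, n); /* picks the best kernel at run time */

    volk_free(in);
    volk_free(out);
}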

#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Unaligned-load version of volk_32f_atan_32f_a_avx2_fma; see the comments
 * there for the algorithm. */
static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_storeu_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_atan_32f_u_H */
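
With TERMS = 2 the polynomial is short, so it is worth checking how close the approximation comes to libm. A quick editorial sketch that sweeps the scalar model atan_approx_scalar from earlier against atanf (a hypothetical test harness, not a VOLK test):

/* Editorial sketch: measure the worst-case error of the scalar model
 * against libm atanf over a sample range. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    float max_err = 0.0f;
    int k;
    for (k = -100000; k <= 100000; k++) {
        float a = (float)k / 10000.0f; /* sweep [-10, 10] */
        float err = fabsf(atan_approx_scalar(a) - atanf(a));
        if (err > max_err) {
            max_err = err;
        }
    }
    printf("max abs error on [-10, 10]: %g\n", (double)max_err);
    return 0;
}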