Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #include <inttypes.h>
58 #include <math.h>
59 #include <stdio.h>
60 
61 /* This is the number of terms of Taylor series to evaluate, increase this for more
62  * accuracy*/
63 #define ACOS_TERMS 2
64 
65 #ifndef INCLUDED_volk_32f_acos_32f_a_H
66 #define INCLUDED_volk_32f_acos_32f_a_H
67 
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70 
71 static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72  const float* aVector,
73  unsigned int num_points)
74 {
75  float* bPtr = bVector;
76  const float* aPtr = aVector;
77 
78  unsigned int number = 0;
79  unsigned int eighthPoints = num_points / 8;
80  int i, j;
81 
82  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83  __m256 fzeroes, fones, ftwos, ffours, condition;
84 
85  pi = _mm256_set1_ps(3.14159265358979323846);
86  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87  fzeroes = _mm256_setzero_ps();
88  fones = _mm256_set1_ps(1.0);
89  ftwos = _mm256_set1_ps(2.0);
90  ffours = _mm256_set1_ps(4.0);
91 
92  for (; number < eighthPoints; number++) {
93  aVal = _mm256_load_ps(aPtr);
94  d = aVal;
95  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96  _mm256_sub_ps(fones, aVal))),
97  aVal);
98  z = aVal;
99  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102  x = _mm256_add_ps(
103  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104 
105  for (i = 0; i < 2; i++)
106  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107  x = _mm256_div_ps(fones, x);
108  y = fzeroes;
109  for (j = ACOS_TERMS - 1; j >= 0; j--)
110  y = _mm256_fmadd_ps(
111  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
112 
113  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
115 
116  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
117  arccosine = y;
118  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119  arccosine = _mm256_sub_ps(
120  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
121  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
123 
124  _mm256_store_ps(bPtr, arccosine);
125  aPtr += 8;
126  bPtr += 8;
127  }
128 
129  number = eighthPoints * 8;
130  for (; number < num_points; number++) {
131  *bPtr++ = acos(*aPtr++);
132  }
133 }
134 
135 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
136 
137 
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
140 
141 static inline void
142 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
143 {
144  float* bPtr = bVector;
145  const float* aPtr = aVector;
146 
147  unsigned int number = 0;
148  unsigned int eighthPoints = num_points / 8;
149  int i, j;
150 
151  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
152  __m256 fzeroes, fones, ftwos, ffours, condition;
153 
154  pi = _mm256_set1_ps(3.14159265358979323846);
155  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156  fzeroes = _mm256_setzero_ps();
157  fones = _mm256_set1_ps(1.0);
158  ftwos = _mm256_set1_ps(2.0);
159  ffours = _mm256_set1_ps(4.0);
160 
161  for (; number < eighthPoints; number++) {
162  aVal = _mm256_load_ps(aPtr);
163  d = aVal;
164  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165  _mm256_sub_ps(fones, aVal))),
166  aVal);
167  z = aVal;
168  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
170  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
171  x = _mm256_add_ps(
172  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
173 
174  for (i = 0; i < 2; i++)
175  x = _mm256_add_ps(x,
176  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
177  x = _mm256_div_ps(fones, x);
178  y = fzeroes;
179  for (j = ACOS_TERMS - 1; j >= 0; j--)
180  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
182 
183  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
184  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
185 
186  y = _mm256_add_ps(
187  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
188  arccosine = y;
189  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190  arccosine = _mm256_sub_ps(
191  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
192  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
194 
195  _mm256_store_ps(bPtr, arccosine);
196  aPtr += 8;
197  bPtr += 8;
198  }
199 
200  number = eighthPoints * 8;
201  for (; number < num_points; number++) {
202  *bPtr++ = acos(*aPtr++);
203  }
204 }
205 
206 #endif /* LV_HAVE_AVX2 for aligned */
207 
208 #ifdef LV_HAVE_SSE4_1
209 #include <smmintrin.h>
210 
211 static inline void
212 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
213 {
214  float* bPtr = bVector;
215  const float* aPtr = aVector;
216 
217  unsigned int number = 0;
218  unsigned int quarterPoints = num_points / 4;
219  int i, j;
220 
221  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
222  __m128 fzeroes, fones, ftwos, ffours, condition;
223 
224  pi = _mm_set1_ps(3.14159265358979323846);
225  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
226  fzeroes = _mm_setzero_ps();
227  fones = _mm_set1_ps(1.0);
228  ftwos = _mm_set1_ps(2.0);
229  ffours = _mm_set1_ps(4.0);
230 
231  for (; number < quarterPoints; number++) {
232  aVal = _mm_load_ps(aPtr);
233  d = aVal;
234  aVal = _mm_div_ps(
235  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
236  aVal);
237  z = aVal;
238  condition = _mm_cmplt_ps(z, fzeroes);
239  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
240  condition = _mm_cmplt_ps(z, fones);
241  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
242 
243  for (i = 0; i < 2; i++)
244  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245  x = _mm_div_ps(fones, x);
246  y = fzeroes;
247  for (j = ACOS_TERMS - 1; j >= 0; j--)
248  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
249  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
250 
251  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
252  condition = _mm_cmpgt_ps(z, fones);
253 
254  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255  arccosine = y;
256  condition = _mm_cmplt_ps(aVal, fzeroes);
257  arccosine =
258  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
259  condition = _mm_cmplt_ps(d, fzeroes);
260  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
261 
262  _mm_store_ps(bPtr, arccosine);
263  aPtr += 4;
264  bPtr += 4;
265  }
266 
267  number = quarterPoints * 4;
268  for (; number < num_points; number++) {
269  *bPtr++ = acosf(*aPtr++);
270  }
271 }
272 
273 #endif /* LV_HAVE_SSE4_1 for aligned */
274 
275 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
276 
277 
278 #ifndef INCLUDED_volk_32f_acos_32f_u_H
279 #define INCLUDED_volk_32f_acos_32f_u_H
280 
281 #if LV_HAVE_AVX2 && LV_HAVE_FMA
282 #include <immintrin.h>
283 
284 static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
285  const float* aVector,
286  unsigned int num_points)
287 {
288  float* bPtr = bVector;
289  const float* aPtr = aVector;
290 
291  unsigned int number = 0;
292  unsigned int eighthPoints = num_points / 8;
293  int i, j;
294 
295  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
296  __m256 fzeroes, fones, ftwos, ffours, condition;
297 
298  pi = _mm256_set1_ps(3.14159265358979323846);
299  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300  fzeroes = _mm256_setzero_ps();
301  fones = _mm256_set1_ps(1.0);
302  ftwos = _mm256_set1_ps(2.0);
303  ffours = _mm256_set1_ps(4.0);
304 
305  for (; number < eighthPoints; number++) {
306  aVal = _mm256_loadu_ps(aPtr);
307  d = aVal;
308  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309  _mm256_sub_ps(fones, aVal))),
310  aVal);
311  z = aVal;
312  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
314  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
315  x = _mm256_add_ps(
316  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
317 
318  for (i = 0; i < 2; i++)
319  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
320  x = _mm256_div_ps(fones, x);
321  y = fzeroes;
322  for (j = ACOS_TERMS - 1; j >= 0; j--)
323  y = _mm256_fmadd_ps(
324  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
325 
326  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
327  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
328 
329  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
330  arccosine = y;
331  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332  arccosine = _mm256_sub_ps(
333  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
334  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
336 
337  _mm256_storeu_ps(bPtr, arccosine);
338  aPtr += 8;
339  bPtr += 8;
340  }
341 
342  number = eighthPoints * 8;
343  for (; number < num_points; number++) {
344  *bPtr++ = acos(*aPtr++);
345  }
346 }
347 
348 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
349 
350 
351 #ifdef LV_HAVE_AVX
352 #include <immintrin.h>
353 
354 static inline void
355 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
356 {
357  float* bPtr = bVector;
358  const float* aPtr = aVector;
359 
360  unsigned int number = 0;
361  unsigned int eighthPoints = num_points / 8;
362  int i, j;
363 
364  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
365  __m256 fzeroes, fones, ftwos, ffours, condition;
366 
367  pi = _mm256_set1_ps(3.14159265358979323846);
368  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369  fzeroes = _mm256_setzero_ps();
370  fones = _mm256_set1_ps(1.0);
371  ftwos = _mm256_set1_ps(2.0);
372  ffours = _mm256_set1_ps(4.0);
373 
374  for (; number < eighthPoints; number++) {
375  aVal = _mm256_loadu_ps(aPtr);
376  d = aVal;
377  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378  _mm256_sub_ps(fones, aVal))),
379  aVal);
380  z = aVal;
381  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
383  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
384  x = _mm256_add_ps(
385  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
386 
387  for (i = 0; i < 2; i++)
388  x = _mm256_add_ps(x,
389  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
390  x = _mm256_div_ps(fones, x);
391  y = fzeroes;
392  for (j = ACOS_TERMS - 1; j >= 0; j--)
393  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
395 
396  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
397  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
398 
399  y = _mm256_add_ps(
400  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
401  arccosine = y;
402  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403  arccosine = _mm256_sub_ps(
404  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
405  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
407 
408  _mm256_storeu_ps(bPtr, arccosine);
409  aPtr += 8;
410  bPtr += 8;
411  }
412 
413  number = eighthPoints * 8;
414  for (; number < num_points; number++) {
415  *bPtr++ = acos(*aPtr++);
416  }
417 }
418 
419 #endif /* LV_HAVE_AVX2 for unaligned */
420 
421 #ifdef LV_HAVE_SSE4_1
422 #include <smmintrin.h>
423 
424 static inline void
425 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
426 {
427  float* bPtr = bVector;
428  const float* aPtr = aVector;
429 
430  unsigned int number = 0;
431  unsigned int quarterPoints = num_points / 4;
432  int i, j;
433 
434  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
435  __m128 fzeroes, fones, ftwos, ffours, condition;
436 
437  pi = _mm_set1_ps(3.14159265358979323846);
438  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
439  fzeroes = _mm_setzero_ps();
440  fones = _mm_set1_ps(1.0);
441  ftwos = _mm_set1_ps(2.0);
442  ffours = _mm_set1_ps(4.0);
443 
444  for (; number < quarterPoints; number++) {
445  aVal = _mm_loadu_ps(aPtr);
446  d = aVal;
447  aVal = _mm_div_ps(
448  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
449  aVal);
450  z = aVal;
451  condition = _mm_cmplt_ps(z, fzeroes);
452  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
453  condition = _mm_cmplt_ps(z, fones);
454  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
455 
456  for (i = 0; i < 2; i++)
457  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
458  x = _mm_div_ps(fones, x);
459  y = fzeroes;
460 
461  for (j = ACOS_TERMS - 1; j >= 0; j--)
462  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
463  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
464 
465  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
466  condition = _mm_cmpgt_ps(z, fones);
467 
468  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
469  arccosine = y;
470  condition = _mm_cmplt_ps(aVal, fzeroes);
471  arccosine =
472  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
473  condition = _mm_cmplt_ps(d, fzeroes);
474  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
475 
476  _mm_storeu_ps(bPtr, arccosine);
477  aPtr += 4;
478  bPtr += 4;
479  }
480 
481  number = quarterPoints * 4;
482  for (; number < num_points; number++) {
483  *bPtr++ = acosf(*aPtr++);
484  }
485 }
486 
487 #endif /* LV_HAVE_SSE4_1 for aligned */
488 
489 #ifdef LV_HAVE_GENERIC
490 
491 static inline void
492 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
493 {
494  float* bPtr = bVector;
495  const float* aPtr = aVector;
496  unsigned int number = 0;
497 
498  for (number = 0; number < num_points; number++) {
499  *bPtr++ = acosf(*aPtr++);
500  }
501 }
502 #endif /* LV_HAVE_GENERIC */
503 
504 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:492
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:63
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:355
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:142
for i
Definition: volk_config_fixed.tmpl.h:13