volk_32f_asin_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* This is the number of terms of the Taylor series to evaluate;
 * increase it for more accuracy. */
#define ASIN_TERMS 2

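/* The SIMD kernels below all implement the same branch-free algorithm. As an
 * illustrative scalar sketch (a reading of the code below, not part of the
 * original file), for a single input a with |a| <= 1:
 *
 *   t = a / sqrtf((1 + a) * (1 - a));   // t = tan(asin(a)), so asin(a) = atan(t)
 *   z = fabsf(t);
 *   x = (z < 1.0f) ? 1.0f / z : z;      // fold the argument into x >= 1
 *   x = x + sqrtf(1.0f + x * x);        // half-angle step: cot(w) -> cot(w/2)
 *   x = x + sqrtf(1.0f + x * x);        // applied twice, leaving w/4
 *   x = 1.0f / x;                       // small argument for the series
 *   y = 0.0f;
 *   for (j = ASIN_TERMS - 1; j >= 0; j--)         // Horner scheme for
 *       y = y * (x * x) + pow(-1, j) / (2*j + 1); // atan(x)/x ~ 1 - x^2/3 + ...
 *   y *= 4.0f * x;                      // undo the two half-angle steps
 *   if (z > 1.0f)                       // undo the 1/z fold:
 *       y = (float)(M_PI / 2) - y;      // atan(z) = pi/2 - atan(1/z)
 *   result = (t < 0.0f) ? -y : y;       // restore the sign
 */
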
#ifndef INCLUDED_volk_32f_asin_32f_a_H
#define INCLUDED_volk_32f_asin_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* t = a / sqrt((1+a)(1-a)) = tan(asin(a)), so asin(a) = atan(t) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        /* z = |t| */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* fold the argument into x >= 1: x = (z < 1) ? 1/z : z */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two cotangent half-angle steps: cot(w) -> cot(w/2) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated atan(x)/x Taylor series */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by 4x to undo the two half-angle steps */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* where z > 1 we computed the complementary angle: y = pi/2 - y */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* restore the sign of the original input */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
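
/* Note on the branch-free masking pattern used by every kernel in this file:
 * each SIMD comparison returns a per-lane all-ones/all-zeros mask, so an
 * expression of the form
 *     v = sub(v, and(correction, condition))
 * applies `correction` only in lanes where `condition` holds. For example,
 * z - ((2*z) & (z < 0)) computes |z| without a branch. */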


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Same algorithm as the FMA variant above, using separate mul/add ops. */
static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_store_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_asin_32f_a_H */

#ifndef INCLUDED_volk_32f_asin_32f_u_H
#define INCLUDED_volk_32f_asin_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Unaligned-load twin of volk_32f_asin_32f_a_avx2_fma; see the comments there. */
static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Unaligned-load twin of volk_32f_asin_32f_a_avx. */
static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_asin_32f_u_H */
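
/* Example usage (an illustrative sketch, not taken from this file's own
 * documentation block; it assumes the generated volk_32f_asin_32f() dispatcher
 * and the volk_get_alignment()/volk_malloc()/volk_free() helpers from
 * <volk/volk.h>):
 *
 * \code
 *   #include <volk/volk.h>
 *   #include <stdio.h>
 *
 *   unsigned int N = 10;
 *   size_t alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *   // Sweep the valid asin domain [-1, 1]
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       in[ii] = 2.f * ((float)ii / (float)(N - 1)) - 1.f;
 *   }
 *
 *   // The dispatcher picks the best kernel for the running machine
 *   volk_32f_asin_32f(out, in, N);
 *
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       printf("asin(%+.4f) = %+.4f\n", in[ii], out[ii]);
 *   }
 *
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */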