65 #ifndef INCLUDED_volk_32f_acos_32f_a_H
66 #define INCLUDED_volk_32f_acos_32f_a_H
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
/*
 * Aligned AVX2+FMA kernel: writes an arccosine estimate for each input float
 * to bVector, eight lanes per iteration, with a scalar tail for the remainder.
 * Uses _mm256_load_ps/_mm256_store_ps, so both buffers must be 32-byte aligned.
 *
 * NOTE(review): this listing is missing several original lines (the aVector
 * parameter, braces, a few assignments, the series loop header and the
 * pointer increments) -- confirm details against the upstream file.
 */
71 static inline void volk_32f_acos_32f_a_avx2_fma(
float* bVector,
73 unsigned int num_points)
75 float* bPtr = bVector;
76 const float* aPtr = aVector;
78 unsigned int number = 0;
/* Number of full 8-float vectors; the remainder is handled by the scalar tail. */
79 unsigned int eighthPoints = num_points / 8;
82 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants used by the branch-free (mask-based) selects below. */
85 pi = _mm256_set1_ps(3.14159265358979323846);
86 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87 fzeroes = _mm256_setzero_ps();
88 fones = _mm256_set1_ps(1.0);
89 ftwos = _mm256_set1_ps(2.0);
90 ffours = _mm256_set1_ps(4.0);
92 for (; number < eighthPoints; number++) {
93 aVal = _mm256_load_ps(aPtr);
/* sqrt((1 + a) * (1 - a)) divided by an operand that is missing from this
 * listing -- presumably the original input, giving tan(acos(a)); TODO confirm. */
95 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96 _mm256_sub_ps(fones, aVal))),
/* z - (2z & mask) flips the sign of the lanes where z < 0. The assignment of
 * z itself is not visible in this listing. */
99 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 get (1/z - z) added to z, i.e. become 1/z. The start of
 * this statement is missing; presumably the result is stored in x. */
101 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
103 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(x*x + 1), then the reciprocal is taken. */
105 for (
i = 0;
i < 2;
i++)
106 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107 x = _mm256_div_ps(fones, x);
/* Horner-style step y*x^2 + (-1)^j / (2j + 1), the alternating odd-reciprocal
 * coefficients of the arctangent series. The enclosing loop over j and the
 * initial value of y are missing from this listing. */
111 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Multiply the polynomial by x and by 4. */
113 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
/* Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y. */
116 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/* Negate the result in lanes where aVal < 0. The assignment of arccosine is
 * not visible here -- presumably arccosine = y. */
118 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119 arccosine = _mm256_sub_ps(
120 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/* Add pi in lanes where d < 0. d is never assigned in the visible lines --
 * presumably it holds the original input; TODO confirm. */
121 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
122 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
124 _mm256_store_ps(bPtr, arccosine);
/* Scalar tail for the last num_points % 8 elements. */
129 number = eighthPoints * 8;
130 for (; number < num_points; number++) {
/* NOTE(review): calls double-precision acos(); the SSE4.1 and generic kernels
 * in this file use acosf() -- consider making these consistent. */
131 *bPtr++ = acos(*aPtr++);
139 #include <immintrin.h>
/*
 * Body of the aligned AVX (non-FMA) arccosine kernel. The signature line is
 * absent from this listing; the cross-reference index names it
 * volk_32f_acos_32f_a_avx(float* bVector, const float* aVector,
 * unsigned int num_points).
 * Eight lanes per iteration via _mm256_load_ps/_mm256_store_ps (32-byte
 * alignment required), with a scalar tail for the remainder.
 *
 * NOTE(review): several original lines are missing here (braces, some
 * assignments, the series loop header, pointer increments) -- confirm
 * against the upstream file.
 */
144 float* bPtr = bVector;
145 const float* aPtr = aVector;
147 unsigned int number = 0;
/* Number of full 8-float vectors. */
148 unsigned int eighthPoints = num_points / 8;
151 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
152 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants used by the mask-based selects below. */
154 pi = _mm256_set1_ps(3.14159265358979323846);
155 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
156 fzeroes = _mm256_setzero_ps();
157 fones = _mm256_set1_ps(1.0);
158 ftwos = _mm256_set1_ps(2.0);
159 ffours = _mm256_set1_ps(4.0);
161 for (; number < eighthPoints; number++) {
162 aVal = _mm256_load_ps(aPtr);
/* sqrt((1 + a) * (1 - a)) divided by an operand missing from this listing --
 * presumably the original input; TODO confirm. */
164 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
165 _mm256_sub_ps(fones, aVal))),
/* Flip the sign of lanes where z < 0 (the assignment of z is not visible). */
168 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
169 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 become 1/z. The start of this statement is missing;
 * presumably the result is stored in x. */
170 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
172 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of a step that adds sqrt(1 + x*x); the first line of the loop
 * body is missing. The reciprocal is taken afterwards. */
174 for (
i = 0;
i < 2;
i++)
176 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
177 x = _mm256_div_ps(fones, x);
/* Horner-style step y*x^2 + (-1)^j / (2j + 1), written as separate mul and
 * add. The enclosing loop over j and the initial y are not visible. */
180 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
181 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Multiply the polynomial by x and by 4. */
183 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
184 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
/* Lanes with z > 1 get (pi/2 - 2y) added; the start of the statement is
 * missing, presumably y = _mm256_add_ps(. */
187 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Negate the result where aVal < 0 (the assignment of arccosine is not visible). */
189 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
190 arccosine = _mm256_sub_ps(
191 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/* Add pi where d < 0. d is never assigned in the visible lines; TODO confirm. */
192 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
193 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
195 _mm256_store_ps(bPtr, arccosine);
/* Scalar tail for the last num_points % 8 elements. */
200 number = eighthPoints * 8;
201 for (; number < num_points; number++) {
/* NOTE(review): double-precision acos(); the SSE4.1 and generic kernels in
 * this file use acosf(). */
202 *bPtr++ = acos(*aPtr++);
208 #ifdef LV_HAVE_SSE4_1
209 #include <smmintrin.h>
/*
 * Aligned SSE4.1 arccosine kernel. Only the skeleton is visible in this
 * listing: four floats per iteration (quarterPoints = num_points / 4) and a
 * scalar acosf() tail for the remainder.
 *
 * NOTE(review): the return type/storage-class line, constant setup and the
 * whole vector loop body are missing from this listing -- consult the
 * upstream file before modifying.
 */
212 volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
214 float* bPtr = bVector;
215 const float* aPtr = aVector;
217 unsigned int number = 0;
/* Number of full 4-float vectors. */
218 unsigned int quarterPoints = num_points / 4;
221 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
222 __m128 fzeroes, fones, ftwos, ffours, condition;
231 for (; number < quarterPoints; number++) {
/* Two-iteration inner loop; its body is not present in this listing. */
243 for (
i = 0;
i < 2;
i++)
/* Scalar tail for the last num_points % 4 elements. */
267 number = quarterPoints * 4;
268 for (; number < num_points; number++) {
269 *bPtr++ = acosf(*aPtr++);
278 #ifndef INCLUDED_volk_32f_acos_32f_u_H
279 #define INCLUDED_volk_32f_acos_32f_u_H
281 #if LV_HAVE_AVX2 && LV_HAVE_FMA
282 #include <immintrin.h>
/*
 * Unaligned AVX2+FMA kernel: writes an arccosine estimate for each input
 * float to bVector, eight lanes per iteration, with a scalar tail for the
 * remainder. Uses _mm256_loadu_ps/_mm256_storeu_ps, so no alignment is
 * required of either buffer.
 *
 * NOTE(review): this listing is missing several original lines (braces, a
 * few assignments, the series loop header and the pointer increments) --
 * confirm details against the upstream file.
 */
284 static inline void volk_32f_acos_32f_u_avx2_fma(
float* bVector,
285 const float* aVector,
286 unsigned int num_points)
288 float* bPtr = bVector;
289 const float* aPtr = aVector;
291 unsigned int number = 0;
/* Number of full 8-float vectors. */
292 unsigned int eighthPoints = num_points / 8;
295 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
296 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants used by the mask-based selects below. */
298 pi = _mm256_set1_ps(3.14159265358979323846);
299 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
300 fzeroes = _mm256_setzero_ps();
301 fones = _mm256_set1_ps(1.0);
302 ftwos = _mm256_set1_ps(2.0);
303 ffours = _mm256_set1_ps(4.0);
305 for (; number < eighthPoints; number++) {
306 aVal = _mm256_loadu_ps(aPtr);
/* sqrt((1 + a) * (1 - a)) divided by an operand missing from this listing --
 * presumably the original input, giving tan(acos(a)); TODO confirm. */
308 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
309 _mm256_sub_ps(fones, aVal))),
/* Flip the sign of lanes where z < 0 (the assignment of z is not visible). */
312 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
313 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 become 1/z. The start of this statement is missing;
 * presumably the result is stored in x. */
314 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
316 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(x*x + 1), then the reciprocal is taken. */
318 for (
i = 0;
i < 2;
i++)
319 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
320 x = _mm256_div_ps(fones, x);
/* Horner-style step y*x^2 + (-1)^j / (2j + 1). The enclosing loop over j and
 * the initial value of y are missing from this listing. */
324 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Multiply the polynomial by x and by 4. */
326 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
327 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
/* Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y. */
329 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
/* Negate the result where aVal < 0 (the assignment of arccosine is not visible). */
331 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
332 arccosine = _mm256_sub_ps(
333 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/* Add pi where d < 0. d is never assigned in the visible lines; TODO confirm. */
334 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
335 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
337 _mm256_storeu_ps(bPtr, arccosine);
/* Scalar tail for the last num_points % 8 elements. */
342 number = eighthPoints * 8;
343 for (; number < num_points; number++) {
/* NOTE(review): double-precision acos(); the SSE4.1 and generic kernels in
 * this file use acosf(). */
344 *bPtr++ = acos(*aPtr++);
352 #include <immintrin.h>
/*
 * Body of the unaligned AVX (non-FMA) arccosine kernel. The signature line is
 * absent from this listing; the cross-reference index names it
 * volk_32f_acos_32f_u_avx(float* bVector, const float* aVector,
 * unsigned int num_points).
 * Eight lanes per iteration via _mm256_loadu_ps/_mm256_storeu_ps, with a
 * scalar tail for the remainder.
 *
 * NOTE(review): several original lines are missing here (braces, some
 * assignments, the series loop header, pointer increments) -- confirm
 * against the upstream file.
 */
357 float* bPtr = bVector;
358 const float* aPtr = aVector;
360 unsigned int number = 0;
/* Number of full 8-float vectors. */
361 unsigned int eighthPoints = num_points / 8;
364 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
365 __m256 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants used by the mask-based selects below. */
367 pi = _mm256_set1_ps(3.14159265358979323846);
368 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
369 fzeroes = _mm256_setzero_ps();
370 fones = _mm256_set1_ps(1.0);
371 ftwos = _mm256_set1_ps(2.0);
372 ffours = _mm256_set1_ps(4.0);
374 for (; number < eighthPoints; number++) {
375 aVal = _mm256_loadu_ps(aPtr);
/* sqrt((1 + a) * (1 - a)) divided by an operand missing from this listing --
 * presumably the original input; TODO confirm. */
377 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
378 _mm256_sub_ps(fones, aVal))),
/* Flip the sign of lanes where z < 0 (the assignment of z is not visible). */
381 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
382 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 become 1/z. The start of this statement is missing;
 * presumably the result is stored in x. */
383 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
385 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
/* Two rounds of a step that adds sqrt(1 + x*x); the first line of the loop
 * body is missing. The reciprocal is taken afterwards. */
387 for (
i = 0;
i < 2;
i++)
389 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
390 x = _mm256_div_ps(fones, x);
/* Horner-style step y*x^2 + (-1)^j / (2j + 1), written as separate mul and
 * add. The enclosing loop over j and the initial y are not visible. */
393 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
394 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
/* Multiply the polynomial by x and by 4. */
396 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
397 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
/* Lanes with z > 1 get (pi/2 - 2y) added; the start of the statement is
 * missing, presumably y = _mm256_add_ps(. */
400 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
/* Negate the result where aVal < 0 (the assignment of arccosine is not visible). */
402 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
403 arccosine = _mm256_sub_ps(
404 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
/* Add pi where d < 0. d is never assigned in the visible lines; TODO confirm. */
405 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
406 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
408 _mm256_storeu_ps(bPtr, arccosine);
/* Scalar tail for the last num_points % 8 elements. */
413 number = eighthPoints * 8;
414 for (; number < num_points; number++) {
/* NOTE(review): double-precision acos(); the SSE4.1 and generic kernels in
 * this file use acosf(). */
415 *bPtr++ = acos(*aPtr++);
421 #ifdef LV_HAVE_SSE4_1
422 #include <smmintrin.h>
/*
 * Unaligned SSE4.1 arccosine kernel. Only the skeleton is visible in this
 * listing: four floats per iteration (quarterPoints = num_points / 4) and a
 * scalar acosf() tail for the remainder.
 *
 * NOTE(review): the return type/storage-class line, constant setup and the
 * whole vector loop body are missing from this listing -- consult the
 * upstream file before modifying.
 */
425 volk_32f_acos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
427 float* bPtr = bVector;
428 const float* aPtr = aVector;
430 unsigned int number = 0;
/* Number of full 4-float vectors. */
431 unsigned int quarterPoints = num_points / 4;
434 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
435 __m128 fzeroes, fones, ftwos, ffours, condition;
444 for (; number < quarterPoints; number++) {
/* Two-iteration inner loop; its body is not present in this listing. */
456 for (
i = 0;
i < 2;
i++)
/* Scalar tail for the last num_points % 4 elements. */
481 number = quarterPoints * 4;
482 for (; number < num_points; number++) {
483 *bPtr++ = acosf(*aPtr++);
489 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable reference kernel. The signature line is absent from
 * this listing; the cross-reference index gives it as
 * volk_32f_acos_32f_generic(float* bVector, const float* aVector,
 * unsigned int num_points).
 * Applies acosf() to each of the num_points inputs in order and writes the
 * result to the corresponding output slot.
 */
494 float* bPtr = bVector;
495 const float* aPtr = aVector;
496 unsigned int number = 0;
498 for (number = 0; number < num_points; number++) {
499 *bPtr++ = acosf(*aPtr++);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:492
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:63
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:355
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:142
for i
Definition: volk_config_fixed.tmpl.h:13