#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Computes atan(a) elementwise (aligned buffers, AVX2 + FMA).
 *
 * Algorithm: fold the input to non-negative (sign restored at the end),
 * map it into [1, inf) via x -> 1/x where needed, apply the half-angle
 * reduction x <- x + sqrt(1 + x^2) twice, evaluate the alternating
 * arctan power series (TERMS coefficients, Horner form), then undo the
 * reductions and identities.
 *
 * \param bVector    output buffer, 32-byte aligned; receives atan(aVector[i])
 * \param aVector    input buffer, 32-byte aligned
 * \param num_points number of float elements to process
 */
static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* z = |aVal|: fold negative lanes positive; sign restored below */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z if z >= 1, else 1/z: reduce argument into [1, inf) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the alternating arctan series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two half-angle reductions (factor 4) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* lanes that were reciprocal-reduced: atan(z) = pi/2 - atan(1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));

        arctangent = y;
        /* restore sign: atan(-x) = -atan(x) */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the remaining (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Computes atan(a) elementwise (aligned buffers, AVX).
 *
 * Same argument-reduction / power-series scheme as the FMA variant, but
 * written with separate multiply/add intrinsics so it runs on AVX-only
 * hardware.
 *
 * \param bVector    output buffer, 32-byte aligned; receives atan(aVector[i])
 * \param aVector    input buffer, 32-byte aligned
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* z = |aVal|: fold negative lanes positive; sign restored below */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z if z >= 1, else 1/z: reduce argument into [1, inf) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the alternating arctan series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two half-angle reductions (factor 4) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* lanes that were reciprocal-reduced: atan(z) = pi/2 - atan(1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        arctangent = y;
        /* restore sign: atan(-x) = -atan(x) */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the remaining (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX */
198 #ifdef LV_HAVE_SSE4_1
199 #include <smmintrin.h>
202 volk_32f_atan_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
204 float* bPtr = bVector;
205 const float* aPtr = aVector;
207 unsigned int number = 0;
208 unsigned int quarterPoints = num_points / 4;
211 __m128 aVal, pio2, x, y, z, arctangent;
212 __m128 fzeroes, fones, ftwos, ffours, condition;
220 for (; number < quarterPoints; number++) {
228 for (
i = 0;
i < 2;
i++) {
233 for (j =
TERMS - 1; j >= 0; j--) {
252 number = quarterPoints * 4;
253 for (; number < num_points; number++) {
254 *bPtr++ = atanf(*aPtr++);
#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Computes atan(a) elementwise (unaligned buffers, AVX2 + FMA).
 *
 * Identical to the aligned AVX2+FMA kernel except that it uses unaligned
 * loads/stores, so the buffers need no particular alignment.
 *
 * \param bVector    output buffer; receives atan(aVector[i])
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* z = |aVal|: fold negative lanes positive; sign restored below */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z if z >= 1, else 1/z: reduce argument into [1, inf) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the alternating arctan series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two half-angle reductions (factor 4) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* lanes that were reciprocal-reduced: atan(z) = pi/2 - atan(1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));

        arctangent = y;
        /* restore sign: atan(-x) = -atan(x) */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the remaining (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Computes atan(a) elementwise (unaligned buffers, AVX).
 *
 * Identical to the aligned AVX kernel except that it uses unaligned
 * loads/stores, so the buffers need no particular alignment.
 *
 * \param bVector    output buffer; receives atan(aVector[i])
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* z = |aVal|: fold negative lanes positive; sign restored below */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z if z >= 1, else 1/z: reduce argument into [1, inf) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the alternating arctan series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two half-angle reductions (factor 4) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* lanes that were reciprocal-reduced: atan(z) = pi/2 - atan(1/z) */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        arctangent = y;
        /* restore sign: atan(-x) = -atan(x) */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* scalar tail for the remaining (num_points % 8) elements */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Computes atan(a) elementwise (unaligned buffers, SSE4.1).
 *
 * Identical to the aligned SSE4.1 kernel except that it uses unaligned
 * loads/stores, so the buffers need no particular alignment.
 *
 * \param bVector    output buffer; receives atan(aVector[i])
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* z = |aVal|: fold negative lanes positive; sign restored below */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* x = z if z >= 1, else 1/z: reduce argument into [1, inf) */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the alternating arctan series in x^2 */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* undo the two half-angle reductions (factor 4) */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        /* lanes that were reciprocal-reduced: atan(z) = pi/2 - atan(1/z) */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y,
                       _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));

        arctangent = y;
        /* restore sign: atan(-x) = -atan(x) */
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_storeu_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the remaining (num_points % 4) elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
455 #ifdef LV_HAVE_GENERIC
460 float* bPtr = bVector;
461 const float* aPtr = aVector;
462 unsigned int number = 0;
464 for (number = 0; number < num_points; number++) {
465 *bPtr++ = atanf(*aPtr++);
/* NOTE(review): The lines below are Doxygen cross-reference residue appended
 * by the documentation extractor (sse2neon.h prototypes and a volk symbol
 * index). They are not valid C and were not part of the original source file;
 * retained here, commented out, for provenance only.
 *
 * FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)    — sse2neon.h:2834
 * float32x4_t __m128                                    — sse2neon.h:235
 * FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)    — sse2neon.h:1756
 * FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)   — sse2neon.h:2787
 * FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)    — sse2neon.h:2205
 * FORCE_INLINE __m128 _mm_set1_ps(float _w)             — sse2neon.h:2503
 * FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)  — sse2neon.h:1154
 * FORCE_INLINE __m128 _mm_loadu_ps(const float *p)      — sse2neon.h:1941
 * FORCE_INLINE __m128 _mm_setzero_ps(void)              — sse2neon.h:2531
 * FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)    — sse2neon.h:1064
 * FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)    — sse2neon.h:1039
 * FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)  — sse2neon.h:1190
 * FORCE_INLINE __m128 _mm_load_ps(const float *p)       — sse2neon.h:1858
 * FORCE_INLINE void _mm_store_ps(float *p, __m128 a)    — sse2neon.h:2704
 * FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)            — sse2neon.h:2659
 * static void volk_32f_atan_32f_u_avx(float *, const float *, unsigned int)
 *                                                       — volk_32f_atan_32f.h:334
 * static void volk_32f_atan_32f_generic(float *, const float *, unsigned int)
 *                                                       — volk_32f_atan_32f.h:458
 * #define TERMS                                         — volk_32f_atan_32f.h:63
 * static void volk_32f_atan_32f_a_avx(float *, const float *, unsigned int)
 *                                                       — volk_32f_atan_32f.h:137
 * for i                                                 — volk_config_fixed.tmpl.h:13
 */