65 #ifndef INCLUDED_volk_32f_asin_32f_a_H
66 #define INCLUDED_volk_32f_asin_32f_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Computes the arcsine (radians) of each entry of aVector into bVector.
 *
 * Method: asin(a) = atan(a / sqrt((1+a)(1-a))). The arctangent is evaluated
 * with range reduction plus a short Taylor series:
 *   - fold the argument positive (sign restored at the end),
 *   - substitute the reciprocal where z < 1 (pi/2 complement fixed up later),
 *   - apply x <- x + sqrt(1 + x^2) twice, then x <- 1/x, so the series
 *     argument is small and converges quickly,
 *   - Horner-evaluate ASIN_TERMS terms of the atan series and scale by 4.
 *
 * \param bVector    output, 32-byte aligned, holds num_points floats
 * \param aVector    input, 32-byte aligned, values expected in [-1, 1]
 * \param num_points number of entries to process
 */
static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* tan(asin(a)) = a / sqrt(1 - a^2); 1 - a^2 is factored as
         * (1 + a)(1 - a) to reduce cancellation near |a| = 1. */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z|: where z < 0, subtract 2z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Where z < 1, continue with 1/z; compensated by the pi/2 complement
         * applied after the series. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps (x <- x + sqrt(1 + x^2)), then a reciprocal,
         * shrink the series argument; the factor 4 below undoes them. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation in x^2 of the alternating atan Taylor series. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); /* undo the two halvings */
        /* Where the reduced argument exceeded 1, take the complement:
         * y = pi/2 - y (computed as y + (pi/2 - 2y)). */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* Restore the sign of the original input (negate where a < 0). */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail: use asinf (single precision) rather than asin, consistent
     * with the float data path and the SSE variants. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Computes the arcsine (radians) of each entry of aVector into bVector.
 *
 * Same algorithm as the FMA variant, written with plain AVX multiply/add:
 * asin(a) = atan(a / sqrt((1+a)(1-a))), with the arctangent evaluated via
 * range reduction and an ASIN_TERMS-term Taylor series.
 *
 * \param bVector    output, 32-byte aligned, holds num_points floats
 * \param aVector    input, 32-byte aligned, values expected in [-1, 1]
 * \param num_points number of entries to process
 */
static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* tan(asin(a)) = a / sqrt(1 - a^2), factored as (1+a)(1-a). */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z|: where z < 0, subtract 2z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Where z < 1, continue with 1/z (pi/2 complement applied later). */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions, then a reciprocal. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation in x^2 of the alternating atan Taylor series. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); /* undo the two halvings */
        /* Where the reduced argument exceeded 1: y = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        /* Restore the sign of the original input (negate where a < 0). */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail: asinf keeps the whole kernel in single precision. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX */
204 #ifdef LV_HAVE_SSE4_1
205 #include <smmintrin.h>
208 volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
210 float* bPtr = bVector;
211 const float* aPtr = aVector;
213 unsigned int number = 0;
214 unsigned int quarterPoints = num_points / 4;
217 __m128 aVal, pio2, x, y, z, arcsine;
218 __m128 fzeroes, fones, ftwos, ffours, condition;
226 for (; number < quarterPoints; number++) {
237 for (
i = 0;
i < 2;
i++) {
260 number = quarterPoints * 4;
261 for (; number < num_points; number++) {
262 *bPtr++ = asinf(*aPtr++);
270 #ifndef INCLUDED_volk_32f_asin_32f_u_H
271 #define INCLUDED_volk_32f_asin_32f_u_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Computes the arcsine (radians) of each entry of aVector into bVector.
 *
 * Unaligned AVX2+FMA variant. Method: asin(a) = atan(a / sqrt((1+a)(1-a))),
 * arctangent evaluated via range reduction and an ASIN_TERMS-term Taylor
 * series (see the aligned variant for details).
 *
 * \param bVector    output buffer, no alignment required, num_points floats
 * \param aVector    input buffer, no alignment required, values in [-1, 1]
 * \param num_points number of entries to process
 */
static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* tan(asin(a)) = a / sqrt(1 - a^2), factored as (1+a)(1-a). */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z|: where z < 0, subtract 2z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Where z < 1, continue with 1/z (pi/2 complement applied later). */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions, then a reciprocal. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation in x^2 of the alternating atan Taylor series. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); /* undo the two halvings */
        /* Where the reduced argument exceeded 1: y = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* Restore the sign of the original input (negate where a < 0). */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail: asinf keeps the whole kernel in single precision. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Computes the arcsine (radians) of each entry of aVector into bVector.
 *
 * Unaligned plain-AVX variant. Method: asin(a) = atan(a / sqrt((1+a)(1-a))),
 * arctangent evaluated via range reduction and an ASIN_TERMS-term Taylor
 * series.
 *
 * \param bVector    output buffer, no alignment required, num_points floats
 * \param aVector    input buffer, no alignment required, values in [-1, 1]
 * \param num_points number of entries to process
 */
static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* tan(asin(a)) = a / sqrt(1 - a^2), factored as (1+a)(1-a). */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z|: where z < 0, subtract 2z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Where z < 1, continue with 1/z (pi/2 complement applied later). */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle reductions, then a reciprocal. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation in x^2 of the alternating atan Taylor series. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); /* undo the two halvings */
        /* Where the reduced argument exceeded 1: y = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        /* Restore the sign of the original input (negate where a < 0). */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    /* Scalar tail: asinf keeps the whole kernel in single precision. */
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Computes the arcsine (radians) of each entry of aVector into bVector.
 *
 * Unaligned SSE4.1 variant, 4 floats per iteration. Same algorithm as the
 * AVX paths: asin(a) = atan(a / sqrt((1+a)(1-a))), arctangent via range
 * reduction plus an ASIN_TERMS-term Taylor series.
 *
 * \param bVector    output buffer, no alignment required, num_points floats
 * \param aVector    input buffer, no alignment required, values in [-1, 1]
 * \param num_points number of entries to process
 */
static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* tan(asin(a)) = a / sqrt(1 - a^2), factored as (1+a)(1-a). */
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        /* z = |z|: where z < 0, subtract 2z. */
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* Where z < 1, continue with 1/z (pi/2 complement applied later). */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* Two half-angle reductions, then a reciprocal. */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation in x^2 of the alternating atan Taylor series. */
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* undo the two halvings */
        /* Where the reduced argument exceeded 1: y = pi/2 - y. */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y,
                       _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        /* Restore the sign of the original input (negate where a < 0). */
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
474 #ifdef LV_HAVE_GENERIC
479 float* bPtr = bVector;
480 const float* aPtr = aVector;
481 unsigned int number = 0;
483 for (number = 0; number < num_points; number++) {
484 *bPtr++ = asinf(*aPtr++);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:63
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:345
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:140
static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_asin_32f.h:477
for i
Definition: volk_config_fixed.tmpl.h:13