#ifndef INCLUDED_volk_32f_tan_32f_a_H
#define INCLUDED_volk_32f_tan_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4, high part */
    pio4B = _mm256_set1_ps(0.241876e-3); /* pi/4, low part */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal| */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Reduce the argument: s -= r * pi/4, split into two steps for accuracy */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); /* 2^N, to compensate the N = 3 doubling steps below */
        s = _mm256_mul_ps(s, s);
        /* Evaluate the Taylor series */
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* Map sine/cosine into the right quadrant before dividing */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
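
/*
 * For reference, the arithmetic scheme that the SIMD kernels in this file
 * implement, written out for a single float. This sketch is not part of the
 * original kernel set; the function name is illustrative only, and it assumes
 * <math.h> is available (the kernels above already rely on it for tan()/tanf()).
 * The quadrant fix-up is expressed here with plain boolean tests rather than
 * the compare-mask selects used by the vector code.
 */
static inline float volk_32f_tan_32f_scalar_sketch(float x)
{
    const float m4pi = 1.273239545f;  /* 4/pi */
    const float pio4A = 0.78515625f;  /* pi/4, high part */
    const float pio4B = 0.241876e-3f; /* pi/4, low part */
    float s = fabsf(x);
    int q = (int)floorf(s * m4pi);   /* index of the pi/4 octant */
    int r = q + (q & 1);             /* rounded up to an even multiple of pi/4 */
    s = (s - r * pio4A) - r * pio4B; /* reduced argument in [-pi/4, pi/4] */
    s = s / 8.0f;                    /* scale down; undone by 3 doubling steps */
    s = s * s;
    /* Taylor series for 2*(1 - cos(y)) in powers of y^2 */
    s = ((((s * 0.551e-6f - 0.49603e-4f) * s + 0.2777778e-2f) * s - 0.83333333e-1f) * s +
         1.0f) *
        s;
    for (int i = 0; i < 3; i++) {
        s = s * (4.0f - s); /* angle doubling: 2(1-cos y) -> 2(1-cos 2y) */
    }
    s = s / 2.0f;                       /* s = 1 - cos(reduced argument) */
    float sine = sqrtf((2.0f - s) * s); /* |sin| of the reduced argument */
    float cosine = 1.0f - s;
    if (((q + 1) & 2) != 0) { /* odd half-quadrants: swap sine and cosine */
        float swap = cosine;
        cosine = sine;
        sine = swap;
    }
    if (((q & 4) != 0) != (x < 0.0f)) /* sign of sin(x) */
        sine = -sine;
    if (((q + 2) & 4) != 0) /* sign of cos(x) */
        cosine = -cosine;
    return sine / cosine;
}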

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4, high part */
    pio4B = _mm256_set1_ps(0.241876e-3); /* pi/4, low part */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal| */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Reduce the argument: s -= r * pi/4, split into two steps for accuracy */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); /* 2^N, to compensate the N = 3 doubling steps below */
        s = _mm256_mul_ps(s, s);
        /* Evaluate the Taylor series */
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* Map sine/cosine into the right quadrant before dividing */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    /* ... constant set-up not reproduced in this listing; it loads the same
       values as the AVX2 kernels above with _mm_set1_ps() / _mm_set1_epi32() ... */

    for (; number < quarterPoints; number++) {
        /* ... argument reduction and Taylor-series evaluation not reproduced in
           this listing; they mirror the AVX2 kernels with 128-bit intrinsics,
           4 floats per iteration ... */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        /* ... sine/cosine recovery, quadrant fix-up and aligned _mm_store_ps()
           not reproduced in this listing ... */
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */

#endif /* INCLUDED_volk_32f_tan_32f_a_H */

#ifndef INCLUDED_volk_32f_tan_32f_u_H
#define INCLUDED_volk_32f_tan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4, high part */
    pio4B = _mm256_set1_ps(0.241876e-3); /* pi/4, low part */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal| */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Reduce the argument: s -= r * pi/4, split into two steps for accuracy */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); /* 2^N, to compensate the N = 3 doubling steps below */
        s = _mm256_mul_ps(s, s);
        /* Evaluate the Taylor series */
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* Map sine/cosine into the right quadrant before dividing */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4, high part */
    pio4B = _mm256_set1_ps(0.241876e-3); /* pi/4, low part */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal| */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* Reduce the argument: s -= r * pi/4, split into two steps for accuracy */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); /* 2^N, to compensate the N = 3 doubling steps below */
        s = _mm256_mul_ps(s, s);
        /* Evaluate the Taylor series */
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* Map sine/cosine into the right quadrant before dividing */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = tan(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    /* ... constant set-up not reproduced in this listing; it loads the same
       values as the AVX2 kernels above with _mm_set1_ps() / _mm_set1_epi32() ... */

    for (; number < quarterPoints; number++) {
        /* ... argument reduction and Taylor-series evaluation not reproduced in
           this listing; they mirror the AVX2 kernels with 128-bit intrinsics,
           4 floats per iteration ... */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        /* ... sine/cosine recovery, quadrant fix-up and unaligned _mm_storeu_ps()
           not reproduced in this listing ... */
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
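
/*
 * Usage sketch (not part of the original header): the kernels in this file are
 * normally reached through the generated volk_32f_tan_32f() dispatcher after
 * including <volk/volk.h>. The buffer length and input values below are
 * illustrative only.
 *
 *   unsigned int N = 32;
 *   float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       in[ii] = 0.1f * (float)ii; // angles in radians
 *   }
 *   volk_32f_tan_32f(out, in, N);
 *   volk_free(in);
 *   volk_free(out);
 */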

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h> /* provides _vtanq_f32() */

static inline void
volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    float* bVectorPtr = bVector;
    const float* aVectorPtr = aVector;
    float32x4_t a_vec, b_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aVectorPtr);
        __VOLK_PREFETCH(aVectorPtr + 4); /* prefetch the next input vector */
        b_vec = _vtanq_f32(a_vec);
        vst1q_f32(bVectorPtr, b_vec);
        aVectorPtr += 4;
        bVectorPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *bVectorPtr++ = tanf(*aVectorPtr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_tan_32f_u_H */