63 #ifndef INCLUDED_volk_32f_sin_32f_a_H
64 #define INCLUDED_volk_32f_sin_32f_a_H
65 #ifdef LV_HAVE_AVX512F
67 #include <immintrin.h>
/*
 * volk_32f_sin_32f_a_avx512f: sine of num_points floats, AVX-512F path.
 *
 * sinVector  - output buffer; written with _mm512_store_ps, which requires a
 *              64-byte aligned address.
 * inVector   - input buffer; read with _mm512_load_ps (same alignment rule).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 16 groups of 16 lanes; the remaining
 * elements are computed one at a time with sinf().
 *
 * NOTE(review): the embedded original line numbers skip (70 -> 72, 76 -> 79,
 * 115 -> 119, ...), so this listing has dropped lines. The function braces,
 * the declaration of `i`, and any pointer advance inside the vector loop are
 * not visible here; the comments below describe only what is shown.
 */
68 static inline void volk_32f_sin_32f_a_avx512f(
float* sinVector,
69 const float* inVector,
70 unsigned int num_points)
72 float* sinPtr = sinVector;
73 const float* inPtr = inVector;
75 unsigned int number = 0;
// Number of full 16-float groups handled by the vector loop.
76 unsigned int sixteenPoints = num_points / 16;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fones, sine and cosine are used below without a visible declaration.
79 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
82 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the reduction step below.
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2, ltZero;
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as int32.
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer, converted to float.
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, applied in three steps (fnmadd(a, b, c) = c - a * b).
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
119 _mm512_set1_ps(8.0f));
// s = s^2.
120 s = _mm512_mul_ps(s, s);
// NOTE(review): inner fragment of the polynomial in s using cp5..cp2; the
// enclosing calls (and the use of cp1) are not visible in this listing.
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s). The for has no braces, so only the
// multiply below is repeated; the division by 2 runs once afterwards.
130 for (
i = 0;
i < 3;
i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
137 condition1 = _mm512_cmpneq_epi32_mask(
138 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// ltZero: lanes where the original input is negative.
139 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
// condition2: ((q & 4) != 0) XOR (aVal < 0).
140 condition2 = _mm512_kxor(
141 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
// Take cosine in condition1 lanes, otherwise keep sine.
143 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
// Multiply by -1 in condition2 lanes; other lanes are left unchanged.
144 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
145 _mm512_store_ps(sinPtr, sine);
// Scalar tail: elements past the last full group of 16.
150 number = sixteenPoints * 16;
151 for (; number < num_points; number++) {
152 *sinPtr++ = sinf(*inPtr++);
156 #if LV_HAVE_AVX2 && LV_HAVE_FMA
157 #include <immintrin.h>
/*
 * volk_32f_sin_32f_a_avx2_fma: sine of num_points floats, AVX2 + FMA path.
 *
 * bVector    - output buffer; written with _mm256_store_ps, which requires a
 *              32-byte aligned address.
 * aVector    - input buffer; read with _mm256_load_ps (same alignment rule).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 8 groups of 8 lanes; the remaining
 * elements are computed one at a time in the scalar tail loop.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing has
 * dropped lines. The return-type line, the function braces, the declaration
 * of `i`, and any pointer advance inside the vector loop are not visible
 * here; the comments below describe only what is shown.
 */
160 volk_32f_sin_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
162 float* bPtr = bVector;
163 const float* aPtr = aVector;
165 unsigned int number = 0;
// Number of full 8-float groups handled by the vector loop.
166 unsigned int eighthPoints = num_points / 8;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fzeroes is used below without a visible declaration.
169 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
171 __m256 sine, cosine, condition1, condition2;
172 __m256i q, r, ones, twos, fours;
// m4pi is approximately 4/pi; pio4A + pio4B is approximately pi/4, split in
// two parts that are subtracted one after another in the reduction step.
174 m4pi = _mm256_set1_ps(1.273239545);
175 pio4A = _mm256_set1_ps(0.78515625);
176 pio4B = _mm256_set1_ps(0.241876e-3);
177 ffours = _mm256_set1_ps(4.0);
178 ftwos = _mm256_set1_ps(2.0);
179 fones = _mm256_set1_ps(1.0);
180 fzeroes = _mm256_setzero_ps();
181 ones = _mm256_set1_epi32(1);
182 twos = _mm256_set1_epi32(2);
183 fours = _mm256_set1_epi32(4);
// Polynomial coefficients, roughly 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
185 cp1 = _mm256_set1_ps(1.0);
186 cp2 = _mm256_set1_ps(0.83333333e-1);
187 cp3 = _mm256_set1_ps(0.2777778e-2);
188 cp4 = _mm256_set1_ps(0.49603e-4);
189 cp5 = _mm256_set1_ps(0.551e-6);
191 for (; number < eighthPoints; number++) {
192 aVal = _mm256_load_ps(aPtr);
// s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0.
193 s = _mm256_sub_ps(aVal,
194 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
195 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
196 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer.
197 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two fused steps (fnmadd(a, b, c) = c - a * b).
199 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
200 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
204 _mm256_set1_ps(8.0));
// s = s^2.
205 s = _mm256_mul_ps(s, s);
// NOTE(review): inner fragment of the polynomial in s using cp5..cp2; the
// enclosing calls (and the use of cp1) are not visible in this listing.
210 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s).
// NOTE(review): the closing brace of this loop is not visible.
215 for (
i = 0;
i < 3;
i++) {
216 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
218 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
220 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
221 cosine = _mm256_sub_ps(fones, s);
// condition1 compares float((q + 1) & 2) against something.
// NOTE(review): the remaining arguments of this compare are not visible.
223 condition1 = _mm256_cmp_ps(
224 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is built from the lane masks (q & 4) != 0 and aVal < 0.
// NOTE(review): the call that combines the two masks is only partly visible.
227 condition2 = _mm256_cmp_ps(
229 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
230 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): adds (cosine - sine) in lanes where
// condition1 is all-ones, leaves sine elsewhere.
// NOTE(review): the assignment target of this expression is not visible.
237 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates sine in lanes where condition2
// is all-ones.
238 sine = _mm256_sub_ps(
239 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
240 _mm256_store_ps(bPtr, sine);
// Scalar tail: elements past the last full group of 8.
// NOTE(review): this calls the double-precision sin(), so the float argument
// is promoted and the result converted back; the AVX-512F, SSE4.1 and generic
// kernels in this file use sinf() in the same place - consider sinf().
245 number = eighthPoints * 8;
246 for (; number < num_points; number++) {
247 *bPtr++ = sin(*aPtr++);
254 #include <immintrin.h>
/*
 * volk_32f_sin_32f_a_avx2: sine of num_points floats, AVX2 path without FMA
 * (multiplies and subtracts are issued as separate intrinsics).
 *
 * bVector    - output buffer; written with _mm256_store_ps, which requires a
 *              32-byte aligned address.
 * aVector    - input buffer; read with _mm256_load_ps (same alignment rule).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 8 groups of 8 lanes; the remaining
 * elements are computed one at a time in the scalar tail loop.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing has
 * dropped lines. The return-type line, the function braces, the declaration
 * of `i`, and any pointer advance inside the vector loop are not visible
 * here; the comments below describe only what is shown.
 */
257 volk_32f_sin_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
259 float* bPtr = bVector;
260 const float* aPtr = aVector;
262 unsigned int number = 0;
// Number of full 8-float groups handled by the vector loop.
263 unsigned int eighthPoints = num_points / 8;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fzeroes is used below without a visible declaration.
266 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
268 __m256 sine, cosine, condition1, condition2;
269 __m256i q, r, ones, twos, fours;
// m4pi is approximately 4/pi; pio4A + pio4B is approximately pi/4, split in
// two parts that are subtracted one after another in the reduction step.
271 m4pi = _mm256_set1_ps(1.273239545);
272 pio4A = _mm256_set1_ps(0.78515625);
273 pio4B = _mm256_set1_ps(0.241876e-3);
274 ffours = _mm256_set1_ps(4.0);
275 ftwos = _mm256_set1_ps(2.0);
276 fones = _mm256_set1_ps(1.0);
277 fzeroes = _mm256_setzero_ps();
278 ones = _mm256_set1_epi32(1);
279 twos = _mm256_set1_epi32(2);
280 fours = _mm256_set1_epi32(4);
// Polynomial coefficients, roughly 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
282 cp1 = _mm256_set1_ps(1.0);
283 cp2 = _mm256_set1_ps(0.83333333e-1);
284 cp3 = _mm256_set1_ps(0.2777778e-2);
285 cp4 = _mm256_set1_ps(0.49603e-4);
286 cp5 = _mm256_set1_ps(0.551e-6);
288 for (; number < eighthPoints; number++) {
289 aVal = _mm256_load_ps(aPtr);
// s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0.
290 s = _mm256_sub_ps(aVal,
291 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
292 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
293 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer.
294 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two steps with separate multiply and subtract.
296 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
297 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
301 _mm256_set1_ps(8.0));
// s = s^2.
302 s = _mm256_mul_ps(s, s);
// NOTE(review): innermost fragment of the polynomial, (s * cp5 - cp4) times
// a factor that is not visible; the rest of the expression is missing here.
310 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Three rounds of s = s * (4 - s).
// NOTE(review): the closing brace of this loop is not visible.
319 for (
i = 0;
i < 3;
i++) {
320 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
322 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
324 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
325 cosine = _mm256_sub_ps(fones, s);
// condition1 compares float((q + 1) & 2) against something.
// NOTE(review): the remaining arguments of this compare are not visible.
327 condition1 = _mm256_cmp_ps(
328 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is built from the lane masks (q & 4) != 0 and aVal < 0.
// NOTE(review): the call that combines the two masks is only partly visible.
331 condition2 = _mm256_cmp_ps(
333 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
334 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): adds (cosine - sine) in lanes where
// condition1 is all-ones, leaves sine elsewhere.
// NOTE(review): the assignment target of this expression is not visible.
341 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates sine in lanes where condition2
// is all-ones.
342 sine = _mm256_sub_ps(
343 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
344 _mm256_store_ps(bPtr, sine);
// Scalar tail: elements past the last full group of 8.
// NOTE(review): this calls the double-precision sin(), so the float argument
// is promoted and the result converted back; the AVX-512F, SSE4.1 and generic
// kernels in this file use sinf() in the same place - consider sinf().
349 number = eighthPoints * 8;
350 for (; number < num_points; number++) {
351 *bPtr++ = sin(*aPtr++);
357 #ifdef LV_HAVE_SSE4_1
358 #include <smmintrin.h>
/*
 * volk_32f_sin_32f_a_sse4_1: sine of num_points floats, SSE4.1 path.
 *
 * bVector    - output buffer.
 * aVector    - input buffer.
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 4 groups; the remaining elements are
 * computed one at a time with sinf().
 *
 * NOTE(review): almost all of this function is missing from the listing
 * (the embedded line numbers jump 373 -> 392 -> 419 -> 444). The return-type
 * line, the braces, the constant setup, the whole vector-loop body and the
 * body of the 3-iteration inner loop are not visible, so neither the
 * arithmetic nor the load/store alignment can be described from here.
 */
361 volk_32f_sin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
363 float* bPtr = bVector;
364 const float* aPtr = aVector;
366 unsigned int number = 0;
// Number of full 4-float groups handled by the vector loop.
367 unsigned int quarterPoints = num_points / 4;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible.
370 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
372 __m128 sine, cosine, condition1, condition2;
373 __m128i q, r, ones, twos, fours;
// Vector loop header; its body is not visible in this listing.
392 for (; number < quarterPoints; number++) {
// Inner loop that runs 3 times; its body is not visible in this listing.
419 for (
i = 0;
i < 3;
i++) {
// Scalar tail: elements past the last full group of 4.
444 number = quarterPoints * 4;
445 for (; number < num_points; number++) {
446 *bPtr++ = sinf(*aPtr++);
455 #ifndef INCLUDED_volk_32f_sin_32f_u_H
456 #define INCLUDED_volk_32f_sin_32f_u_H
458 #ifdef LV_HAVE_AVX512F
460 #include <immintrin.h>
/*
 * volk_32f_sin_32f_u_avx512f: sine of num_points floats, AVX-512F path for
 * buffers with no alignment guarantee.
 *
 * sinVector  - output buffer; written with _mm512_storeu_ps (unaligned store).
 * inVector   - input buffer; read with _mm512_loadu_ps (unaligned load).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 16 groups of 16 lanes; the remaining
 * elements are computed one at a time with sinf().
 *
 * NOTE(review): the embedded original line numbers skip (463 -> 465,
 * 469 -> 472, 508 -> 512, ...), so this listing has dropped lines. The
 * function braces, the declaration of `i`, and any pointer advance inside the
 * vector loop are not visible here; the comments below describe only what is
 * shown.
 */
461 static inline void volk_32f_sin_32f_u_avx512f(
float* sinVector,
462 const float* inVector,
463 unsigned int num_points)
465 float* sinPtr = sinVector;
466 const float* inPtr = inVector;
468 unsigned int number = 0;
// Number of full 16-float groups handled by the vector loop.
469 unsigned int sixteenPoints = num_points / 16;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fones, sine and cosine are used below without a visible declaration.
472 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
475 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the reduction step below.
477 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
478 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
479 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
480 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
481 ffours = _mm512_set1_ps(4.0);
482 ftwos = _mm512_set1_ps(2.0);
483 fones = _mm512_set1_ps(1.0);
484 zeros = _mm512_setzero_epi32();
485 ones = _mm512_set1_epi32(1);
486 twos = _mm512_set1_epi32(2);
487 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
489 cp1 = _mm512_set1_ps(1.0);
490 cp2 = _mm512_set1_ps(0.08333333333333333);
491 cp3 = _mm512_set1_ps(0.002777777777777778);
492 cp4 = _mm512_set1_ps(4.96031746031746e-05);
493 cp5 = _mm512_set1_ps(5.511463844797178e-07);
494 __mmask16 condition1, condition2, ltZero;
496 for (; number < sixteenPoints; number++) {
497 aVal = _mm512_loadu_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
499 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as int32.
502 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer, converted to float.
504 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, applied in three steps (fnmadd(a, b, c) = c - a * b).
506 s = _mm512_fnmadd_ps(r, pio4A, s);
507 s = _mm512_fnmadd_ps(r, pio4B, s);
508 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
512 _mm512_set1_ps(8.0f));
// s = s^2.
513 s = _mm512_mul_ps(s, s);
// NOTE(review): inner fragment of the polynomial in s using cp5..cp2; the
// enclosing calls (and the use of cp1) are not visible in this listing.
518 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s). The for has no braces, so only the
// multiply below is repeated; the division by 2 runs once afterwards.
523 for (
i = 0;
i < 3;
i++)
524 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
525 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
527 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
528 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
530 condition1 = _mm512_cmpneq_epi32_mask(
531 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// ltZero: lanes where the original input is negative.
532 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
// condition2: ((q & 4) != 0) XOR (aVal < 0).
533 condition2 = _mm512_kxor(
534 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
// Take cosine in condition1 lanes, otherwise keep sine.
536 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
// Multiply by -1 in condition2 lanes; other lanes are left unchanged.
537 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
538 _mm512_storeu_ps(sinPtr, sine);
// Scalar tail: elements past the last full group of 16.
543 number = sixteenPoints * 16;
544 for (; number < num_points; number++) {
545 *sinPtr++ = sinf(*inPtr++);
550 #if LV_HAVE_AVX2 && LV_HAVE_FMA
551 #include <immintrin.h>
/*
 * volk_32f_sin_32f_u_avx2_fma: sine of num_points floats, AVX2 + FMA path
 * for buffers with no alignment guarantee.
 *
 * bVector    - output buffer; written with _mm256_storeu_ps (unaligned store).
 * aVector    - input buffer; read with _mm256_loadu_ps (unaligned load).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 8 groups of 8 lanes; the remaining
 * elements are computed one at a time in the scalar tail loop.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing has
 * dropped lines. The return-type line, the function braces, the declaration
 * of `i`, and any pointer advance inside the vector loop are not visible
 * here; the comments below describe only what is shown.
 */
554 volk_32f_sin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
556 float* bPtr = bVector;
557 const float* aPtr = aVector;
559 unsigned int number = 0;
// Number of full 8-float groups handled by the vector loop.
560 unsigned int eighthPoints = num_points / 8;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fzeroes is used below without a visible declaration.
563 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
565 __m256 sine, cosine, condition1, condition2;
566 __m256i q, r, ones, twos, fours;
// m4pi is approximately 4/pi; pio4A + pio4B is approximately pi/4, split in
// two parts that are subtracted one after another in the reduction step.
568 m4pi = _mm256_set1_ps(1.273239545);
569 pio4A = _mm256_set1_ps(0.78515625);
570 pio4B = _mm256_set1_ps(0.241876e-3);
571 ffours = _mm256_set1_ps(4.0);
572 ftwos = _mm256_set1_ps(2.0);
573 fones = _mm256_set1_ps(1.0);
574 fzeroes = _mm256_setzero_ps();
575 ones = _mm256_set1_epi32(1);
576 twos = _mm256_set1_epi32(2);
577 fours = _mm256_set1_epi32(4);
// Polynomial coefficients, roughly 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
579 cp1 = _mm256_set1_ps(1.0);
580 cp2 = _mm256_set1_ps(0.83333333e-1);
581 cp3 = _mm256_set1_ps(0.2777778e-2);
582 cp4 = _mm256_set1_ps(0.49603e-4);
583 cp5 = _mm256_set1_ps(0.551e-6);
585 for (; number < eighthPoints; number++) {
586 aVal = _mm256_loadu_ps(aPtr);
// s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0.
587 s = _mm256_sub_ps(aVal,
588 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
589 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
590 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer.
591 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two fused steps (fnmadd(a, b, c) = c - a * b).
593 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
594 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
598 _mm256_set1_ps(8.0));
// s = s^2.
599 s = _mm256_mul_ps(s, s);
// NOTE(review): inner fragment of the polynomial in s using cp5..cp2; the
// enclosing calls (and the use of cp1) are not visible in this listing.
604 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s).
// NOTE(review): the closing brace of this loop is not visible.
609 for (
i = 0;
i < 3;
i++) {
610 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
612 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
614 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
615 cosine = _mm256_sub_ps(fones, s);
// condition1 compares float((q + 1) & 2) against something.
// NOTE(review): the remaining arguments of this compare are not visible.
617 condition1 = _mm256_cmp_ps(
618 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is built from the lane masks (q & 4) != 0 and aVal < 0.
// NOTE(review): the call that combines the two masks is only partly visible.
621 condition2 = _mm256_cmp_ps(
623 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
624 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): adds (cosine - sine) in lanes where
// condition1 is all-ones, leaves sine elsewhere.
// NOTE(review): the assignment target of this expression is not visible.
631 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates sine in lanes where condition2
// is all-ones.
632 sine = _mm256_sub_ps(
633 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
634 _mm256_storeu_ps(bPtr, sine);
// Scalar tail: elements past the last full group of 8.
// NOTE(review): this calls the double-precision sin(), so the float argument
// is promoted and the result converted back; the AVX-512F, SSE4.1 and generic
// kernels in this file use sinf() in the same place - consider sinf().
639 number = eighthPoints * 8;
640 for (; number < num_points; number++) {
641 *bPtr++ = sin(*aPtr++);
648 #include <immintrin.h>
/*
 * volk_32f_sin_32f_u_avx2: sine of num_points floats, AVX2 path without FMA
 * (multiplies and subtracts are issued as separate intrinsics), for buffers
 * with no alignment guarantee.
 *
 * bVector    - output buffer; written with _mm256_storeu_ps (unaligned store).
 * aVector    - input buffer; read with _mm256_loadu_ps (unaligned load).
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 8 groups of 8 lanes; the remaining
 * elements are computed one at a time in the scalar tail loop.
 *
 * NOTE(review): the embedded original line numbers skip, so this listing has
 * dropped lines. The return-type line, the function braces, the declaration
 * of `i`, and any pointer advance inside the vector loop are not visible
 * here; the comments below describe only what is shown.
 */
651 volk_32f_sin_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
653 float* bPtr = bVector;
654 const float* aPtr = aVector;
656 unsigned int number = 0;
// Number of full 8-float groups handled by the vector loop.
657 unsigned int eighthPoints = num_points / 8;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible. fzeroes is used below without a visible declaration.
660 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
662 __m256 sine, cosine, condition1, condition2;
663 __m256i q, r, ones, twos, fours;
// m4pi is approximately 4/pi; pio4A + pio4B is approximately pi/4, split in
// two parts that are subtracted one after another in the reduction step.
665 m4pi = _mm256_set1_ps(1.273239545);
666 pio4A = _mm256_set1_ps(0.78515625);
667 pio4B = _mm256_set1_ps(0.241876e-3);
668 ffours = _mm256_set1_ps(4.0);
669 ftwos = _mm256_set1_ps(2.0);
670 fones = _mm256_set1_ps(1.0);
671 fzeroes = _mm256_setzero_ps();
672 ones = _mm256_set1_epi32(1);
673 twos = _mm256_set1_epi32(2);
674 fours = _mm256_set1_epi32(4);
// Polynomial coefficients, roughly 1, 1/12, 1/360, 1/20160, 1/1814400.
// Presumably the series of 2*(1 - cos(x)) in powers of x^2 - TODO confirm.
676 cp1 = _mm256_set1_ps(1.0);
677 cp2 = _mm256_set1_ps(0.83333333e-1);
678 cp3 = _mm256_set1_ps(0.2777778e-2);
679 cp4 = _mm256_set1_ps(0.49603e-4);
680 cp5 = _mm256_set1_ps(0.551e-6);
682 for (; number < eighthPoints; number++) {
683 aVal = _mm256_loadu_ps(aPtr);
// s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0.
684 s = _mm256_sub_ps(aVal,
685 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
686 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
687 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to an even integer.
688 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two steps with separate multiply and subtract.
690 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
691 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
// NOTE(review): tail of a call whose start is not visible; presumably s is
// divided by 8 here, matching the 3 doubling steps further down - confirm.
695 _mm256_set1_ps(8.0));
// s = s^2.
696 s = _mm256_mul_ps(s, s);
// NOTE(review): innermost fragment of the polynomial, (s * cp5 - cp4) times
// a factor that is not visible; the rest of the expression is missing here.
704 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Three rounds of s = s * (4 - s).
// NOTE(review): the closing brace of this loop is not visible.
713 for (
i = 0;
i < 3;
i++) {
714 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
716 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
718 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
719 cosine = _mm256_sub_ps(fones, s);
// condition1 compares float((q + 1) & 2) against something.
// NOTE(review): the remaining arguments of this compare are not visible.
721 condition1 = _mm256_cmp_ps(
722 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is built from the lane masks (q & 4) != 0 and aVal < 0.
// NOTE(review): the call that combines the two masks is only partly visible.
725 condition2 = _mm256_cmp_ps(
727 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
728 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): adds (cosine - sine) in lanes where
// condition1 is all-ones, leaves sine elsewhere.
// NOTE(review): the assignment target of this expression is not visible.
735 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates sine in lanes where condition2
// is all-ones.
736 sine = _mm256_sub_ps(
737 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
738 _mm256_storeu_ps(bPtr, sine);
// Scalar tail: elements past the last full group of 8.
// NOTE(review): this calls the double-precision sin(), so the float argument
// is promoted and the result converted back; the AVX-512F, SSE4.1 and generic
// kernels in this file use sinf() in the same place - consider sinf().
743 number = eighthPoints * 8;
744 for (; number < num_points; number++) {
745 *bPtr++ = sin(*aPtr++);
752 #ifdef LV_HAVE_SSE4_1
753 #include <smmintrin.h>
/*
 * volk_32f_sin_32f_u_sse4_1: sine of num_points floats, SSE4.1 path.
 *
 * bVector    - output buffer.
 * aVector    - input buffer.
 * num_points - number of floats to process.
 *
 * The vector loop covers num_points / 4 groups; the remaining elements are
 * computed one at a time with sinf().
 *
 * NOTE(review): almost all of this function is missing from the listing
 * (the embedded line numbers jump 768 -> 787 -> 814 -> 836). The return-type
 * line, the braces, the constant setup, the whole vector-loop body and the
 * body of the 3-iteration inner loop are not visible, so neither the
 * arithmetic nor the load/store alignment can be described from here.
 */
756 volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
758 float* bPtr = bVector;
759 const float* aPtr = aVector;
761 unsigned int number = 0;
// Number of full 4-float groups handled by the vector loop.
762 unsigned int quarterPoints = num_points / 4;
// NOTE(review): this declaration ends in a comma; its continuation is not
// visible.
765 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
767 __m128 sine, cosine, condition1, condition2;
768 __m128i q, r, ones, twos, fours;
// Vector loop header; its body is not visible in this listing.
787 for (; number < quarterPoints; number++) {
// Inner loop that runs 3 times; its body is not visible in this listing.
814 for (
i = 0;
i < 3;
i++) {
// Scalar tail: elements past the last full group of 4.
836 number = quarterPoints * 4;
837 for (; number < num_points; number++) {
838 *bPtr++ = sinf(*aPtr++);
845 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable scalar kernel: writes sinf(aVector[k]) to bVector[k]
 * for every k in [0, num_points).
 *
 * NOTE(review): the signature line and braces are not visible in this
 * listing. The cross-reference index at the end of the file names the
 * function at original line 848 as volk_32f_sin_32f_generic(float* bVector,
 * const float* aVector, unsigned int num_points) - confirm against the real
 * header.
 */
850 float* bPtr = bVector;
851 const float* aPtr = aVector;
// number is initialised here and again in the for-init below.
852 unsigned int number = 0;
854 for (number = 0; number < num_points; number++) {
855 *bPtr++ = sinf(*aPtr++);
863 #include <arm_neon.h>
/*
 * Body of the NEON kernel: a vector loop over num_points / 4 groups of four
 * floats, followed by a scalar sinf() loop for the remaining elements.
 *
 * NOTE(review): the signature line, the braces, the declarations of a_vec and
 * b_vec, the computation of b_vec and any pointer advance inside the vector
 * loop are not visible in this listing (the embedded line numbers skip
 * 872 -> 877 and 878 -> 882). The cross-reference index at the end of the
 * file names the function at original line 867 as volk_32f_sin_32f_neon and
 * lists a helper _vsinq_f32 - presumably what produces b_vec; confirm against
 * the real header.
 */
869 unsigned int number = 0;
// Number of full 4-float groups handled by the vector loop.
870 unsigned int quarter_points = num_points / 4;
871 float* bVectorPtr = bVector;
872 const float* aVectorPtr = aVector;
877 for (number = 0; number < quarter_points; number++) {
// Load four input floats.
878 a_vec = vld1q_f32(aVectorPtr);
// Store four result floats; how b_vec is derived from a_vec is not shown.
882 vst1q_f32(bVectorPtr, b_vec);
// Scalar tail: elements past the last full group of 4.
889 for (number = quarter_points * 4; number < num_points; number++) {
890 *bVectorPtr++ = sinf(*aVectorPtr++);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:848
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:867
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:249