63 #ifndef INCLUDED_volk_32f_cos_32f_a_H
64 #define INCLUDED_volk_32f_cos_32f_a_H
66 #ifdef LV_HAVE_AVX512F
68 #include <immintrin.h>
69 static inline void volk_32f_cos_32f_a_avx512f(
float* cosVector,
70 const float* inVector,
71 unsigned int num_points)
73 float* cosPtr = cosVector;
74 const float* inPtr = inVector;
76 unsigned int number = 0;
77 unsigned int sixteenPoints = num_points / 16;
80 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
82 __m512i q, zeros, ones, twos, fours;
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2;
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
119 _mm512_set1_ps(8.0f));
120 s = _mm512_mul_ps(s, s);
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
130 for (
i = 0;
i < 3;
i++)
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 s = _mm512_div_ps(s, ftwos);
134 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135 cosine = _mm512_sub_ps(fones, s);
138 condition1 = _mm512_cmpneq_epi32_mask(
139 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
142 condition2 = _mm512_cmpneq_epi32_mask(
143 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
144 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
145 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
146 _mm512_store_ps(cosPtr, cosine);
151 number = sixteenPoints * 16;
152 for (; number < num_points; number++) {
153 *cosPtr++ = cosf(*inPtr++);
158 #if LV_HAVE_AVX2 && LV_HAVE_FMA
159 #include <immintrin.h>
162 volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
164 float* bPtr = bVector;
165 const float* aPtr = aVector;
167 unsigned int number = 0;
168 unsigned int eighthPoints = num_points / 8;
171 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
174 __m256i q, ones, twos, fours;
176 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
177 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
178 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
179 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
180 ffours = _mm256_set1_ps(4.0);
181 ftwos = _mm256_set1_ps(2.0);
182 fones = _mm256_set1_ps(1.0);
183 fzeroes = _mm256_setzero_ps();
184 __m256i zeroes = _mm256_set1_epi32(0);
185 ones = _mm256_set1_epi32(1);
186 __m256i allones = _mm256_set1_epi32(0xffffffff);
187 twos = _mm256_set1_epi32(2);
188 fours = _mm256_set1_epi32(4);
190 cp1 = _mm256_set1_ps(1.0);
191 cp2 = _mm256_set1_ps(0.08333333333333333);
192 cp3 = _mm256_set1_ps(0.002777777777777778);
193 cp4 = _mm256_set1_ps(4.96031746031746e-05);
194 cp5 = _mm256_set1_ps(5.511463844797178e-07);
198 for (; number < eighthPoints; number++) {
200 aVal = _mm256_load_ps(aPtr);
202 s = _mm256_sub_ps(aVal,
203 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
204 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
206 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
208 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
210 s = _mm256_fnmadd_ps(r, pio4A, s);
211 s = _mm256_fnmadd_ps(r, pio4B, s);
212 s = _mm256_fnmadd_ps(r, pio4C, s);
216 _mm256_set1_ps(8.0));
217 s = _mm256_mul_ps(s, s);
222 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
227 for (
i = 0;
i < 3;
i++)
228 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
229 s = _mm256_div_ps(s, ftwos);
231 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
232 cosine = _mm256_sub_ps(fones, s);
236 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
237 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
240 condition3.int_vec = _mm256_cmpeq_epi32(
241 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
242 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
244 cosine = _mm256_add_ps(
245 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
246 cosine = _mm256_sub_ps(cosine,
247 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
248 condition3.float_vec));
249 _mm256_store_ps(bPtr, cosine);
254 number = eighthPoints * 8;
255 for (; number < num_points; number++) {
256 *bPtr++ = cos(*aPtr++);
263 #include <immintrin.h>
266 volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
268 float* bPtr = bVector;
269 const float* aPtr = aVector;
271 unsigned int number = 0;
272 unsigned int eighthPoints = num_points / 8;
275 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
278 __m256i q, ones, twos, fours;
280 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
281 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
282 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
283 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
284 ffours = _mm256_set1_ps(4.0);
285 ftwos = _mm256_set1_ps(2.0);
286 fones = _mm256_set1_ps(1.0);
287 fzeroes = _mm256_setzero_ps();
288 __m256i zeroes = _mm256_set1_epi32(0);
289 ones = _mm256_set1_epi32(1);
290 __m256i allones = _mm256_set1_epi32(0xffffffff);
291 twos = _mm256_set1_epi32(2);
292 fours = _mm256_set1_epi32(4);
294 cp1 = _mm256_set1_ps(1.0);
295 cp2 = _mm256_set1_ps(0.08333333333333333);
296 cp3 = _mm256_set1_ps(0.002777777777777778);
297 cp4 = _mm256_set1_ps(4.96031746031746e-05);
298 cp5 = _mm256_set1_ps(5.511463844797178e-07);
302 for (; number < eighthPoints; number++) {
304 aVal = _mm256_load_ps(aPtr);
306 s = _mm256_sub_ps(aVal,
307 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
308 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
310 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
312 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
314 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
315 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
316 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
320 _mm256_set1_ps(8.0));
321 s = _mm256_mul_ps(s, s);
329 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
338 for (
i = 0;
i < 3;
i++)
339 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
340 s = _mm256_div_ps(s, ftwos);
342 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
343 cosine = _mm256_sub_ps(fones, s);
347 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
348 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
351 condition3.int_vec = _mm256_cmpeq_epi32(
352 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
353 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
355 cosine = _mm256_add_ps(
356 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
357 cosine = _mm256_sub_ps(cosine,
358 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
359 condition3.float_vec));
360 _mm256_store_ps(bPtr, cosine);
365 number = eighthPoints * 8;
366 for (; number < num_points; number++) {
367 *bPtr++ = cos(*aPtr++);
373 #ifdef LV_HAVE_SSE4_1
374 #include <smmintrin.h>
377 volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
379 float* bPtr = bVector;
380 const float* aPtr = aVector;
382 unsigned int number = 0;
383 unsigned int quarterPoints = num_points / 4;
386 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
391 m4pi =
_mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
393 pio4B =
_mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
394 pio4C =
_mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
413 for (; number < quarterPoints; number++) {
445 for (
i = 0;
i < 3;
i++)
455 condition1.int_vec =
_mm_xor_si128(allones, condition1.int_vec);
460 condition3.int_vec =
_mm_xor_si128(allones, condition3.int_vec);
472 number = quarterPoints * 4;
473 for (; number < num_points; number++) {
474 *bPtr++ = cosf(*aPtr++);
483 #ifndef INCLUDED_volk_32f_cos_32f_u_H
484 #define INCLUDED_volk_32f_cos_32f_u_H
486 #ifdef LV_HAVE_AVX512F
488 #include <immintrin.h>
489 static inline void volk_32f_cos_32f_u_avx512f(
float* cosVector,
490 const float* inVector,
491 unsigned int num_points)
493 float* cosPtr = cosVector;
494 const float* inPtr = inVector;
496 unsigned int number = 0;
497 unsigned int sixteenPoints = num_points / 16;
500 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
502 __m512i q, zeros, ones, twos, fours;
504 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
505 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
506 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
507 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
508 ffours = _mm512_set1_ps(4.0);
509 ftwos = _mm512_set1_ps(2.0);
510 fones = _mm512_set1_ps(1.0);
511 zeros = _mm512_setzero_epi32();
512 ones = _mm512_set1_epi32(1);
513 twos = _mm512_set1_epi32(2);
514 fours = _mm512_set1_epi32(4);
516 cp1 = _mm512_set1_ps(1.0);
517 cp2 = _mm512_set1_ps(0.08333333333333333);
518 cp3 = _mm512_set1_ps(0.002777777777777778);
519 cp4 = _mm512_set1_ps(4.96031746031746e-05);
520 cp5 = _mm512_set1_ps(5.511463844797178e-07);
521 __mmask16 condition1, condition2;
522 for (; number < sixteenPoints; number++) {
523 aVal = _mm512_loadu_ps(inPtr);
525 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
528 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
530 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
532 s = _mm512_fnmadd_ps(r, pio4A, s);
533 s = _mm512_fnmadd_ps(r, pio4B, s);
534 s = _mm512_fnmadd_ps(r, pio4C, s);
538 _mm512_set1_ps(8.0f));
539 s = _mm512_mul_ps(s, s);
544 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
549 for (
i = 0;
i < 3;
i++)
550 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
551 s = _mm512_div_ps(s, ftwos);
553 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
554 cosine = _mm512_sub_ps(fones, s);
557 condition1 = _mm512_cmpneq_epi32_mask(
558 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
561 condition2 = _mm512_cmpneq_epi32_mask(
562 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
564 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
565 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
566 _mm512_storeu_ps(cosPtr, cosine);
571 number = sixteenPoints * 16;
572 for (; number < num_points; number++) {
573 *cosPtr++ = cosf(*inPtr++);
578 #if LV_HAVE_AVX2 && LV_HAVE_FMA
579 #include <immintrin.h>
582 volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
584 float* bPtr = bVector;
585 const float* aPtr = aVector;
587 unsigned int number = 0;
588 unsigned int eighthPoints = num_points / 8;
591 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
594 __m256i q, ones, twos, fours;
596 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
597 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
598 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
599 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
600 ffours = _mm256_set1_ps(4.0);
601 ftwos = _mm256_set1_ps(2.0);
602 fones = _mm256_set1_ps(1.0);
603 fzeroes = _mm256_setzero_ps();
604 __m256i zeroes = _mm256_set1_epi32(0);
605 ones = _mm256_set1_epi32(1);
606 __m256i allones = _mm256_set1_epi32(0xffffffff);
607 twos = _mm256_set1_epi32(2);
608 fours = _mm256_set1_epi32(4);
610 cp1 = _mm256_set1_ps(1.0);
611 cp2 = _mm256_set1_ps(0.08333333333333333);
612 cp3 = _mm256_set1_ps(0.002777777777777778);
613 cp4 = _mm256_set1_ps(4.96031746031746e-05);
614 cp5 = _mm256_set1_ps(5.511463844797178e-07);
618 for (; number < eighthPoints; number++) {
620 aVal = _mm256_loadu_ps(aPtr);
622 s = _mm256_sub_ps(aVal,
623 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
624 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
626 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
628 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
630 s = _mm256_fnmadd_ps(r, pio4A, s);
631 s = _mm256_fnmadd_ps(r, pio4B, s);
632 s = _mm256_fnmadd_ps(r, pio4C, s);
636 _mm256_set1_ps(8.0));
637 s = _mm256_mul_ps(s, s);
642 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
647 for (
i = 0;
i < 3;
i++)
648 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
649 s = _mm256_div_ps(s, ftwos);
651 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
652 cosine = _mm256_sub_ps(fones, s);
656 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
657 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
660 condition3.int_vec = _mm256_cmpeq_epi32(
661 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
662 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
664 cosine = _mm256_add_ps(
665 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
666 cosine = _mm256_sub_ps(cosine,
667 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
668 condition3.float_vec));
669 _mm256_storeu_ps(bPtr, cosine);
674 number = eighthPoints * 8;
675 for (; number < num_points; number++) {
676 *bPtr++ = cos(*aPtr++);
683 #include <immintrin.h>
686 volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
688 float* bPtr = bVector;
689 const float* aPtr = aVector;
691 unsigned int number = 0;
692 unsigned int eighthPoints = num_points / 8;
695 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
698 __m256i q, ones, twos, fours;
700 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
701 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
702 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
703 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
704 ffours = _mm256_set1_ps(4.0);
705 ftwos = _mm256_set1_ps(2.0);
706 fones = _mm256_set1_ps(1.0);
707 fzeroes = _mm256_setzero_ps();
708 __m256i zeroes = _mm256_set1_epi32(0);
709 ones = _mm256_set1_epi32(1);
710 __m256i allones = _mm256_set1_epi32(0xffffffff);
711 twos = _mm256_set1_epi32(2);
712 fours = _mm256_set1_epi32(4);
714 cp1 = _mm256_set1_ps(1.0);
715 cp2 = _mm256_set1_ps(0.08333333333333333);
716 cp3 = _mm256_set1_ps(0.002777777777777778);
717 cp4 = _mm256_set1_ps(4.96031746031746e-05);
718 cp5 = _mm256_set1_ps(5.511463844797178e-07);
722 for (; number < eighthPoints; number++) {
724 aVal = _mm256_loadu_ps(aPtr);
726 s = _mm256_sub_ps(aVal,
727 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
728 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
730 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
732 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
734 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
735 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
736 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
740 _mm256_set1_ps(8.0));
741 s = _mm256_mul_ps(s, s);
749 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
758 for (
i = 0;
i < 3;
i++)
759 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
760 s = _mm256_div_ps(s, ftwos);
762 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
763 cosine = _mm256_sub_ps(fones, s);
767 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
768 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
771 condition3.int_vec = _mm256_cmpeq_epi32(
772 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
773 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
775 cosine = _mm256_add_ps(
776 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
777 cosine = _mm256_sub_ps(cosine,
778 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
779 condition3.float_vec));
780 _mm256_storeu_ps(bPtr, cosine);
785 number = eighthPoints * 8;
786 for (; number < num_points; number++) {
787 *bPtr++ = cos(*aPtr++);
793 #ifdef LV_HAVE_SSE4_1
794 #include <smmintrin.h>
797 volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
799 float* bPtr = bVector;
800 const float* aPtr = aVector;
802 unsigned int number = 0;
803 unsigned int quarterPoints = num_points / 4;
806 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
808 __m128 sine, cosine, condition1, condition3;
809 __m128i q, r, ones, twos, fours;
828 for (; number < quarterPoints; number++) {
855 for (
i = 0;
i < 3;
i++) {
877 number = quarterPoints * 4;
878 for (; number < num_points; number++) {
879 *bPtr++ = cosf(*aPtr++);
886 #ifdef LV_HAVE_GENERIC
894 const float* aVector,
895 unsigned int num_points)
897 float* bPtr = bVector;
898 const float* aPtr = aVector;
900 float m4pi = 1.273239544735162542821171882678754627704620361328125;
901 float pio4A = 0.7853981554508209228515625;
902 float pio4B = 0.794662735614792836713604629039764404296875e-8;
903 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
907 for (number = 0; number < num_points; number++) {
908 float s = fabs(*aPtr);
909 int q = (int)(s * m4pi);
917 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
922 for (
i = 0;
i < N; ++
i) {
927 float sine = sqrt((2.0 - s) * s);
928 float cosine = 1 - s;
930 if (((q + 1) & 2) != 0) {
935 if (((q + 2) & 4) != 0) {
947 #ifdef LV_HAVE_GENERIC
952 float* bPtr = bVector;
953 const float* aPtr = aVector;
954 unsigned int number = 0;
956 for (; number < num_points; number++) {
957 *bPtr++ = cosf(*aPtr++);
965 #include <arm_neon.h>
971 unsigned int number = 0;
972 unsigned int quarter_points = num_points / 4;
973 float* bVectorPtr = bVector;
974 const float* aVectorPtr = aVector;
979 for (number = 0; number < quarter_points; number++) {
980 a_vec = vld1q_f32(aVectorPtr);
984 vst1q_f32(bVectorPtr, b_vec);
991 for (number = quarter_points * 4; number < num_points; number++) {
992 *bVectorPtr++ = cosf(*aVectorPtr++);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i)
Definition: sse2neon.h:3275
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:6458
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
Definition: volk_common.h:120
Definition: volk_common.h:137
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:950
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:969
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:893
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:255