45 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
46 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
54 #ifdef LV_HAVE_GENERIC
60 unsigned int num_points)
63 float* res = (float*)result;
64 float* in = (float*)input;
65 float* tp = (float*)taps;
66 unsigned int n_2_ccomplex_blocks = num_points / 2;
68 float sum0[2] = { 0, 0 };
69 float sum1[2] = { 0, 0 };
72 for (i = 0; i < n_2_ccomplex_blocks; ++i) {
73 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
74 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
75 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
76 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
82 res[0] = sum0[0] + sum1[0];
83 res[1] = sum0[1] + sum1[1];
87 *result += input[num_points - 1] * taps[num_points - 1];
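/* Hedged usage sketch (not part of the header): calling the generic kernel defined
 * above for three points. It computes result = sum_k input[k] * taps[k]; the example
 * values and the wrapper function name are illustrative only. */
static void example_dot_prod(void)
{
    lv_32fc_t input[3] = { lv_cmake(1.f, 2.f), lv_cmake(3.f, 4.f), lv_cmake(5.f, 6.f) };
    lv_32fc_t taps[3] = { lv_cmake(1.f, 0.f), lv_cmake(0.f, 1.f), lv_cmake(1.f, 1.f) };
    lv_32fc_t result;

    volk_32fc_x2_dot_prod_32fc_generic(&result, input, taps, 3);
    /* (1+2i)*(1+0i) + (3+4i)*(0+1i) + (5+6i)*(1+1i)
     *   = (1+2i) + (-4+3i) + (-1+11i) = -4 + 16i */
}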
94 #if LV_HAVE_SSE && LV_HAVE_64
96 static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
97 const lv_32fc_t* input,
98 const lv_32fc_t* taps,
99 unsigned int num_points)
102 const unsigned int num_bytes = num_points * 8;
103 unsigned int isodd = num_points & 1;
106 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
107 "# const float *taps, unsigned num_bytes)\n\t"
108 "# float sum0 = 0;\n\t"
109 "# float sum1 = 0;\n\t"
110 "# float sum2 = 0;\n\t"
111 "# float sum3 = 0;\n\t"
113 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
114 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
115 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
116 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
119 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
120 "# result[0] = sum0 + sum2;\n\t"
121 "# result[1] = sum1 + sum3;\n\t"
122 "# TODO: prefetch and better scheduling\n\t"
123 " xor %%r9, %%r9\n\t"
124 " xor %%r10, %%r10\n\t"
125 " movq %%rcx, %%rax\n\t"
126 " movq %%rcx, %%r8\n\t"
127 " movq %[rsi], %%r9\n\t"
128 " movq %[rdx], %%r10\n\t"
129 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
130 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
131 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
133 " jmp .%=L1_test\n\t"
134 " # 4 taps / loop\n\t"
135 " # something like ?? cycles / loop\n\t"
137 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
138 "# movups (%%r9), %%xmmA\n\t"
139 "# movups (%%r10), %%xmmB\n\t"
140 "# movups %%xmmA, %%xmmZ\n\t"
141 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
142 "# mulps %%xmmB, %%xmmA\n\t"
143 "# mulps %%xmmZ, %%xmmB\n\t"
144 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
145 "# xorps %%xmmPN, %%xmmA\n\t"
146 "# movups %%xmmA, %%xmmZ\n\t"
147 "# unpcklps %%xmmB, %%xmmA\n\t"
148 "# unpckhps %%xmmB, %%xmmZ\n\t"
149 "# movups %%xmmZ, %%xmmY\n\t"
150 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
151 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
152 "# addps %%xmmZ, %%xmmA\n\t"
153 "# addps %%xmmA, %%xmmC\n\t"
154 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
155 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
156 " movups 0(%%r9), %%xmm0\n\t"
157 " movups 16(%%r9), %%xmm1\n\t"
158 " movups %%xmm0, %%xmm4\n\t"
159 " movups 0(%%r10), %%xmm2\n\t"
160 " mulps %%xmm2, %%xmm0\n\t"
161 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
162 " movups 16(%%r10), %%xmm3\n\t"
163 " movups %%xmm1, %%xmm5\n\t"
164 " addps %%xmm0, %%xmm6\n\t"
165 " mulps %%xmm3, %%xmm1\n\t"
166 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
167 " addps %%xmm1, %%xmm6\n\t"
168 " mulps %%xmm4, %%xmm2\n\t"
169 " addps %%xmm2, %%xmm7\n\t"
170 " mulps %%xmm5, %%xmm3\n\t"
172 " addps %%xmm3, %%xmm7\n\t"
173 " add $32, %%r10\n\t"
177 " # We've handled the bulk of multiplies up to here.\n\t"
178 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
179 " # If so, we've got 2 more taps to do.\n\t"
182 " # The count was odd, do 2 more taps.\n\t"
183 " movups 0(%%r9), %%xmm0\n\t"
184 " movups %%xmm0, %%xmm4\n\t"
185 " movups 0(%%r10), %%xmm2\n\t"
186 " mulps %%xmm2, %%xmm0\n\t"
187 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
188 " addps %%xmm0, %%xmm6\n\t"
189 " mulps %%xmm4, %%xmm2\n\t"
190 " addps %%xmm2, %%xmm7\n\t"
192 " # neg inversor\n\t"
193 " xorps %%xmm1, %%xmm1\n\t"
194 " mov $0x80000000, %%r9\n\t"
195 " movd %%r9, %%xmm1\n\t"
196 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
198 " xorps %%xmm1, %%xmm6\n\t"
199 " movups %%xmm6, %%xmm2\n\t"
200 " unpcklps %%xmm7, %%xmm6\n\t"
201 " unpckhps %%xmm7, %%xmm2\n\t"
202 " movups %%xmm2, %%xmm3\n\t"
203 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
204 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
205 " addps %%xmm2, %%xmm6\n\t"
206 " # xmm6 = r1 i2 r3 i4\n\t"
207 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
208 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
209 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
212 : [rsi]
"r"(input), [rdx]
"r"(taps),
"c"(num_bytes), [rdi]
"r"(result)
213 :
"rax",
"r8",
"r9",
"r10");
217 *result += input[num_points - 1] * taps[num_points - 1];
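/* Hedged intrinsics sketch (illustrative, not part of VOLK) of the "swap internals"
 * step that the asm comments above describe: two complex samples per __m128
 * (interleaved re,im,re,im), two accumulators, combined once after the loop by the
 * sign-flip and horizontal add seen in the asm epilogue. */
#include <xmmintrin.h>

/* acc_rr gathers ar*br and ai*bi, acc_ri gathers ai*br and ar*bi; afterwards
 * real = sum(ar*br) - sum(ai*bi) and imag = sum(ai*br) + sum(ar*bi). */
static inline void ccmac_step(__m128* acc_rr, __m128* acc_ri, const float* in, const float* tp)
{
    __m128 a = _mm_loadu_ps(in);                /* ar0 ai0 ar1 ai1 */
    __m128 b = _mm_loadu_ps(tp);                /* br0 bi0 br1 bi1 */
    __m128 a_swap = _mm_shuffle_ps(a, a, 0xB1); /* ai0 ar0 ai1 ar1 */
    *acc_rr = _mm_add_ps(*acc_rr, _mm_mul_ps(a, b));
    *acc_ri = _mm_add_ps(*acc_ri, _mm_mul_ps(a_swap, b));
}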
228 #include <pmmintrin.h>
233 unsigned int num_points)
237 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
239 unsigned int number = 0;
240 const unsigned int halfPoints = num_points / 2;
241 unsigned int isodd = num_points & 1;
243 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
250 for (; number < halfPoints; number++) {
279 dotProduct += (dotProductVector[0] + dotProductVector[1]);
282 dotProduct += input[num_points - 1] * taps[num_points - 1];
285 *result = dotProduct;
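/* Hedged sketch (helper name cmul_ps_sse3 is illustrative) of the SSE3 complex
 * multiply the kernel above performs per __m128, i.e. two interleaved complex
 * samples at a time, using moveldup/movehdup/addsub. */
#include <pmmintrin.h>

/* x = [ar ai ...], y = [br bi ...]; returns [ar*br - ai*bi, ai*br + ar*bi, ...]. */
static inline __m128 cmul_ps_sse3(__m128 x, __m128 y)
{
    __m128 yl = _mm_moveldup_ps(y);          /* br br ... (duplicate reals) */
    __m128 yh = _mm_movehdup_ps(y);          /* bi bi ... (duplicate imags) */
    __m128 t1 = _mm_mul_ps(x, yl);           /* ar*br, ai*br */
    __m128 xs = _mm_shuffle_ps(x, x, 0xB1);  /* swap re/im within each pair */
    __m128 t2 = _mm_mul_ps(xs, yh);          /* ai*bi, ar*bi */
    return _mm_addsub_ps(t1, t2);            /* even lanes: t1-t2, odd lanes: t1+t2 */
}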
376 #include <immintrin.h>
381 unsigned int num_points)
384 unsigned int isodd = num_points & 3;
387 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
389 unsigned int number = 0;
390 const unsigned int quarterPoints = num_points / 4;
392 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
397 dotProdVal = _mm256_setzero_ps();
399 for (; number < quarterPoints; number++) {
400 x = _mm256_loadu_ps((float*)a);
401 y = _mm256_loadu_ps((float*)b);
403 yl = _mm256_moveldup_ps(y);
404 yh = _mm256_movehdup_ps(y);
406 tmp1 = _mm256_mul_ps(x, yl);
408 x = _mm256_shuffle_ps(x, x, 0xB1);
410 tmp2 = _mm256_mul_ps(x, yh);
412 z = _mm256_addsub_ps(tmp1, tmp2); // ar*br-ai*bi, ar*bi+ai*br
415 dotProdVal = _mm256_add_ps(dotProdVal, z);
424 _mm256_storeu_ps((float*)dotProductVector, dotProdVal);
427 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
428 dotProductVector[3]);
430 for (i = num_points - isodd; i < num_points; i++) {
431 dotProduct += input[i] * taps[i];
434 *result = dotProduct;
439 #if LV_HAVE_AVX && LV_HAVE_FMA
440 #include <immintrin.h>
442 static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
443 const lv_32fc_t* input,
444 const lv_32fc_t* taps,
445 unsigned int num_points)
448 unsigned int isodd = num_points & 3;
451 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
453 unsigned int number = 0;
454 const unsigned int quarterPoints = num_points / 4;
456 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
461 dotProdVal = _mm256_setzero_ps();
463 for (; number < quarterPoints; number++) {
465 x = _mm256_loadu_ps((float*)a);
466 y = _mm256_loadu_ps((float*)b);
468 yl = _mm256_moveldup_ps(y);
469 yh = _mm256_movehdup_ps(y);
473 x = _mm256_shuffle_ps(x, x, 0xB1);
475 tmp2 = _mm256_mul_ps(x, yh);
477 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*br-ai*bi, ar*bi+ai*br
480 dotProdVal = _mm256_add_ps(dotProdVal, z);
489 _mm256_storeu_ps((float*)dotProductVector, dotProdVal);
492 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
493 dotProductVector[3]);
495 for (i = num_points - isodd; i < num_points; i++) {
496 dotProduct += input[i] * taps[i];
499 *result = dotProduct;
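/* Hedged sketch (helper name cmul_ps_avx_fma is illustrative) of how the _fma
 * variants fold the first multiply and the add/sub into one fused instruction;
 * the kernel above does the same on whole 256-bit vectors and accumulates into
 * dotProdVal. */
#include <immintrin.h>

/* x = [ar ai ...], y = [br bi ...]; _mm256_fmaddsub_ps computes x*yl - t2 in even
 * lanes and x*yl + t2 in odd lanes, which is exactly the complex product. */
static inline __m256 cmul_ps_avx_fma(__m256 x, __m256 y)
{
    __m256 yl = _mm256_moveldup_ps(y);          /* br br ... */
    __m256 yh = _mm256_movehdup_ps(y);          /* bi bi ... */
    __m256 xs = _mm256_shuffle_ps(x, x, 0xB1);  /* ai ar ... */
    __m256 t2 = _mm256_mul_ps(xs, yh);          /* ai*bi, ar*bi ... */
    return _mm256_fmaddsub_ps(x, yl, t2);
}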
506 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
507 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
515 #ifdef LV_HAVE_GENERIC
521 unsigned int num_points)
524 const unsigned int num_bytes = num_points * 8;
526 float* res = (float*)result;
527 float* in = (float*)input;
528 float* tp = (float*)taps;
529 unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
531 float sum0[2] = { 0, 0 };
532 float sum1[2] = { 0, 0 };
535 for (i = 0; i < n_2_ccomplex_blocks; ++i) {
536 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
537 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
538 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
539 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
545 res[0] = sum0[0] + sum1[0];
546 res[1] = sum0[1] + sum1[1];
548 if (num_points & 1) {
549 *result += input[num_points - 1] * taps[num_points - 1];
556 #if LV_HAVE_SSE && LV_HAVE_64
559 static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
560 const lv_32fc_t* input,
561 const lv_32fc_t* taps,
562 unsigned int num_points)
565 const unsigned int num_bytes = num_points * 8;
566 unsigned int isodd = num_points & 1;
569 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
570 "# const float *taps, unsigned num_bytes)\n\t"
571 "# float sum0 = 0;\n\t"
572 "# float sum1 = 0;\n\t"
573 "# float sum2 = 0;\n\t"
574 "# float sum3 = 0;\n\t"
576 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
577 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
578 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
579 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
582 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
583 "# result[0] = sum0 + sum2;\n\t"
584 "# result[1] = sum1 + sum3;\n\t"
585 "# TODO: prefetch and better scheduling\n\t"
586 " xor %%r9, %%r9\n\t"
587 " xor %%r10, %%r10\n\t"
588 " movq %%rcx, %%rax\n\t"
589 " movq %%rcx, %%r8\n\t"
590 " movq %[rsi], %%r9\n\t"
591 " movq %[rdx], %%r10\n\t"
592 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
593 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
594 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
596 " jmp .%=L1_test\n\t"
597 " # 4 taps / loop\n\t"
598 " # something like ?? cycles / loop\n\t"
600 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
601 "# movaps (%%r9), %%xmmA\n\t"
602 "# movaps (%%r10), %%xmmB\n\t"
603 "# movaps %%xmmA, %%xmmZ\n\t"
604 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
605 "# mulps %%xmmB, %%xmmA\n\t"
606 "# mulps %%xmmZ, %%xmmB\n\t"
607 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
608 "# xorps %%xmmPN, %%xmmA\n\t"
609 "# movaps %%xmmA, %%xmmZ\n\t"
610 "# unpcklps %%xmmB, %%xmmA\n\t"
611 "# unpckhps %%xmmB, %%xmmZ\n\t"
612 "# movaps %%xmmZ, %%xmmY\n\t"
613 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
614 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
615 "# addps %%xmmZ, %%xmmA\n\t"
616 "# addps %%xmmA, %%xmmC\n\t"
617 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
618 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
619 " movaps 0(%%r9), %%xmm0\n\t"
620 " movaps 16(%%r9), %%xmm1\n\t"
621 " movaps %%xmm0, %%xmm4\n\t"
622 " movaps 0(%%r10), %%xmm2\n\t"
623 " mulps %%xmm2, %%xmm0\n\t"
624 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
625 " movaps 16(%%r10), %%xmm3\n\t"
626 " movaps %%xmm1, %%xmm5\n\t"
627 " addps %%xmm0, %%xmm6\n\t"
628 " mulps %%xmm3, %%xmm1\n\t"
629 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
630 " addps %%xmm1, %%xmm6\n\t"
631 " mulps %%xmm4, %%xmm2\n\t"
632 " addps %%xmm2, %%xmm7\n\t"
633 " mulps %%xmm5, %%xmm3\n\t"
635 " addps %%xmm3, %%xmm7\n\t"
636 " add $32, %%r10\n\t"
640 " # We've handled the bulk of multiplies up to here.\n\t"
641 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
642 " # If so, we've got 2 more taps to do.\n\t"
645 " # The count was odd, do 2 more taps.\n\t"
646 " movaps 0(%%r9), %%xmm0\n\t"
647 " movaps %%xmm0, %%xmm4\n\t"
648 " movaps 0(%%r10), %%xmm2\n\t"
649 " mulps %%xmm2, %%xmm0\n\t"
650 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
651 " addps %%xmm0, %%xmm6\n\t"
652 " mulps %%xmm4, %%xmm2\n\t"
653 " addps %%xmm2, %%xmm7\n\t"
655 " # neg inversor\n\t"
656 " xorps %%xmm1, %%xmm1\n\t"
657 " mov $0x80000000, %%r9\n\t"
658 " movd %%r9, %%xmm1\n\t"
659 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
661 " xorps %%xmm1, %%xmm6\n\t"
662 " movaps %%xmm6, %%xmm2\n\t"
663 " unpcklps %%xmm7, %%xmm6\n\t"
664 " unpckhps %%xmm7, %%xmm2\n\t"
665 " movaps %%xmm2, %%xmm3\n\t"
666 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
667 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
668 " addps %%xmm2, %%xmm6\n\t"
669 " # xmm6 = r1 i2 r3 i4\n\t"
670 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
671 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
672 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
675 : [rsi]
"r"(input), [rdx]
"r"(taps),
"c"(num_bytes), [rdi]
"r"(result)
676 :
"rax",
"r8",
"r9",
"r10");
680 *result += input[num_points - 1] * taps[num_points - 1];
688 #if LV_HAVE_SSE && LV_HAVE_32
690 static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
691 const lv_32fc_t* input,
692 const lv_32fc_t* taps,
693 unsigned int num_points)
699 const unsigned int num_bytes = num_points*8;
700 unsigned int isodd = num_points & 1;
705 " #movl %%esp, %%ebp\n\t"
706 " movl 12(%%ebp), %%eax # input\n\t"
707 " movl 16(%%ebp), %%edx # taps\n\t"
708 " movl 20(%%ebp), %%ecx # n_bytes\n\t"
709 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
710 " movaps 0(%%eax), %%xmm0\n\t"
711 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
712 " movaps 0(%%edx), %%xmm2\n\t"
713 " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
714 " jmp .%=L1_test\n\t"
715 " # 4 taps / loop\n\t"
716 " # something like ?? cycles / loop\n\t"
718 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
719 "# movaps (%%eax), %%xmmA\n\t"
720 "# movaps (%%edx), %%xmmB\n\t"
721 "# movaps %%xmmA, %%xmmZ\n\t"
722 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
723 "# mulps %%xmmB, %%xmmA\n\t"
724 "# mulps %%xmmZ, %%xmmB\n\t"
725 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
726 "# xorps %%xmmPN, %%xmmA\n\t"
727 "# movaps %%xmmA, %%xmmZ\n\t"
728 "# unpcklps %%xmmB, %%xmmA\n\t"
729 "# unpckhps %%xmmB, %%xmmZ\n\t"
730 "# movaps %%xmmZ, %%xmmY\n\t"
731 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
732 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
733 "# addps %%xmmZ, %%xmmA\n\t"
734 "# addps %%xmmA, %%xmmC\n\t"
735 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
736 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
737 " movaps 16(%%eax), %%xmm1\n\t"
738 " movaps %%xmm0, %%xmm4\n\t"
739 " mulps %%xmm2, %%xmm0\n\t"
740 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
741 " movaps 16(%%edx), %%xmm3\n\t"
742 " movaps %%xmm1, %%xmm5\n\t"
743 " addps %%xmm0, %%xmm6\n\t"
744 " mulps %%xmm3, %%xmm1\n\t"
745 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
746 " addps %%xmm1, %%xmm6\n\t"
747 " mulps %%xmm4, %%xmm2\n\t"
748 " movaps 32(%%eax), %%xmm0\n\t"
749 " addps %%xmm2, %%xmm7\n\t"
750 " mulps %%xmm5, %%xmm3\n\t"
751 " addl $32, %%eax\n\t"
752 " movaps 32(%%edx), %%xmm2\n\t"
753 " addps %%xmm3, %%xmm7\n\t"
754 " addl $32, %%edx\n\t"
758 " # We've handled the bulk of multiplies up to here.\n\t"
759 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
760 " # If so, we've got 2 more taps to do.\n\t"
761 " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
762 " shrl $4, %%ecx\n\t"
763 " andl $1, %%ecx\n\t"
765 " # The count was odd, do 2 more taps.\n\t"
766 " # Note that we've already got mm0/mm2 preloaded\n\t"
767 " # from the main loop.\n\t"
768 " movaps %%xmm0, %%xmm4\n\t"
769 " mulps %%xmm2, %%xmm0\n\t"
770 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
771 " addps %%xmm0, %%xmm6\n\t"
772 " mulps %%xmm4, %%xmm2\n\t"
773 " addps %%xmm2, %%xmm7\n\t"
775 " # neg inversor\n\t"
776 " movl 8(%%ebp), %%eax \n\t"
777 " xorps %%xmm1, %%xmm1\n\t"
778 " movl $0x80000000, (%%eax)\n\t"
779 " movss (%%eax), %%xmm1\n\t"
780 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
782 " xorps %%xmm1, %%xmm6\n\t"
783 " movaps %%xmm6, %%xmm2\n\t"
784 " unpcklps %%xmm7, %%xmm6\n\t"
785 " unpckhps %%xmm7, %%xmm2\n\t"
786 " movaps %%xmm2, %%xmm3\n\t"
787 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
788 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
789 " addps %%xmm2, %%xmm6\n\t"
790 " # xmm6 = r1 i2 r3 i4\n\t"
791 " #movl 8(%%ebp), %%eax # @result\n\t"
792 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
793 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
794 " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
798 : "eax", "ecx", "edx");
802 int getem = num_bytes % 16;
805 *result += (input[num_points - 1] * taps[num_points - 1]);
816 #include <pmmintrin.h>
821 unsigned int num_points)
824 const unsigned int num_bytes = num_points * 8;
825 unsigned int isodd = num_points & 1;
828 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
830 unsigned int number = 0;
831 const unsigned int halfPoints = num_bytes >> 4;
833 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
840 for (; number < halfPoints; number++) {
869 dotProduct += (dotProductVector[0] + dotProductVector[1]);
872 dotProduct += input[num_points - 1] * taps[num_points - 1];
875 *result = dotProduct;
966 #include <arm_neon.h>
971 unsigned int num_points)
974 unsigned int quarter_points = num_points / 4;
981 float32x4x2_t a_val, b_val, c_val, accumulator;
982 float32x4x2_t tmp_real, tmp_imag;
983 accumulator.val[0] = vdupq_n_f32(0);
984 accumulator.val[1] = vdupq_n_f32(0);
986 for (number = 0; number < quarter_points; ++number) {
987 a_val = vld2q_f32((float*)a_ptr);
988 b_val = vld2q_f32((float*)b_ptr);
994 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
996 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
1000 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
1002 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
1004 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
1005 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
1007 accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
1008 accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
1014 vst2q_f32((float*)accum_result, accumulator);
1015 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
1018 for (number = quarter_points * 4; number < num_points; ++number) {
1019 *result += (*a_ptr++) * (*b_ptr++);
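/* Hedged sketch (helper name is illustrative): the NEON kernels rely on vld2q_f32
 * deinterleaving the complex stream so that val[0] holds four real parts and
 * val[1] four imaginary parts, which is why the four vmulq_f32 calls above map
 * directly onto the ar*br, ai*bi, ar*bi and ai*br terms. */
#include <arm_neon.h>

/* Loads 4 interleaved complex floats as planar vectors:
 * out.val[0] = {re0, re1, re2, re3}, out.val[1] = {im0, im1, im2, im3}. */
static inline float32x4x2_t load_complex_planar(const lv_32fc_t* p)
{
    return vld2q_f32((const float*)p);
}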
1025 #include <arm_neon.h>
1029 unsigned int num_points)
1032 unsigned int quarter_points = num_points / 4;
1033 unsigned int number;
1039 float32x4x2_t a_val, b_val, accumulator;
1040 float32x4x2_t tmp_imag;
1041 accumulator.val[0] = vdupq_n_f32(0);
1042 accumulator.val[1] = vdupq_n_f32(0);
1044 for (number = 0; number < quarter_points; ++number) {
1045 a_val = vld2q_f32((float*)a_ptr);
1046 b_val = vld2q_f32((float*)b_ptr);
1051 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
1052 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
1055 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
1056 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
1058 accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
1059 accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
1066 vst2q_f32((float*)accum_result, accumulator);
1067 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
1070 for (number = quarter_points * 4; number < num_points; ++number) {
1071 *result += (*a_ptr++) * (*b_ptr++);
1080 unsigned int num_points)
1083 unsigned int quarter_points = num_points / 4;
1084 unsigned int number;
1090 float32x4x2_t a_val, b_val, accumulator1, accumulator2;
1091 accumulator1.val[0] = vdupq_n_f32(0);
1092 accumulator1.val[1] = vdupq_n_f32(0);
1093 accumulator2.val[0] = vdupq_n_f32(0);
1094 accumulator2.val[1] = vdupq_n_f32(0);
1096 for (number = 0; number < quarter_points; ++number) {
1097 a_val = vld2q_f32((float*)a_ptr);
1098 b_val = vld2q_f32((float*)b_ptr);
1103 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
1104 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
1105 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
1106 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
1111 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
1112 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
1114 vst2q_f32((float*)accum_result, accumulator1);
1115 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
1118 for (number = quarter_points * 4; number < num_points; ++number) {
1119 *result += (*a_ptr++) * (*b_ptr++);
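/* Hedged scalar sketch (function name is illustrative) of what the two accumulator
 * pairs in the loop above compute per lane: acc1 takes ar*br and ar*bi via vmlaq,
 * acc2 takes -ai*bi via vmlsq and ai*br via vmlaq, and the pairs are summed at the
 * end, which is just the expanded complex multiply-accumulate. */
static inline lv_32fc_t cmac_scalar(lv_32fc_t acc, lv_32fc_t a, lv_32fc_t b)
{
    float ar = lv_creal(a), ai = lv_cimag(a);
    float br = lv_creal(b), bi = lv_cimag(b);
    return lv_cmake(lv_creal(acc) + (ar * br - ai * bi),
                    lv_cimag(acc) + (ar * bi + ai * br));
}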
1128 unsigned int num_points)
1133 unsigned int quarter_points = num_points / 8;
1134 unsigned int number;
1140 float32x4x4_t a_val, b_val, accumulator1, accumulator2;
1141 float32x4x2_t reduced_accumulator;
1142 accumulator1.val[0] = vdupq_n_f32(0);
1143 accumulator1.val[1] = vdupq_n_f32(0);
1144 accumulator1.val[2] = vdupq_n_f32(0);
1145 accumulator1.val[3] = vdupq_n_f32(0);
1146 accumulator2.val[0] = vdupq_n_f32(0);
1147 accumulator2.val[1] = vdupq_n_f32(0);
1148 accumulator2.val[2] = vdupq_n_f32(0);
1149 accumulator2.val[3] = vdupq_n_f32(0);
1152 for (number = 0; number < quarter_points; ++number) {
1153 a_val = vld4q_f32((float*)a_ptr);
1154 b_val = vld4q_f32((float*)b_ptr);
1159 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
1160 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
1162 accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
1163 accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
1165 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
1166 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
1168 accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
1169 accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
1175 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
1176 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
1177 accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
1178 accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
1179 reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
1180 reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
1183 vst2q_f32((float*)accum_result, reduced_accumulator);
1184 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
1187 for (number = quarter_points * 8; number < num_points; ++number) {
1188 *result += (*a_ptr++) * (*b_ptr++);
1196 #include <immintrin.h>
1201 unsigned int num_points)
1204 unsigned int isodd = num_points & 3;
1207 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
1209 unsigned int number = 0;
1210 const unsigned int quarterPoints = num_points / 4;
1212 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
1217 dotProdVal = _mm256_setzero_ps();
1219 for (; number < quarterPoints; number++) {
1221 x = _mm256_load_ps((float*)a);
1222 y = _mm256_load_ps((float*)b);
1224 yl = _mm256_moveldup_ps(y);
1225 yh = _mm256_movehdup_ps(y);
1227 tmp1 = _mm256_mul_ps(x, yl);
1229 x = _mm256_shuffle_ps(x, x, 0xB1);
1231 tmp2 = _mm256_mul_ps(x, yh);
1233 z = _mm256_addsub_ps(tmp1, tmp2); // ar*br-ai*bi, ar*bi+ai*br
1236 dotProdVal = _mm256_add_ps(dotProdVal, z);
1245 _mm256_store_ps((float*)dotProductVector, dotProdVal);
1248 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
1249 dotProductVector[3]);
1251 for (i = num_points - isodd; i < num_points; i++) {
1252 dotProduct += input[i] * taps[i];
1255 *result = dotProduct;
1260 #if LV_HAVE_AVX && LV_HAVE_FMA
1261 #include <immintrin.h>
1263 static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
1264 const lv_32fc_t* input,
1265 const lv_32fc_t* taps,
1266 unsigned int num_points)
1269 unsigned int isodd = num_points & 3;
1272 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
1274 unsigned int number = 0;
1275 const unsigned int quarterPoints = num_points / 4;
1277 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
1282 dotProdVal = _mm256_setzero_ps();
1284 for (; number < quarterPoints; number++) {
1286 x = _mm256_load_ps((float*)a);
1287 y = _mm256_load_ps((float*)b);
1289 yl = _mm256_moveldup_ps(y);
1290 yh = _mm256_movehdup_ps(y);
1294 x = _mm256_shuffle_ps(x, x, 0xB1);
1296 tmp2 = _mm256_mul_ps(x, yh);
1298 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*br-ai*bi, ar*bi+ai*br
1301 dotProdVal = _mm256_add_ps(dotProdVal, z);
1310 _mm256_store_ps((float*)dotProductVector, dotProdVal);
1313 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
1314 dotProductVector[3]);
1316 for (i = num_points - isodd; i < num_points; i++) {
1317 dotProduct += input[i] * taps[i];
1320 *result = dotProduct;