58 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
65 #ifdef LV_HAVE_GENERIC
71 unsigned int num_points)
75 const float* aPtr = input;
76 const float* bPtr = taps;
77 unsigned int number = 0;
79 for (number = 0; number < num_points; number++) {
80 dotProduct += ((*aPtr++) * (*bPtr++));
95 unsigned int num_points)
98 unsigned int number = 0;
99 const unsigned int sixteenthPoints = num_points / 16;
101 float dotProduct = 0;
102 const float* aPtr = input;
103 const float* bPtr = taps;
105 __m128 a0Val, a1Val, a2Val, a3Val;
106 __m128 b0Val, b1Val, b2Val, b3Val;
107 __m128 c0Val, c1Val, c2Val, c3Val;
114 for (; number < sixteenthPoints; number++) {
139 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
148 dotProduct = dotProductVector[0];
149 dotProduct += dotProductVector[1];
150 dotProduct += dotProductVector[2];
151 dotProduct += dotProductVector[3];
153 number = sixteenthPoints * 16;
154 for (; number < num_points; number++) {
155 dotProduct += ((*aPtr++) * (*bPtr++));
158 *result = dotProduct;
165 #include <pmmintrin.h>
170 unsigned int num_points)
172 unsigned int number = 0;
173 const unsigned int sixteenthPoints = num_points / 16;
175 float dotProduct = 0;
176 const float* aPtr = input;
177 const float* bPtr = taps;
179 __m128 a0Val, a1Val, a2Val, a3Val;
180 __m128 b0Val, b1Val, b2Val, b3Val;
181 __m128 c0Val, c1Val, c2Val, c3Val;
188 for (; number < sixteenthPoints; number++) {
213 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
214 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
215 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
221 dotProduct = dotProductVector[0];
222 dotProduct += dotProductVector[1];
223 dotProduct += dotProductVector[2];
224 dotProduct += dotProductVector[3];
226 number = sixteenthPoints * 16;
227 for (; number < num_points; number++) {
228 dotProduct += ((*aPtr++) * (*bPtr++));
231 *result = dotProduct;
236 #ifdef LV_HAVE_SSE4_1
238 #include <smmintrin.h>
240 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(
float* result,
243 unsigned int num_points)
245 unsigned int number = 0;
246 const unsigned int sixteenthPoints = num_points / 16;
248 float dotProduct = 0;
249 const float* aPtr = input;
250 const float* bPtr = taps;
252 __m128 aVal1, bVal1, cVal1;
253 __m128 aVal2, bVal2, cVal2;
254 __m128 aVal3, bVal3, cVal3;
255 __m128 aVal4, bVal4, cVal4;
259 for (; number < sixteenthPoints; number++) {
295 dotProduct = dotProductVector[0];
296 dotProduct += dotProductVector[1];
297 dotProduct += dotProductVector[2];
298 dotProduct += dotProductVector[3];
300 number = sixteenthPoints * 16;
301 for (; number < num_points; number++) {
302 dotProduct += ((*aPtr++) * (*bPtr++));
305 *result = dotProduct;
312 #include <immintrin.h>
317 unsigned int num_points)
320 unsigned int number = 0;
321 const unsigned int sixteenthPoints = num_points / 16;
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
331 __m256 dotProdVal0 = _mm256_setzero_ps();
332 __m256 dotProdVal1 = _mm256_setzero_ps();
334 for (; number < sixteenthPoints; number++) {
336 a0Val = _mm256_loadu_ps(aPtr);
337 a1Val = _mm256_loadu_ps(aPtr + 8);
338 b0Val = _mm256_loadu_ps(bPtr);
339 b1Val = _mm256_loadu_ps(bPtr + 8);
341 c0Val = _mm256_mul_ps(a0Val, b0Val);
342 c1Val = _mm256_mul_ps(a1Val, b1Val);
344 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
351 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
355 _mm256_storeu_ps(dotProductVector,
358 dotProduct = dotProductVector[0];
359 dotProduct += dotProductVector[1];
360 dotProduct += dotProductVector[2];
361 dotProduct += dotProductVector[3];
362 dotProduct += dotProductVector[4];
363 dotProduct += dotProductVector[5];
364 dotProduct += dotProductVector[6];
365 dotProduct += dotProductVector[7];
367 number = sixteenthPoints * 16;
368 for (; number < num_points; number++) {
369 dotProduct += ((*aPtr++) * (*bPtr++));
372 *result = dotProduct;
377 #if LV_HAVE_AVX2 && LV_HAVE_FMA
378 #include <immintrin.h>
379 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(
float* result,
382 unsigned int num_points)
385 const unsigned int eighthPoints = num_points / 8;
387 const float* aPtr = input;
388 const float* bPtr = taps;
390 __m256 dotProdVal = _mm256_setzero_ps();
393 for (number = 0; number < eighthPoints; number++) {
395 aVal1 = _mm256_loadu_ps(aPtr);
396 bVal1 = _mm256_loadu_ps(bPtr);
400 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
404 _mm256_storeu_ps(dotProductVector,
407 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409 dotProductVector[6] + dotProductVector[7];
411 for (number = eighthPoints * 8; number < num_points; number++) {
412 dotProduct += ((*aPtr++) * (*bPtr++));
415 *result = dotProduct;
420 #include <immintrin.h>
421 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(
float* result,
424 unsigned int num_points)
427 const unsigned int sixteenthPoints = num_points / 16;
429 const float* aPtr = input;
430 const float* bPtr = taps;
432 __m512 dotProdVal = _mm512_setzero_ps();
435 for (number = 0; number < sixteenthPoints; number++) {
437 aVal1 = _mm512_loadu_ps(aPtr);
438 bVal1 = _mm512_loadu_ps(bPtr);
442 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
446 _mm512_storeu_ps(dotProductVector,
449 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453 dotProductVector[12] + dotProductVector[13] +
454 dotProductVector[14] + dotProductVector[15];
456 for (number = sixteenthPoints * 16; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
460 *result = dotProduct;
466 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
473 #ifdef LV_HAVE_GENERIC
479 unsigned int num_points)
482 float dotProduct = 0;
483 const float* aPtr = input;
484 const float* bPtr = taps;
485 unsigned int number = 0;
487 for (number = 0; number < num_points; number++) {
488 dotProduct += ((*aPtr++) * (*bPtr++));
491 *result = dotProduct;
503 unsigned int num_points)
506 unsigned int number = 0;
507 const unsigned int sixteenthPoints = num_points / 16;
509 float dotProduct = 0;
510 const float* aPtr = input;
511 const float* bPtr = taps;
513 __m128 a0Val, a1Val, a2Val, a3Val;
514 __m128 b0Val, b1Val, b2Val, b3Val;
515 __m128 c0Val, c1Val, c2Val, c3Val;
522 for (; number < sixteenthPoints; number++) {
547 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
548 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
549 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
556 dotProduct = dotProductVector[0];
557 dotProduct += dotProductVector[1];
558 dotProduct += dotProductVector[2];
559 dotProduct += dotProductVector[3];
561 number = sixteenthPoints * 16;
562 for (; number < num_points; number++) {
563 dotProduct += ((*aPtr++) * (*bPtr++));
566 *result = dotProduct;
573 #include <pmmintrin.h>
578 unsigned int num_points)
580 unsigned int number = 0;
581 const unsigned int sixteenthPoints = num_points / 16;
583 float dotProduct = 0;
584 const float* aPtr = input;
585 const float* bPtr = taps;
587 __m128 a0Val, a1Val, a2Val, a3Val;
588 __m128 b0Val, b1Val, b2Val, b3Val;
589 __m128 c0Val, c1Val, c2Val, c3Val;
596 for (; number < sixteenthPoints; number++) {
621 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
622 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
623 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
629 dotProduct = dotProductVector[0];
630 dotProduct += dotProductVector[1];
631 dotProduct += dotProductVector[2];
632 dotProduct += dotProductVector[3];
634 number = sixteenthPoints * 16;
635 for (; number < num_points; number++) {
636 dotProduct += ((*aPtr++) * (*bPtr++));
639 *result = dotProduct;
644 #ifdef LV_HAVE_SSE4_1
646 #include <smmintrin.h>
648 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(
float* result,
651 unsigned int num_points)
653 unsigned int number = 0;
654 const unsigned int sixteenthPoints = num_points / 16;
656 float dotProduct = 0;
657 const float* aPtr = input;
658 const float* bPtr = taps;
660 __m128 aVal1, bVal1, cVal1;
661 __m128 aVal2, bVal2, cVal2;
662 __m128 aVal3, bVal3, cVal3;
663 __m128 aVal4, bVal4, cVal4;
667 for (; number < sixteenthPoints; number++) {
703 dotProduct = dotProductVector[0];
704 dotProduct += dotProductVector[1];
705 dotProduct += dotProductVector[2];
706 dotProduct += dotProductVector[3];
708 number = sixteenthPoints * 16;
709 for (; number < num_points; number++) {
710 dotProduct += ((*aPtr++) * (*bPtr++));
713 *result = dotProduct;
720 #include <immintrin.h>
725 unsigned int num_points)
728 unsigned int number = 0;
729 const unsigned int sixteenthPoints = num_points / 16;
731 float dotProduct = 0;
732 const float* aPtr = input;
733 const float* bPtr = taps;
739 __m256 dotProdVal0 = _mm256_setzero_ps();
740 __m256 dotProdVal1 = _mm256_setzero_ps();
742 for (; number < sixteenthPoints; number++) {
744 a0Val = _mm256_load_ps(aPtr);
745 a1Val = _mm256_load_ps(aPtr + 8);
746 b0Val = _mm256_load_ps(bPtr);
747 b1Val = _mm256_load_ps(bPtr + 8);
749 c0Val = _mm256_mul_ps(a0Val, b0Val);
750 c1Val = _mm256_mul_ps(a1Val, b1Val);
752 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
753 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
759 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
763 _mm256_store_ps(dotProductVector,
766 dotProduct = dotProductVector[0];
767 dotProduct += dotProductVector[1];
768 dotProduct += dotProductVector[2];
769 dotProduct += dotProductVector[3];
770 dotProduct += dotProductVector[4];
771 dotProduct += dotProductVector[5];
772 dotProduct += dotProductVector[6];
773 dotProduct += dotProductVector[7];
775 number = sixteenthPoints * 16;
776 for (; number < num_points; number++) {
777 dotProduct += ((*aPtr++) * (*bPtr++));
780 *result = dotProduct;
785 #if LV_HAVE_AVX2 && LV_HAVE_FMA
786 #include <immintrin.h>
787 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(
float* result,
790 unsigned int num_points)
793 const unsigned int eighthPoints = num_points / 8;
795 const float* aPtr = input;
796 const float* bPtr = taps;
798 __m256 dotProdVal = _mm256_setzero_ps();
801 for (number = 0; number < eighthPoints; number++) {
803 aVal1 = _mm256_load_ps(aPtr);
804 bVal1 = _mm256_load_ps(bPtr);
808 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
812 _mm256_store_ps(dotProductVector,
815 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
816 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
817 dotProductVector[6] + dotProductVector[7];
819 for (number = eighthPoints * 8; number < num_points; number++) {
820 dotProduct += ((*aPtr++) * (*bPtr++));
823 *result = dotProduct;
828 #include <immintrin.h>
829 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(
float* result,
832 unsigned int num_points)
835 const unsigned int sixteenthPoints = num_points / 16;
837 const float* aPtr = input;
838 const float* bPtr = taps;
840 __m512 dotProdVal = _mm512_setzero_ps();
843 for (number = 0; number < sixteenthPoints; number++) {
845 aVal1 = _mm512_load_ps(aPtr);
846 bVal1 = _mm512_load_ps(bPtr);
850 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
854 _mm512_store_ps(dotProductVector,
857 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
858 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
859 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
860 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
861 dotProductVector[12] + dotProductVector[13] +
862 dotProductVector[14] + dotProductVector[15];
864 for (number = sixteenthPoints * 16; number < num_points; number++) {
865 dotProduct += ((*aPtr++) * (*bPtr++));
868 *result = dotProduct;
873 #include <arm_neon.h>
878 unsigned int num_points)
881 unsigned int quarter_points = num_points / 16;
882 float dotProduct = 0;
883 const float* aPtr = input;
884 const float* bPtr = taps;
885 unsigned int number = 0;
887 float32x4x4_t a_val, b_val, accumulator0;
888 accumulator0.val[0] = vdupq_n_f32(0);
889 accumulator0.val[1] = vdupq_n_f32(0);
890 accumulator0.val[2] = vdupq_n_f32(0);
891 accumulator0.val[3] = vdupq_n_f32(0);
894 for (number = 0; number < quarter_points; ++number) {
895 a_val = vld4q_f32(aPtr);
896 b_val = vld4q_f32(bPtr);
897 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
898 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
899 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
900 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
904 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
905 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
906 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
908 vst1q_f32(accumulator, accumulator0.val[0]);
909 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
911 for (number = quarter_points * 16; number < num_points; number++) {
912 dotProduct += ((*aPtr++) * (*bPtr++));
915 *result = dotProduct;
925 unsigned int num_points)
928 unsigned int quarter_points = num_points / 8;
929 float dotProduct = 0;
930 const float* aPtr = input;
931 const float* bPtr = taps;
932 unsigned int number = 0;
934 float32x4x2_t a_val, b_val, accumulator_val;
935 accumulator_val.val[0] = vdupq_n_f32(0);
936 accumulator_val.val[1] = vdupq_n_f32(0);
938 for (number = 0; number < quarter_points; ++number) {
939 a_val = vld2q_f32(aPtr);
940 b_val = vld2q_f32(bPtr);
941 accumulator_val.val[0] =
942 vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
943 accumulator_val.val[1] =
944 vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
948 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
950 vst1q_f32(accumulator, accumulator_val.val[0]);
951 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
953 for (number = quarter_points * 8; number < num_points; number++) {
954 dotProduct += ((*aPtr++) * (*bPtr++));
957 *result = dotProduct;
962 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the function has external linkage and its body is not
 * defined in this file. The declaration is visible only when
 * LV_HAVE_NEONV7 is defined.
 *
 * cVector    - writable float pointer (the only non-const argument)
 * aVector    - first read-only float input
 * bVector    - second read-only float input
 * num_points - unsigned element count
 *
 * NOTE(review): judging by the name, this is presumably a hand-written
 * ARMv7 NEON assembly kernel that stores the dot product of aVector and
 * bVector through cVector -- confirm against the assembly source.
 */
963 extern void volk_32f_x2_dot_prod_32f_a_neonasm(
float* cVector,
964 const float* aVector,
965 const float* bVector,
966 unsigned int num_points);
969 #ifdef LV_HAVE_NEONV7
970 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(
float* cVector,
971 const float* aVector,
972 const float* bVector,
973 unsigned int num_points);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition: sse2neon.h:7701
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:722
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:500
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:92
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:314
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:68
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:167
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:476
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:575
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:875
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:922
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65