Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
60 
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 
/*
 * Portable scalar dot product.
 *
 * Writes the sum of input[i] * taps[i] for i in [0, num_points) to *result.
 * Accumulation is done in a single float, in index order; with
 * num_points == 0 the stored value is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float sum = 0;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        sum += (input[idx] * taps[idx]);
    }

    *result = sum;
}
85 
86 #endif /*LV_HAVE_GENERIC*/
87 
88 
89 #ifdef LV_HAVE_SSE
90 
91 
92 static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
93  const float* input,
94  const float* taps,
95  unsigned int num_points)
96 {
97 
98  unsigned int number = 0;
99  const unsigned int sixteenthPoints = num_points / 16;
100 
101  float dotProduct = 0;
102  const float* aPtr = input;
103  const float* bPtr = taps;
104 
105  __m128 a0Val, a1Val, a2Val, a3Val;
106  __m128 b0Val, b1Val, b2Val, b3Val;
107  __m128 c0Val, c1Val, c2Val, c3Val;
108 
109  __m128 dotProdVal0 = _mm_setzero_ps();
110  __m128 dotProdVal1 = _mm_setzero_ps();
111  __m128 dotProdVal2 = _mm_setzero_ps();
112  __m128 dotProdVal3 = _mm_setzero_ps();
113 
114  for (; number < sixteenthPoints; number++) {
115 
116  a0Val = _mm_loadu_ps(aPtr);
117  a1Val = _mm_loadu_ps(aPtr + 4);
118  a2Val = _mm_loadu_ps(aPtr + 8);
119  a3Val = _mm_loadu_ps(aPtr + 12);
120  b0Val = _mm_loadu_ps(bPtr);
121  b1Val = _mm_loadu_ps(bPtr + 4);
122  b2Val = _mm_loadu_ps(bPtr + 8);
123  b3Val = _mm_loadu_ps(bPtr + 12);
124 
125  c0Val = _mm_mul_ps(a0Val, b0Val);
126  c1Val = _mm_mul_ps(a1Val, b1Val);
127  c2Val = _mm_mul_ps(a2Val, b2Val);
128  c3Val = _mm_mul_ps(a3Val, b3Val);
129 
130  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
134 
135  aPtr += 16;
136  bPtr += 16;
137  }
138 
139  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
142 
143  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
144 
145  _mm_store_ps(dotProductVector,
146  dotProdVal0); // Store the results back into the dot product vector
147 
148  dotProduct = dotProductVector[0];
149  dotProduct += dotProductVector[1];
150  dotProduct += dotProductVector[2];
151  dotProduct += dotProductVector[3];
152 
153  number = sixteenthPoints * 16;
154  for (; number < num_points; number++) {
155  dotProduct += ((*aPtr++) * (*bPtr++));
156  }
157 
158  *result = dotProduct;
159 }
160 
161 #endif /*LV_HAVE_SSE*/
162 
163 #ifdef LV_HAVE_SSE3
164 
165 #include <pmmintrin.h>
166 
167 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
168  const float* input,
169  const float* taps,
170  unsigned int num_points)
171 {
172  unsigned int number = 0;
173  const unsigned int sixteenthPoints = num_points / 16;
174 
175  float dotProduct = 0;
176  const float* aPtr = input;
177  const float* bPtr = taps;
178 
179  __m128 a0Val, a1Val, a2Val, a3Val;
180  __m128 b0Val, b1Val, b2Val, b3Val;
181  __m128 c0Val, c1Val, c2Val, c3Val;
182 
183  __m128 dotProdVal0 = _mm_setzero_ps();
184  __m128 dotProdVal1 = _mm_setzero_ps();
185  __m128 dotProdVal2 = _mm_setzero_ps();
186  __m128 dotProdVal3 = _mm_setzero_ps();
187 
188  for (; number < sixteenthPoints; number++) {
189 
190  a0Val = _mm_loadu_ps(aPtr);
191  a1Val = _mm_loadu_ps(aPtr + 4);
192  a2Val = _mm_loadu_ps(aPtr + 8);
193  a3Val = _mm_loadu_ps(aPtr + 12);
194  b0Val = _mm_loadu_ps(bPtr);
195  b1Val = _mm_loadu_ps(bPtr + 4);
196  b2Val = _mm_loadu_ps(bPtr + 8);
197  b3Val = _mm_loadu_ps(bPtr + 12);
198 
199  c0Val = _mm_mul_ps(a0Val, b0Val);
200  c1Val = _mm_mul_ps(a1Val, b1Val);
201  c2Val = _mm_mul_ps(a2Val, b2Val);
202  c3Val = _mm_mul_ps(a3Val, b3Val);
203 
204  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
208 
209  aPtr += 16;
210  bPtr += 16;
211  }
212 
213  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
216 
217  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
218  _mm_store_ps(dotProductVector,
219  dotProdVal0); // Store the results back into the dot product vector
220 
221  dotProduct = dotProductVector[0];
222  dotProduct += dotProductVector[1];
223  dotProduct += dotProductVector[2];
224  dotProduct += dotProductVector[3];
225 
226  number = sixteenthPoints * 16;
227  for (; number < num_points; number++) {
228  dotProduct += ((*aPtr++) * (*bPtr++));
229  }
230 
231  *result = dotProduct;
232 }
233 
234 #endif /*LV_HAVE_SSE3*/
235 
236 #ifdef LV_HAVE_SSE4_1
237 
238 #include <smmintrin.h>
239 
240 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
241  const float* input,
242  const float* taps,
243  unsigned int num_points)
244 {
245  unsigned int number = 0;
246  const unsigned int sixteenthPoints = num_points / 16;
247 
248  float dotProduct = 0;
249  const float* aPtr = input;
250  const float* bPtr = taps;
251 
252  __m128 aVal1, bVal1, cVal1;
253  __m128 aVal2, bVal2, cVal2;
254  __m128 aVal3, bVal3, cVal3;
255  __m128 aVal4, bVal4, cVal4;
256 
257  __m128 dotProdVal = _mm_setzero_ps();
258 
259  for (; number < sixteenthPoints; number++) {
260 
261  aVal1 = _mm_loadu_ps(aPtr);
262  aPtr += 4;
263  aVal2 = _mm_loadu_ps(aPtr);
264  aPtr += 4;
265  aVal3 = _mm_loadu_ps(aPtr);
266  aPtr += 4;
267  aVal4 = _mm_loadu_ps(aPtr);
268  aPtr += 4;
269 
270  bVal1 = _mm_loadu_ps(bPtr);
271  bPtr += 4;
272  bVal2 = _mm_loadu_ps(bPtr);
273  bPtr += 4;
274  bVal3 = _mm_loadu_ps(bPtr);
275  bPtr += 4;
276  bVal4 = _mm_loadu_ps(bPtr);
277  bPtr += 4;
278 
279  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
283 
284  cVal1 = _mm_or_ps(cVal1, cVal2);
285  cVal3 = _mm_or_ps(cVal3, cVal4);
286  cVal1 = _mm_or_ps(cVal1, cVal3);
287 
288  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
289  }
290 
291  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
292  _mm_store_ps(dotProductVector,
293  dotProdVal); // Store the results back into the dot product vector
294 
295  dotProduct = dotProductVector[0];
296  dotProduct += dotProductVector[1];
297  dotProduct += dotProductVector[2];
298  dotProduct += dotProductVector[3];
299 
300  number = sixteenthPoints * 16;
301  for (; number < num_points; number++) {
302  dotProduct += ((*aPtr++) * (*bPtr++));
303  }
304 
305  *result = dotProduct;
306 }
307 
308 #endif /*LV_HAVE_SSE4_1*/
309 
310 #ifdef LV_HAVE_AVX
311 
312 #include <immintrin.h>
313 
314 static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
315  const float* input,
316  const float* taps,
317  unsigned int num_points)
318 {
319 
320  unsigned int number = 0;
321  const unsigned int sixteenthPoints = num_points / 16;
322 
323  float dotProduct = 0;
324  const float* aPtr = input;
325  const float* bPtr = taps;
326 
327  __m256 a0Val, a1Val;
328  __m256 b0Val, b1Val;
329  __m256 c0Val, c1Val;
330 
331  __m256 dotProdVal0 = _mm256_setzero_ps();
332  __m256 dotProdVal1 = _mm256_setzero_ps();
333 
334  for (; number < sixteenthPoints; number++) {
335 
336  a0Val = _mm256_loadu_ps(aPtr);
337  a1Val = _mm256_loadu_ps(aPtr + 8);
338  b0Val = _mm256_loadu_ps(bPtr);
339  b1Val = _mm256_loadu_ps(bPtr + 8);
340 
341  c0Val = _mm256_mul_ps(a0Val, b0Val);
342  c1Val = _mm256_mul_ps(a1Val, b1Val);
343 
344  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
346 
347  aPtr += 16;
348  bPtr += 16;
349  }
350 
351  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
352 
353  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
354 
355  _mm256_storeu_ps(dotProductVector,
356  dotProdVal0); // Store the results back into the dot product vector
357 
358  dotProduct = dotProductVector[0];
359  dotProduct += dotProductVector[1];
360  dotProduct += dotProductVector[2];
361  dotProduct += dotProductVector[3];
362  dotProduct += dotProductVector[4];
363  dotProduct += dotProductVector[5];
364  dotProduct += dotProductVector[6];
365  dotProduct += dotProductVector[7];
366 
367  number = sixteenthPoints * 16;
368  for (; number < num_points; number++) {
369  dotProduct += ((*aPtr++) * (*bPtr++));
370  }
371 
372  *result = dotProduct;
373 }
374 
375 #endif /*LV_HAVE_AVX*/
376 
377 #if LV_HAVE_AVX2 && LV_HAVE_FMA
378 #include <immintrin.h>
379 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
380  const float* input,
381  const float* taps,
382  unsigned int num_points)
383 {
384  unsigned int number;
385  const unsigned int eighthPoints = num_points / 8;
386 
387  const float* aPtr = input;
388  const float* bPtr = taps;
389 
390  __m256 dotProdVal = _mm256_setzero_ps();
391  __m256 aVal1, bVal1;
392 
393  for (number = 0; number < eighthPoints; number++) {
394 
395  aVal1 = _mm256_loadu_ps(aPtr);
396  bVal1 = _mm256_loadu_ps(bPtr);
397  aPtr += 8;
398  bPtr += 8;
399 
400  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
401  }
402 
403  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
404  _mm256_storeu_ps(dotProductVector,
405  dotProdVal); // Store the results back into the dot product vector
406 
407  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409  dotProductVector[6] + dotProductVector[7];
410 
411  for (number = eighthPoints * 8; number < num_points; number++) {
412  dotProduct += ((*aPtr++) * (*bPtr++));
413  }
414 
415  *result = dotProduct;
416 }
417 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
418 
419 #if LV_HAVE_AVX512F
420 #include <immintrin.h>
421 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
422  const float* input,
423  const float* taps,
424  unsigned int num_points)
425 {
426  unsigned int number;
427  const unsigned int sixteenthPoints = num_points / 16;
428 
429  const float* aPtr = input;
430  const float* bPtr = taps;
431 
432  __m512 dotProdVal = _mm512_setzero_ps();
433  __m512 aVal1, bVal1;
434 
435  for (number = 0; number < sixteenthPoints; number++) {
436 
437  aVal1 = _mm512_loadu_ps(aPtr);
438  bVal1 = _mm512_loadu_ps(bPtr);
439  aPtr += 16;
440  bPtr += 16;
441 
442  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
443  }
444 
445  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
446  _mm512_storeu_ps(dotProductVector,
447  dotProdVal); // Store the results back into the dot product vector
448 
449  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453  dotProductVector[12] + dotProductVector[13] +
454  dotProductVector[14] + dotProductVector[15];
455 
456  for (number = sixteenthPoints * 16; number < num_points; number++) {
457  dotProduct += ((*aPtr++) * (*bPtr++));
458  }
459 
460  *result = dotProduct;
461 }
462 #endif /* LV_HAVE_AVX512F */
463 
464 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
465 
466 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
468 
469 #include <stdio.h>
470 #include <volk/volk_common.h>
471 
472 
473 #ifdef LV_HAVE_GENERIC
474 
475 
/*
 * Portable scalar dot product (the "_a_" named variant; the code itself
 * contains nothing that depends on buffer alignment).
 *
 * Writes the sum of input[i] * taps[i] for i in [0, num_points) to *result.
 * Accumulation is done in a single float, in index order; with
 * num_points == 0 the stored value is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    float sum = 0;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        sum += (input[idx] * taps[idx]);
    }

    *result = sum;
}
493 
494 #endif /*LV_HAVE_GENERIC*/
495 
496 
497 #ifdef LV_HAVE_SSE
498 
499 
500 static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
501  const float* input,
502  const float* taps,
503  unsigned int num_points)
504 {
505 
506  unsigned int number = 0;
507  const unsigned int sixteenthPoints = num_points / 16;
508 
509  float dotProduct = 0;
510  const float* aPtr = input;
511  const float* bPtr = taps;
512 
513  __m128 a0Val, a1Val, a2Val, a3Val;
514  __m128 b0Val, b1Val, b2Val, b3Val;
515  __m128 c0Val, c1Val, c2Val, c3Val;
516 
517  __m128 dotProdVal0 = _mm_setzero_ps();
518  __m128 dotProdVal1 = _mm_setzero_ps();
519  __m128 dotProdVal2 = _mm_setzero_ps();
520  __m128 dotProdVal3 = _mm_setzero_ps();
521 
522  for (; number < sixteenthPoints; number++) {
523 
524  a0Val = _mm_load_ps(aPtr);
525  a1Val = _mm_load_ps(aPtr + 4);
526  a2Val = _mm_load_ps(aPtr + 8);
527  a3Val = _mm_load_ps(aPtr + 12);
528  b0Val = _mm_load_ps(bPtr);
529  b1Val = _mm_load_ps(bPtr + 4);
530  b2Val = _mm_load_ps(bPtr + 8);
531  b3Val = _mm_load_ps(bPtr + 12);
532 
533  c0Val = _mm_mul_ps(a0Val, b0Val);
534  c1Val = _mm_mul_ps(a1Val, b1Val);
535  c2Val = _mm_mul_ps(a2Val, b2Val);
536  c3Val = _mm_mul_ps(a3Val, b3Val);
537 
538  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
539  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
540  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
541  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
542 
543  aPtr += 16;
544  bPtr += 16;
545  }
546 
547  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
548  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
549  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
550 
551  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
552 
553  _mm_store_ps(dotProductVector,
554  dotProdVal0); // Store the results back into the dot product vector
555 
556  dotProduct = dotProductVector[0];
557  dotProduct += dotProductVector[1];
558  dotProduct += dotProductVector[2];
559  dotProduct += dotProductVector[3];
560 
561  number = sixteenthPoints * 16;
562  for (; number < num_points; number++) {
563  dotProduct += ((*aPtr++) * (*bPtr++));
564  }
565 
566  *result = dotProduct;
567 }
568 
569 #endif /*LV_HAVE_SSE*/
570 
571 #ifdef LV_HAVE_SSE3
572 
573 #include <pmmintrin.h>
574 
575 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
576  const float* input,
577  const float* taps,
578  unsigned int num_points)
579 {
580  unsigned int number = 0;
581  const unsigned int sixteenthPoints = num_points / 16;
582 
583  float dotProduct = 0;
584  const float* aPtr = input;
585  const float* bPtr = taps;
586 
587  __m128 a0Val, a1Val, a2Val, a3Val;
588  __m128 b0Val, b1Val, b2Val, b3Val;
589  __m128 c0Val, c1Val, c2Val, c3Val;
590 
591  __m128 dotProdVal0 = _mm_setzero_ps();
592  __m128 dotProdVal1 = _mm_setzero_ps();
593  __m128 dotProdVal2 = _mm_setzero_ps();
594  __m128 dotProdVal3 = _mm_setzero_ps();
595 
596  for (; number < sixteenthPoints; number++) {
597 
598  a0Val = _mm_load_ps(aPtr);
599  a1Val = _mm_load_ps(aPtr + 4);
600  a2Val = _mm_load_ps(aPtr + 8);
601  a3Val = _mm_load_ps(aPtr + 12);
602  b0Val = _mm_load_ps(bPtr);
603  b1Val = _mm_load_ps(bPtr + 4);
604  b2Val = _mm_load_ps(bPtr + 8);
605  b3Val = _mm_load_ps(bPtr + 12);
606 
607  c0Val = _mm_mul_ps(a0Val, b0Val);
608  c1Val = _mm_mul_ps(a1Val, b1Val);
609  c2Val = _mm_mul_ps(a2Val, b2Val);
610  c3Val = _mm_mul_ps(a3Val, b3Val);
611 
612  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
613  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
614  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
615  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
616 
617  aPtr += 16;
618  bPtr += 16;
619  }
620 
621  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
622  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
623  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
624 
625  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
626  _mm_store_ps(dotProductVector,
627  dotProdVal0); // Store the results back into the dot product vector
628 
629  dotProduct = dotProductVector[0];
630  dotProduct += dotProductVector[1];
631  dotProduct += dotProductVector[2];
632  dotProduct += dotProductVector[3];
633 
634  number = sixteenthPoints * 16;
635  for (; number < num_points; number++) {
636  dotProduct += ((*aPtr++) * (*bPtr++));
637  }
638 
639  *result = dotProduct;
640 }
641 
642 #endif /*LV_HAVE_SSE3*/
643 
644 #ifdef LV_HAVE_SSE4_1
645 
646 #include <smmintrin.h>
647 
648 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
649  const float* input,
650  const float* taps,
651  unsigned int num_points)
652 {
653  unsigned int number = 0;
654  const unsigned int sixteenthPoints = num_points / 16;
655 
656  float dotProduct = 0;
657  const float* aPtr = input;
658  const float* bPtr = taps;
659 
660  __m128 aVal1, bVal1, cVal1;
661  __m128 aVal2, bVal2, cVal2;
662  __m128 aVal3, bVal3, cVal3;
663  __m128 aVal4, bVal4, cVal4;
664 
665  __m128 dotProdVal = _mm_setzero_ps();
666 
667  for (; number < sixteenthPoints; number++) {
668 
669  aVal1 = _mm_load_ps(aPtr);
670  aPtr += 4;
671  aVal2 = _mm_load_ps(aPtr);
672  aPtr += 4;
673  aVal3 = _mm_load_ps(aPtr);
674  aPtr += 4;
675  aVal4 = _mm_load_ps(aPtr);
676  aPtr += 4;
677 
678  bVal1 = _mm_load_ps(bPtr);
679  bPtr += 4;
680  bVal2 = _mm_load_ps(bPtr);
681  bPtr += 4;
682  bVal3 = _mm_load_ps(bPtr);
683  bPtr += 4;
684  bVal4 = _mm_load_ps(bPtr);
685  bPtr += 4;
686 
687  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
688  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
689  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
690  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
691 
692  cVal1 = _mm_or_ps(cVal1, cVal2);
693  cVal3 = _mm_or_ps(cVal3, cVal4);
694  cVal1 = _mm_or_ps(cVal1, cVal3);
695 
696  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
697  }
698 
699  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
700  _mm_store_ps(dotProductVector,
701  dotProdVal); // Store the results back into the dot product vector
702 
703  dotProduct = dotProductVector[0];
704  dotProduct += dotProductVector[1];
705  dotProduct += dotProductVector[2];
706  dotProduct += dotProductVector[3];
707 
708  number = sixteenthPoints * 16;
709  for (; number < num_points; number++) {
710  dotProduct += ((*aPtr++) * (*bPtr++));
711  }
712 
713  *result = dotProduct;
714 }
715 
716 #endif /*LV_HAVE_SSE4_1*/
717 
718 #ifdef LV_HAVE_AVX
719 
720 #include <immintrin.h>
721 
722 static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
723  const float* input,
724  const float* taps,
725  unsigned int num_points)
726 {
727 
728  unsigned int number = 0;
729  const unsigned int sixteenthPoints = num_points / 16;
730 
731  float dotProduct = 0;
732  const float* aPtr = input;
733  const float* bPtr = taps;
734 
735  __m256 a0Val, a1Val;
736  __m256 b0Val, b1Val;
737  __m256 c0Val, c1Val;
738 
739  __m256 dotProdVal0 = _mm256_setzero_ps();
740  __m256 dotProdVal1 = _mm256_setzero_ps();
741 
742  for (; number < sixteenthPoints; number++) {
743 
744  a0Val = _mm256_load_ps(aPtr);
745  a1Val = _mm256_load_ps(aPtr + 8);
746  b0Val = _mm256_load_ps(bPtr);
747  b1Val = _mm256_load_ps(bPtr + 8);
748 
749  c0Val = _mm256_mul_ps(a0Val, b0Val);
750  c1Val = _mm256_mul_ps(a1Val, b1Val);
751 
752  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
753  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
754 
755  aPtr += 16;
756  bPtr += 16;
757  }
758 
759  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
760 
761  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
762 
763  _mm256_store_ps(dotProductVector,
764  dotProdVal0); // Store the results back into the dot product vector
765 
766  dotProduct = dotProductVector[0];
767  dotProduct += dotProductVector[1];
768  dotProduct += dotProductVector[2];
769  dotProduct += dotProductVector[3];
770  dotProduct += dotProductVector[4];
771  dotProduct += dotProductVector[5];
772  dotProduct += dotProductVector[6];
773  dotProduct += dotProductVector[7];
774 
775  number = sixteenthPoints * 16;
776  for (; number < num_points; number++) {
777  dotProduct += ((*aPtr++) * (*bPtr++));
778  }
779 
780  *result = dotProduct;
781 }
782 #endif /*LV_HAVE_AVX*/
783 
784 
785 #if LV_HAVE_AVX2 && LV_HAVE_FMA
786 #include <immintrin.h>
787 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
788  const float* input,
789  const float* taps,
790  unsigned int num_points)
791 {
792  unsigned int number;
793  const unsigned int eighthPoints = num_points / 8;
794 
795  const float* aPtr = input;
796  const float* bPtr = taps;
797 
798  __m256 dotProdVal = _mm256_setzero_ps();
799  __m256 aVal1, bVal1;
800 
801  for (number = 0; number < eighthPoints; number++) {
802 
803  aVal1 = _mm256_load_ps(aPtr);
804  bVal1 = _mm256_load_ps(bPtr);
805  aPtr += 8;
806  bPtr += 8;
807 
808  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
809  }
810 
811  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
812  _mm256_store_ps(dotProductVector,
813  dotProdVal); // Store the results back into the dot product vector
814 
815  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
816  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
817  dotProductVector[6] + dotProductVector[7];
818 
819  for (number = eighthPoints * 8; number < num_points; number++) {
820  dotProduct += ((*aPtr++) * (*bPtr++));
821  }
822 
823  *result = dotProduct;
824 }
825 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
826 
827 #if LV_HAVE_AVX512F
828 #include <immintrin.h>
829 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
830  const float* input,
831  const float* taps,
832  unsigned int num_points)
833 {
834  unsigned int number;
835  const unsigned int sixteenthPoints = num_points / 16;
836 
837  const float* aPtr = input;
838  const float* bPtr = taps;
839 
840  __m512 dotProdVal = _mm512_setzero_ps();
841  __m512 aVal1, bVal1;
842 
843  for (number = 0; number < sixteenthPoints; number++) {
844 
845  aVal1 = _mm512_load_ps(aPtr);
846  bVal1 = _mm512_load_ps(bPtr);
847  aPtr += 16;
848  bPtr += 16;
849 
850  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
851  }
852 
853  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
854  _mm512_store_ps(dotProductVector,
855  dotProdVal); // Store the results back into the dot product vector
856 
857  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
858  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
859  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
860  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
861  dotProductVector[12] + dotProductVector[13] +
862  dotProductVector[14] + dotProductVector[15];
863 
864  for (number = sixteenthPoints * 16; number < num_points; number++) {
865  dotProduct += ((*aPtr++) * (*bPtr++));
866  }
867 
868  *result = dotProduct;
869 }
870 #endif /* LV_HAVE_AVX512F */
871 
872 #ifdef LV_HAVE_NEON
873 #include <arm_neon.h>
874 
875 static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
876  const float* input,
877  const float* taps,
878  unsigned int num_points)
879 {
880 
881  unsigned int quarter_points = num_points / 16;
882  float dotProduct = 0;
883  const float* aPtr = input;
884  const float* bPtr = taps;
885  unsigned int number = 0;
886 
887  float32x4x4_t a_val, b_val, accumulator0;
888  accumulator0.val[0] = vdupq_n_f32(0);
889  accumulator0.val[1] = vdupq_n_f32(0);
890  accumulator0.val[2] = vdupq_n_f32(0);
891  accumulator0.val[3] = vdupq_n_f32(0);
892  // factor of 4 loop unroll with independent accumulators
893  // uses 12 out of 16 neon q registers
894  for (number = 0; number < quarter_points; ++number) {
895  a_val = vld4q_f32(aPtr);
896  b_val = vld4q_f32(bPtr);
897  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
898  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
899  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
900  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
901  aPtr += 16;
902  bPtr += 16;
903  }
904  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
905  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
906  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
907  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
908  vst1q_f32(accumulator, accumulator0.val[0]);
909  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
910 
911  for (number = quarter_points * 16; number < num_points; number++) {
912  dotProduct += ((*aPtr++) * (*bPtr++));
913  }
914 
915  *result = dotProduct;
916 }
917 
918 #endif
919 
920 
921 #ifdef LV_HAVE_NEON
922 static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
923  const float* input,
924  const float* taps,
925  unsigned int num_points)
926 {
927 
928  unsigned int quarter_points = num_points / 8;
929  float dotProduct = 0;
930  const float* aPtr = input;
931  const float* bPtr = taps;
932  unsigned int number = 0;
933 
934  float32x4x2_t a_val, b_val, accumulator_val;
935  accumulator_val.val[0] = vdupq_n_f32(0);
936  accumulator_val.val[1] = vdupq_n_f32(0);
937  // factor of 2 loop unroll with independent accumulators
938  for (number = 0; number < quarter_points; ++number) {
939  a_val = vld2q_f32(aPtr);
940  b_val = vld2q_f32(bPtr);
941  accumulator_val.val[0] =
942  vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
943  accumulator_val.val[1] =
944  vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
945  aPtr += 8;
946  bPtr += 8;
947  }
948  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
949  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
950  vst1q_f32(accumulator, accumulator_val.val[0]);
951  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
952 
953  for (number = quarter_points * 8; number < num_points; number++) {
954  dotProduct += ((*aPtr++) * (*bPtr++));
955  }
956 
957  *result = dotProduct;
958 }
959 
960 #endif /* LV_HAVE_NEON */
961 
962 #ifdef LV_HAVE_NEONV7
/*
 * External declaration only; the definition is not in this header.
 * Declared when LV_HAVE_NEONV7 is defined. Takes the same argument
 * types as the other kernels in this file (output pointer, two const
 * input pointers, element count).
 * NOTE(review): the name suggests a hand-written NEON assembly
 * implementation that stores the dot product of aVector and bVector in
 * *cVector -- confirm against the definition.
 */
963 extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
964  const float* aVector,
965  const float* bVector,
966  unsigned int num_points);
967 #endif /* LV_HAVE_NEONV7 */
968 
969 #ifdef LV_HAVE_NEONV7
/*
 * External declaration only; the definition is not in this header.
 * Declared when LV_HAVE_NEONV7 is defined. Takes the same argument
 * types as the other kernels in this file (output pointer, two const
 * input pointers, element count).
 * NOTE(review): the name suggests a further-optimized NEON assembly
 * implementation of the same dot product -- confirm against the
 * definition.
 */
970 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
971  const float* aVector,
972  const float* bVector,
973  unsigned int num_points);
974 #endif /* LV_HAVE_NEONV7 */
975 
976 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition: sse2neon.h:7701
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:722
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:500
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:92
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:314
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:68
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:167
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:476
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:575
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:875
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:922
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65