Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
45 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
46 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
47 
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50 
51 
52 #ifdef LV_HAVE_GENERIC
53 
54 
/*
 * Portable C kernel: dot product of two float vectors, stored as one
 * 16-bit signed integer.
 *
 * result     - receives the dot product, truncated toward zero and saturated
 *              to [-32768, 32767]; a NaN sum is stored as 0
 * input      - first vector, num_points floats
 * taps       - second vector, num_points floats
 * num_points - number of elements read from each vector (0 stores 0)
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    /* Converting a float that does not fit in int16_t (or a NaN) to an
     * integer is undefined behaviour, so clamp explicitly. Values that are
     * in range are truncated toward zero exactly as a plain cast would. */
    if (dotProduct != dotProduct) { /* NaN */
        *result = 0;
    } else if (dotProduct >= 32767.0f) {
        *result = (int16_t)32767;
    } else if (dotProduct <= -32768.0f) {
        *result = (int16_t)-32768;
    } else {
        *result = (int16_t)dotProduct;
    }
}
72 
73 #endif /*LV_HAVE_GENERIC*/
74 
75 
76 #ifdef LV_HAVE_SSE
77 
78 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
79  const float* input,
80  const float* taps,
81  unsigned int num_points)
82 {
83 
84  unsigned int number = 0;
85  const unsigned int sixteenthPoints = num_points / 16;
86 
87  float dotProduct = 0;
88  const float* aPtr = input;
89  const float* bPtr = taps;
90 
91  __m128 a0Val, a1Val, a2Val, a3Val;
92  __m128 b0Val, b1Val, b2Val, b3Val;
93  __m128 c0Val, c1Val, c2Val, c3Val;
94 
95  __m128 dotProdVal0 = _mm_setzero_ps();
96  __m128 dotProdVal1 = _mm_setzero_ps();
97  __m128 dotProdVal2 = _mm_setzero_ps();
98  __m128 dotProdVal3 = _mm_setzero_ps();
99 
100  for (; number < sixteenthPoints; number++) {
101 
102  a0Val = _mm_load_ps(aPtr);
103  a1Val = _mm_load_ps(aPtr + 4);
104  a2Val = _mm_load_ps(aPtr + 8);
105  a3Val = _mm_load_ps(aPtr + 12);
106  b0Val = _mm_load_ps(bPtr);
107  b1Val = _mm_load_ps(bPtr + 4);
108  b2Val = _mm_load_ps(bPtr + 8);
109  b3Val = _mm_load_ps(bPtr + 12);
110 
111  c0Val = _mm_mul_ps(a0Val, b0Val);
112  c1Val = _mm_mul_ps(a1Val, b1Val);
113  c2Val = _mm_mul_ps(a2Val, b2Val);
114  c3Val = _mm_mul_ps(a3Val, b3Val);
115 
116  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
117  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
118  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
119  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
120 
121  aPtr += 16;
122  bPtr += 16;
123  }
124 
125  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
126  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
127  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
128 
129  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
130 
131  _mm_store_ps(dotProductVector,
132  dotProdVal0); // Store the results back into the dot product vector
133 
134  dotProduct = dotProductVector[0];
135  dotProduct += dotProductVector[1];
136  dotProduct += dotProductVector[2];
137  dotProduct += dotProductVector[3];
138 
139  number = sixteenthPoints * 16;
140  for (; number < num_points; number++) {
141  dotProduct += ((*aPtr++) * (*bPtr++));
142  }
143 
144  *result = (short)dotProduct;
145 }
146 
147 #endif /*LV_HAVE_SSE*/
148 
149 
150 #if LV_HAVE_AVX2 && LV_HAVE_FMA
151 
152 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
153  const float* input,
154  const float* taps,
155  unsigned int num_points)
156 {
157 
158  unsigned int number = 0;
159  const unsigned int thirtysecondPoints = num_points / 32;
160 
161  float dotProduct = 0;
162  const float* aPtr = input;
163  const float* bPtr = taps;
164 
165  __m256 a0Val, a1Val, a2Val, a3Val;
166  __m256 b0Val, b1Val, b2Val, b3Val;
167 
168  __m256 dotProdVal0 = _mm256_setzero_ps();
169  __m256 dotProdVal1 = _mm256_setzero_ps();
170  __m256 dotProdVal2 = _mm256_setzero_ps();
171  __m256 dotProdVal3 = _mm256_setzero_ps();
172 
173  for (; number < thirtysecondPoints; number++) {
174 
175  a0Val = _mm256_load_ps(aPtr);
176  a1Val = _mm256_load_ps(aPtr + 8);
177  a2Val = _mm256_load_ps(aPtr + 16);
178  a3Val = _mm256_load_ps(aPtr + 24);
179  b0Val = _mm256_load_ps(bPtr);
180  b1Val = _mm256_load_ps(bPtr + 8);
181  b2Val = _mm256_load_ps(bPtr + 16);
182  b3Val = _mm256_load_ps(bPtr + 24);
183 
184  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
188 
189  aPtr += 32;
190  bPtr += 32;
191  }
192 
193  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
196 
197  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
198 
199  _mm256_store_ps(dotProductVector,
200  dotProdVal0); // Store the results back into the dot product vector
201 
202  dotProduct = dotProductVector[0];
203  dotProduct += dotProductVector[1];
204  dotProduct += dotProductVector[2];
205  dotProduct += dotProductVector[3];
206  dotProduct += dotProductVector[4];
207  dotProduct += dotProductVector[5];
208  dotProduct += dotProductVector[6];
209  dotProduct += dotProductVector[7];
210 
211  number = thirtysecondPoints * 32;
212  for (; number < num_points; number++) {
213  dotProduct += ((*aPtr++) * (*bPtr++));
214  }
215 
216  *result = (short)dotProduct;
217 }
218 
219 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
220 
221 
222 #ifdef LV_HAVE_AVX
223 
224 static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
225  const float* input,
226  const float* taps,
227  unsigned int num_points)
228 {
229 
230  unsigned int number = 0;
231  const unsigned int thirtysecondPoints = num_points / 32;
232 
233  float dotProduct = 0;
234  const float* aPtr = input;
235  const float* bPtr = taps;
236 
237  __m256 a0Val, a1Val, a2Val, a3Val;
238  __m256 b0Val, b1Val, b2Val, b3Val;
239  __m256 c0Val, c1Val, c2Val, c3Val;
240 
241  __m256 dotProdVal0 = _mm256_setzero_ps();
242  __m256 dotProdVal1 = _mm256_setzero_ps();
243  __m256 dotProdVal2 = _mm256_setzero_ps();
244  __m256 dotProdVal3 = _mm256_setzero_ps();
245 
246  for (; number < thirtysecondPoints; number++) {
247 
248  a0Val = _mm256_load_ps(aPtr);
249  a1Val = _mm256_load_ps(aPtr + 8);
250  a2Val = _mm256_load_ps(aPtr + 16);
251  a3Val = _mm256_load_ps(aPtr + 24);
252  b0Val = _mm256_load_ps(bPtr);
253  b1Val = _mm256_load_ps(bPtr + 8);
254  b2Val = _mm256_load_ps(bPtr + 16);
255  b3Val = _mm256_load_ps(bPtr + 24);
256 
257  c0Val = _mm256_mul_ps(a0Val, b0Val);
258  c1Val = _mm256_mul_ps(a1Val, b1Val);
259  c2Val = _mm256_mul_ps(a2Val, b2Val);
260  c3Val = _mm256_mul_ps(a3Val, b3Val);
261 
262  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
263  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
264  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
265  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
266 
267  aPtr += 32;
268  bPtr += 32;
269  }
270 
271  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
272  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
273  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
274 
275  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
276 
277  _mm256_store_ps(dotProductVector,
278  dotProdVal0); // Store the results back into the dot product vector
279 
280  dotProduct = dotProductVector[0];
281  dotProduct += dotProductVector[1];
282  dotProduct += dotProductVector[2];
283  dotProduct += dotProductVector[3];
284  dotProduct += dotProductVector[4];
285  dotProduct += dotProductVector[5];
286  dotProduct += dotProductVector[6];
287  dotProduct += dotProductVector[7];
288 
289  number = thirtysecondPoints * 32;
290  for (; number < num_points; number++) {
291  dotProduct += ((*aPtr++) * (*bPtr++));
292  }
293 
294  *result = (short)dotProduct;
295 }
296 
297 #endif /*LV_HAVE_AVX*/
298 
299 #ifdef LV_HAVE_AVX512F
300 
301 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
302  const float* input,
303  const float* taps,
304  unsigned int num_points)
305 {
306 
307  unsigned int number = 0;
308  const unsigned int sixtyfourthPoints = num_points / 64;
309 
310  float dotProduct = 0;
311  const float* aPtr = input;
312  const float* bPtr = taps;
313 
314  __m512 a0Val, a1Val, a2Val, a3Val;
315  __m512 b0Val, b1Val, b2Val, b3Val;
316 
317  __m512 dotProdVal0 = _mm512_setzero_ps();
318  __m512 dotProdVal1 = _mm512_setzero_ps();
319  __m512 dotProdVal2 = _mm512_setzero_ps();
320  __m512 dotProdVal3 = _mm512_setzero_ps();
321 
322  for (; number < sixtyfourthPoints; number++) {
323 
324  a0Val = _mm512_load_ps(aPtr);
325  a1Val = _mm512_load_ps(aPtr + 16);
326  a2Val = _mm512_load_ps(aPtr + 32);
327  a3Val = _mm512_load_ps(aPtr + 48);
328  b0Val = _mm512_load_ps(bPtr);
329  b1Val = _mm512_load_ps(bPtr + 16);
330  b2Val = _mm512_load_ps(bPtr + 32);
331  b3Val = _mm512_load_ps(bPtr + 48);
332 
333  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
334  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
335  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
336  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
337 
338  aPtr += 64;
339  bPtr += 64;
340  }
341 
342  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
343  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
344  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
345 
346  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
347 
348  _mm512_store_ps(dotProductVector,
349  dotProdVal0); // Store the results back into the dot product vector
350 
351  dotProduct = dotProductVector[0];
352  dotProduct += dotProductVector[1];
353  dotProduct += dotProductVector[2];
354  dotProduct += dotProductVector[3];
355  dotProduct += dotProductVector[4];
356  dotProduct += dotProductVector[5];
357  dotProduct += dotProductVector[6];
358  dotProduct += dotProductVector[7];
359  dotProduct += dotProductVector[8];
360  dotProduct += dotProductVector[9];
361  dotProduct += dotProductVector[10];
362  dotProduct += dotProductVector[11];
363  dotProduct += dotProductVector[12];
364  dotProduct += dotProductVector[13];
365  dotProduct += dotProductVector[14];
366  dotProduct += dotProductVector[15];
367 
368  number = sixtyfourthPoints * 64;
369  for (; number < num_points; number++) {
370  dotProduct += ((*aPtr++) * (*bPtr++));
371  }
372 
373  *result = (short)dotProduct;
374 }
375 
376 #endif /*LV_HAVE_AVX512F*/
377 
378 
379 #ifdef LV_HAVE_SSE
380 
381 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
382  const float* input,
383  const float* taps,
384  unsigned int num_points)
385 {
386 
387  unsigned int number = 0;
388  const unsigned int sixteenthPoints = num_points / 16;
389 
390  float dotProduct = 0;
391  const float* aPtr = input;
392  const float* bPtr = taps;
393 
394  __m128 a0Val, a1Val, a2Val, a3Val;
395  __m128 b0Val, b1Val, b2Val, b3Val;
396  __m128 c0Val, c1Val, c2Val, c3Val;
397 
398  __m128 dotProdVal0 = _mm_setzero_ps();
399  __m128 dotProdVal1 = _mm_setzero_ps();
400  __m128 dotProdVal2 = _mm_setzero_ps();
401  __m128 dotProdVal3 = _mm_setzero_ps();
402 
403  for (; number < sixteenthPoints; number++) {
404 
405  a0Val = _mm_loadu_ps(aPtr);
406  a1Val = _mm_loadu_ps(aPtr + 4);
407  a2Val = _mm_loadu_ps(aPtr + 8);
408  a3Val = _mm_loadu_ps(aPtr + 12);
409  b0Val = _mm_loadu_ps(bPtr);
410  b1Val = _mm_loadu_ps(bPtr + 4);
411  b2Val = _mm_loadu_ps(bPtr + 8);
412  b3Val = _mm_loadu_ps(bPtr + 12);
413 
414  c0Val = _mm_mul_ps(a0Val, b0Val);
415  c1Val = _mm_mul_ps(a1Val, b1Val);
416  c2Val = _mm_mul_ps(a2Val, b2Val);
417  c3Val = _mm_mul_ps(a3Val, b3Val);
418 
419  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
420  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
421  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
422  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
423 
424  aPtr += 16;
425  bPtr += 16;
426  }
427 
428  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
429  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
430  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
431 
432  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
433 
434  _mm_store_ps(dotProductVector,
435  dotProdVal0); // Store the results back into the dot product vector
436 
437  dotProduct = dotProductVector[0];
438  dotProduct += dotProductVector[1];
439  dotProduct += dotProductVector[2];
440  dotProduct += dotProductVector[3];
441 
442  number = sixteenthPoints * 16;
443  for (; number < num_points; number++) {
444  dotProduct += ((*aPtr++) * (*bPtr++));
445  }
446 
447  *result = (short)dotProduct;
448 }
449 
450 #endif /*LV_HAVE_SSE*/
451 
452 
453 #if LV_HAVE_AVX2 && LV_HAVE_FMA
454 
455 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
456  const float* input,
457  const float* taps,
458  unsigned int num_points)
459 {
460 
461  unsigned int number = 0;
462  const unsigned int thirtysecondPoints = num_points / 32;
463 
464  float dotProduct = 0;
465  const float* aPtr = input;
466  const float* bPtr = taps;
467 
468  __m256 a0Val, a1Val, a2Val, a3Val;
469  __m256 b0Val, b1Val, b2Val, b3Val;
470 
471  __m256 dotProdVal0 = _mm256_setzero_ps();
472  __m256 dotProdVal1 = _mm256_setzero_ps();
473  __m256 dotProdVal2 = _mm256_setzero_ps();
474  __m256 dotProdVal3 = _mm256_setzero_ps();
475 
476  for (; number < thirtysecondPoints; number++) {
477 
478  a0Val = _mm256_loadu_ps(aPtr);
479  a1Val = _mm256_loadu_ps(aPtr + 8);
480  a2Val = _mm256_loadu_ps(aPtr + 16);
481  a3Val = _mm256_loadu_ps(aPtr + 24);
482  b0Val = _mm256_loadu_ps(bPtr);
483  b1Val = _mm256_loadu_ps(bPtr + 8);
484  b2Val = _mm256_loadu_ps(bPtr + 16);
485  b3Val = _mm256_loadu_ps(bPtr + 24);
486 
487  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
488  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
489  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
490  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
491 
492  aPtr += 32;
493  bPtr += 32;
494  }
495 
496  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
497  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
498  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
499 
500  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
501 
502  _mm256_store_ps(dotProductVector,
503  dotProdVal0); // Store the results back into the dot product vector
504 
505  dotProduct = dotProductVector[0];
506  dotProduct += dotProductVector[1];
507  dotProduct += dotProductVector[2];
508  dotProduct += dotProductVector[3];
509  dotProduct += dotProductVector[4];
510  dotProduct += dotProductVector[5];
511  dotProduct += dotProductVector[6];
512  dotProduct += dotProductVector[7];
513 
514  number = thirtysecondPoints * 32;
515  for (; number < num_points; number++) {
516  dotProduct += ((*aPtr++) * (*bPtr++));
517  }
518 
519  *result = (short)dotProduct;
520 }
521 
522 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
523 
524 
525 #ifdef LV_HAVE_AVX
526 
527 static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
528  const float* input,
529  const float* taps,
530  unsigned int num_points)
531 {
532 
533  unsigned int number = 0;
534  const unsigned int thirtysecondPoints = num_points / 32;
535 
536  float dotProduct = 0;
537  const float* aPtr = input;
538  const float* bPtr = taps;
539 
540  __m256 a0Val, a1Val, a2Val, a3Val;
541  __m256 b0Val, b1Val, b2Val, b3Val;
542  __m256 c0Val, c1Val, c2Val, c3Val;
543 
544  __m256 dotProdVal0 = _mm256_setzero_ps();
545  __m256 dotProdVal1 = _mm256_setzero_ps();
546  __m256 dotProdVal2 = _mm256_setzero_ps();
547  __m256 dotProdVal3 = _mm256_setzero_ps();
548 
549  for (; number < thirtysecondPoints; number++) {
550 
551  a0Val = _mm256_loadu_ps(aPtr);
552  a1Val = _mm256_loadu_ps(aPtr + 8);
553  a2Val = _mm256_loadu_ps(aPtr + 16);
554  a3Val = _mm256_loadu_ps(aPtr + 24);
555  b0Val = _mm256_loadu_ps(bPtr);
556  b1Val = _mm256_loadu_ps(bPtr + 8);
557  b2Val = _mm256_loadu_ps(bPtr + 16);
558  b3Val = _mm256_loadu_ps(bPtr + 24);
559 
560  c0Val = _mm256_mul_ps(a0Val, b0Val);
561  c1Val = _mm256_mul_ps(a1Val, b1Val);
562  c2Val = _mm256_mul_ps(a2Val, b2Val);
563  c3Val = _mm256_mul_ps(a3Val, b3Val);
564 
565  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
566  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
567  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
568  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
569 
570  aPtr += 32;
571  bPtr += 32;
572  }
573 
574  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
575  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
576  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
577 
578  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
579 
580  _mm256_store_ps(dotProductVector,
581  dotProdVal0); // Store the results back into the dot product vector
582 
583  dotProduct = dotProductVector[0];
584  dotProduct += dotProductVector[1];
585  dotProduct += dotProductVector[2];
586  dotProduct += dotProductVector[3];
587  dotProduct += dotProductVector[4];
588  dotProduct += dotProductVector[5];
589  dotProduct += dotProductVector[6];
590  dotProduct += dotProductVector[7];
591 
592  number = thirtysecondPoints * 32;
593  for (; number < num_points; number++) {
594  dotProduct += ((*aPtr++) * (*bPtr++));
595  }
596 
597  *result = (short)dotProduct;
598 }
599 
600 #endif /*LV_HAVE_AVX*/
601 
602 #ifdef LV_HAVE_AVX512F
603 
604 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
605  const float* input,
606  const float* taps,
607  unsigned int num_points)
608 {
609 
610  unsigned int number = 0;
611  const unsigned int sixtyfourthPoints = num_points / 64;
612 
613  float dotProduct = 0;
614  const float* aPtr = input;
615  const float* bPtr = taps;
616 
617  __m512 a0Val, a1Val, a2Val, a3Val;
618  __m512 b0Val, b1Val, b2Val, b3Val;
619 
620  __m512 dotProdVal0 = _mm512_setzero_ps();
621  __m512 dotProdVal1 = _mm512_setzero_ps();
622  __m512 dotProdVal2 = _mm512_setzero_ps();
623  __m512 dotProdVal3 = _mm512_setzero_ps();
624 
625  for (; number < sixtyfourthPoints; number++) {
626 
627  a0Val = _mm512_loadu_ps(aPtr);
628  a1Val = _mm512_loadu_ps(aPtr + 16);
629  a2Val = _mm512_loadu_ps(aPtr + 32);
630  a3Val = _mm512_loadu_ps(aPtr + 48);
631  b0Val = _mm512_loadu_ps(bPtr);
632  b1Val = _mm512_loadu_ps(bPtr + 16);
633  b2Val = _mm512_loadu_ps(bPtr + 32);
634  b3Val = _mm512_loadu_ps(bPtr + 48);
635 
636  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
637  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
638  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
639  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
640 
641  aPtr += 64;
642  bPtr += 64;
643  }
644 
645  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
646  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
647  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
648 
649  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
650 
651  _mm512_storeu_ps(dotProductVector,
652  dotProdVal0); // Store the results back into the dot product vector
653 
654  dotProduct = dotProductVector[0];
655  dotProduct += dotProductVector[1];
656  dotProduct += dotProductVector[2];
657  dotProduct += dotProductVector[3];
658  dotProduct += dotProductVector[4];
659  dotProduct += dotProductVector[5];
660  dotProduct += dotProductVector[6];
661  dotProduct += dotProductVector[7];
662  dotProduct += dotProductVector[8];
663  dotProduct += dotProductVector[9];
664  dotProduct += dotProductVector[10];
665  dotProduct += dotProductVector[11];
666  dotProduct += dotProductVector[12];
667  dotProduct += dotProductVector[13];
668  dotProduct += dotProductVector[14];
669  dotProduct += dotProductVector[15];
670 
671  number = sixtyfourthPoints * 64;
672  for (; number < num_points; number++) {
673  dotProduct += ((*aPtr++) * (*bPtr++));
674  }
675 
676  *result = (short)dotProduct;
677 }
678 
679 #endif /*LV_HAVE_AVX512F*/
680 
681 
682 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:381
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:55
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:224
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:527
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:78
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65