45 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
46 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
52 #ifdef LV_HAVE_GENERIC
/* volk_32f_x2_dot_prod_16i_generic (fragment): portable scalar dot product of
 * `input` and `taps` over num_points floats; the sum is narrowed to int16_t
 * and written to *result.
 * NOTE(review): this listing drops some source lines (the leading signature
 * lines, the braces, and the declaration of dotProduct are not shown). */
58 unsigned int num_points)
/* Read cursors over the two float arrays. */
62 const float* aPtr = input;
63 const float* bPtr = taps;
64 unsigned int number = 0;
/* Accumulate one product per element; both cursors advance by one each pass. */
66 for (number = 0; number < num_points; number++) {
67 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the int16_t range is
 * undefined behavior for this cast in C. */
70 *result = (int16_t)dotProduct;
/* volk_32f_x2_dot_prod_16i_a_sse (fragment): SSE dot product of `input` and
 * `taps`, processed in groups of 16 floats with a scalar loop for the
 * remainder; the sum is narrowed to a 16-bit integer and written to *result.
 * NOTE(review): this listing drops many source lines (leading signature
 * lines, braces, the accumulator and dotProduct declarations, and the whole
 * body of the vector loop are not shown). */
81 unsigned int num_points)
84 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
85 const unsigned int sixteenthPoints = num_points / 16;
88 const float* aPtr = input;
89 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps; c* presumably hold their
 * element-wise products (the loop body is not shown, confirm in the source). */
91 __m128 a0Val, a1Val, a2Val, a3Val;
92 __m128 b0Val, b1Val, b2Val, b3Val;
93 __m128 c0Val, c1Val, c2Val, c3Val;
/* Vector loop: one pass per 16-float group (listing lines 101-124 omitted). */
100 for (; number < sixteenthPoints; number++) {
/* Fold the four partial accumulators into dotProdVal0. */
125 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
126 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
127 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Horizontal sum of the four lanes read back from dotProductVector (the
 * declaration of that buffer and the store into it are not shown). */
134 dotProduct = dotProductVector[0];
135 dotProduct += dotProductVector[1];
136 dotProduct += dotProductVector[2];
137 dotProduct += dotProductVector[3];
/* Scalar tail: resume counting after the last full group and finish the
 * remaining num_points % 16 elements one at a time. */
139 number = sixteenthPoints * 16;
140 for (; number < num_points; number++) {
141 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. The generic kernel casts with
 * int16_t rather than short. */
144 *result = (short)dotProduct;
150 #if LV_HAVE_AVX2 && LV_HAVE_FMA
/* Aligned AVX2 + FMA dot product of `input` and `taps`: 32 floats per vector
 * iteration accumulated with fused multiply-add, a scalar loop for the
 * remainder, and the sum narrowed to a 16-bit integer in *result.
 * NOTE(review): this listing drops some source lines (two signature lines,
 * braces, the pointer-advance statements inside the vector loop, and the
 * declaration of dotProductVector are not shown). */
152 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
155 unsigned int num_points)
158 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
159 const unsigned int thirtysecondPoints = num_points / 32;
161 float dotProduct = 0;
162 const float* aPtr = input;
163 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps. */
165 __m256 a0Val, a1Val, a2Val, a3Val;
166 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
168 __m256 dotProdVal0 = _mm256_setzero_ps();
169 __m256 dotProdVal1 = _mm256_setzero_ps();
170 __m256 dotProdVal2 = _mm256_setzero_ps();
171 __m256 dotProdVal3 = _mm256_setzero_ps();
173 for (; number < thirtysecondPoints; number++) {
/* _mm256_load_ps is the aligned-load intrinsic, so both arrays are expected
 * to be 32-byte aligned. Four loads of 8 floats cover offsets 0..31. */
175 a0Val = _mm256_load_ps(aPtr);
176 a1Val = _mm256_load_ps(aPtr + 8);
177 a2Val = _mm256_load_ps(aPtr + 16);
178 a3Val = _mm256_load_ps(aPtr + 24);
179 b0Val = _mm256_load_ps(bPtr);
180 b1Val = _mm256_load_ps(bPtr + 8);
181 b2Val = _mm256_load_ps(bPtr + 16);
182 b3Val = _mm256_load_ps(bPtr + 24);
/* accumulator = a * b + accumulator, one accumulator per load pair. */
184 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 32. */
/* Fold the four partial accumulators into dotProdVal0. */
193 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store of the vector into dotProductVector; the second argument
 * continues on a listing line that is not shown. */
199 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
202 dotProduct = dotProductVector[0];
203 dotProduct += dotProductVector[1];
204 dotProduct += dotProductVector[2];
205 dotProduct += dotProductVector[3];
206 dotProduct += dotProductVector[4];
207 dotProduct += dotProductVector[5];
208 dotProduct += dotProductVector[6];
209 dotProduct += dotProductVector[7];
/* Scalar tail: finish the remaining num_points % 32 elements. */
211 number = thirtysecondPoints * 32;
212 for (; number < num_points; number++) {
213 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
216 *result = (short)dotProduct;
/* volk_32f_x2_dot_prod_16i_a_avx (fragment): aligned AVX dot product of
 * `input` and `taps`: 32 floats per vector iteration using separate multiply
 * and add, a scalar loop for the remainder, and the sum narrowed to a 16-bit
 * integer in *result.
 * NOTE(review): this listing drops some source lines (leading signature
 * lines, braces, the pointer-advance statements inside the vector loop, and
 * the declaration of dotProductVector are not shown). */
227 unsigned int num_points)
230 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
231 const unsigned int thirtysecondPoints = num_points / 32;
233 float dotProduct = 0;
234 const float* aPtr = input;
235 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps, c*: element-wise products. */
237 __m256 a0Val, a1Val, a2Val, a3Val;
238 __m256 b0Val, b1Val, b2Val, b3Val;
239 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
241 __m256 dotProdVal0 = _mm256_setzero_ps();
242 __m256 dotProdVal1 = _mm256_setzero_ps();
243 __m256 dotProdVal2 = _mm256_setzero_ps();
244 __m256 dotProdVal3 = _mm256_setzero_ps();
246 for (; number < thirtysecondPoints; number++) {
/* _mm256_load_ps is the aligned-load intrinsic, so both arrays are expected
 * to be 32-byte aligned. Four loads of 8 floats cover offsets 0..31. */
248 a0Val = _mm256_load_ps(aPtr);
249 a1Val = _mm256_load_ps(aPtr + 8);
250 a2Val = _mm256_load_ps(aPtr + 16);
251 a3Val = _mm256_load_ps(aPtr + 24);
252 b0Val = _mm256_load_ps(bPtr);
253 b1Val = _mm256_load_ps(bPtr + 8);
254 b2Val = _mm256_load_ps(bPtr + 16);
255 b3Val = _mm256_load_ps(bPtr + 24);
/* Element-wise products of the matching input/taps vectors. */
257 c0Val = _mm256_mul_ps(a0Val, b0Val);
258 c1Val = _mm256_mul_ps(a1Val, b1Val);
259 c2Val = _mm256_mul_ps(a2Val, b2Val);
260 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
262 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
263 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
264 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
265 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 32. */
/* Fold the four partial accumulators into dotProdVal0. */
271 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
272 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
273 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store of the vector into dotProductVector; the second argument
 * continues on a listing line that is not shown. */
277 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
280 dotProduct = dotProductVector[0];
281 dotProduct += dotProductVector[1];
282 dotProduct += dotProductVector[2];
283 dotProduct += dotProductVector[3];
284 dotProduct += dotProductVector[4];
285 dotProduct += dotProductVector[5];
286 dotProduct += dotProductVector[6];
287 dotProduct += dotProductVector[7];
/* Scalar tail: finish the remaining num_points % 32 elements. */
289 number = thirtysecondPoints * 32;
290 for (; number < num_points; number++) {
291 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
294 *result = (short)dotProduct;
299 #ifdef LV_HAVE_AVX512F
/* Aligned AVX-512F dot product of `input` and `taps`: 64 floats per vector
 * iteration accumulated with fused multiply-add, a scalar loop for the
 * remainder, and the sum narrowed to a 16-bit integer in *result.
 * NOTE(review): this listing drops some source lines (two signature lines,
 * braces, the pointer-advance statements inside the vector loop, and the
 * declaration of dotProductVector are not shown). */
301 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
304 unsigned int num_points)
307 unsigned int number = 0;
/* Number of complete 64-float groups handled by the vector loop. */
308 const unsigned int sixtyfourthPoints = num_points / 64;
310 float dotProduct = 0;
311 const float* aPtr = input;
312 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps. */
314 __m512 a0Val, a1Val, a2Val, a3Val;
315 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, all starting at zero. */
317 __m512 dotProdVal0 = _mm512_setzero_ps();
318 __m512 dotProdVal1 = _mm512_setzero_ps();
319 __m512 dotProdVal2 = _mm512_setzero_ps();
320 __m512 dotProdVal3 = _mm512_setzero_ps();
322 for (; number < sixtyfourthPoints; number++) {
/* _mm512_load_ps is the aligned-load intrinsic, so both arrays are expected
 * to be 64-byte aligned. Four loads of 16 floats cover offsets 0..63. */
324 a0Val = _mm512_load_ps(aPtr);
325 a1Val = _mm512_load_ps(aPtr + 16);
326 a2Val = _mm512_load_ps(aPtr + 32);
327 a3Val = _mm512_load_ps(aPtr + 48);
328 b0Val = _mm512_load_ps(bPtr);
329 b1Val = _mm512_load_ps(bPtr + 16);
330 b2Val = _mm512_load_ps(bPtr + 32);
331 b3Val = _mm512_load_ps(bPtr + 48);
/* accumulator = a * b + accumulator, one accumulator per load pair. */
333 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
334 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
335 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
336 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 64. */
/* Fold the four partial accumulators into dotProdVal0. */
342 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
343 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
344 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store of the vector into dotProductVector; the second argument
 * continues on a listing line that is not shown. */
348 _mm512_store_ps(dotProductVector,
/* Horizontal sum of the sixteen lanes. */
351 dotProduct = dotProductVector[0];
352 dotProduct += dotProductVector[1];
353 dotProduct += dotProductVector[2];
354 dotProduct += dotProductVector[3];
355 dotProduct += dotProductVector[4];
356 dotProduct += dotProductVector[5];
357 dotProduct += dotProductVector[6];
358 dotProduct += dotProductVector[7];
359 dotProduct += dotProductVector[8];
360 dotProduct += dotProductVector[9];
361 dotProduct += dotProductVector[10];
362 dotProduct += dotProductVector[11];
363 dotProduct += dotProductVector[12];
364 dotProduct += dotProductVector[13];
365 dotProduct += dotProductVector[14];
366 dotProduct += dotProductVector[15];
/* Scalar tail: finish the remaining num_points % 64 elements. */
368 number = sixtyfourthPoints * 64;
369 for (; number < num_points; number++) {
370 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
373 *result = (short)dotProduct;
/* volk_32f_x2_dot_prod_16i_u_sse (fragment): SSE dot product of `input` and
 * `taps`, processed in groups of 16 floats with a scalar loop for the
 * remainder; the sum is narrowed to a 16-bit integer and written to *result.
 * NOTE(review): this listing drops many source lines (leading signature
 * lines, braces, the accumulator declarations, and the whole body of the
 * vector loop are not shown), so the load intrinsics used cannot be confirmed
 * from this fragment. */
384 unsigned int num_points)
387 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
388 const unsigned int sixteenthPoints = num_points / 16;
390 float dotProduct = 0;
391 const float* aPtr = input;
392 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps; c* presumably hold their
 * element-wise products (the loop body is not shown, confirm in the source). */
394 __m128 a0Val, a1Val, a2Val, a3Val;
395 __m128 b0Val, b1Val, b2Val, b3Val;
396 __m128 c0Val, c1Val, c2Val, c3Val;
/* Vector loop: one pass per 16-float group (listing lines 404-427 omitted). */
403 for (; number < sixteenthPoints; number++) {
/* Fold the four partial accumulators into dotProdVal0. */
428 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
429 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
430 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Horizontal sum of the four lanes read back from dotProductVector (the
 * declaration of that buffer and the store into it are not shown). */
437 dotProduct = dotProductVector[0];
438 dotProduct += dotProductVector[1];
439 dotProduct += dotProductVector[2];
440 dotProduct += dotProductVector[3];
/* Scalar tail: finish the remaining num_points % 16 elements. */
442 number = sixteenthPoints * 16;
443 for (; number < num_points; number++) {
444 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
447 *result = (short)dotProduct;
453 #if LV_HAVE_AVX2 && LV_HAVE_FMA
/* Unaligned AVX2 + FMA dot product of `input` and `taps`: 32 floats per
 * vector iteration accumulated with fused multiply-add, a scalar loop for the
 * remainder, and the sum narrowed to a 16-bit integer in *result.
 * NOTE(review): this listing drops some source lines (two signature lines,
 * braces, the pointer-advance statements inside the vector loop, and the
 * declaration of dotProductVector are not shown). */
455 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
458 unsigned int num_points)
461 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
462 const unsigned int thirtysecondPoints = num_points / 32;
464 float dotProduct = 0;
465 const float* aPtr = input;
466 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps. */
468 __m256 a0Val, a1Val, a2Val, a3Val;
469 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
471 __m256 dotProdVal0 = _mm256_setzero_ps();
472 __m256 dotProdVal1 = _mm256_setzero_ps();
473 __m256 dotProdVal2 = _mm256_setzero_ps();
474 __m256 dotProdVal3 = _mm256_setzero_ps();
476 for (; number < thirtysecondPoints; number++) {
/* _mm256_loadu_ps is the unaligned-load intrinsic, so no alignment is
 * required of the arrays. Four loads of 8 floats cover offsets 0..31. */
478 a0Val = _mm256_loadu_ps(aPtr);
479 a1Val = _mm256_loadu_ps(aPtr + 8);
480 a2Val = _mm256_loadu_ps(aPtr + 16);
481 a3Val = _mm256_loadu_ps(aPtr + 24);
482 b0Val = _mm256_loadu_ps(bPtr);
483 b1Val = _mm256_loadu_ps(bPtr + 8);
484 b2Val = _mm256_loadu_ps(bPtr + 16);
485 b3Val = _mm256_loadu_ps(bPtr + 24);
/* accumulator = a * b + accumulator, one accumulator per load pair. */
487 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
488 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
489 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
490 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 32. */
/* Fold the four partial accumulators into dotProdVal0. */
496 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
497 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
498 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is the aligned store even though the kernel is the
 * unaligned variant; it is only safe if dotProductVector is declared with
 * 32-byte alignment. That declaration is not shown here; confirm. The second
 * argument continues on a listing line that is not shown. */
502 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
505 dotProduct = dotProductVector[0];
506 dotProduct += dotProductVector[1];
507 dotProduct += dotProductVector[2];
508 dotProduct += dotProductVector[3];
509 dotProduct += dotProductVector[4];
510 dotProduct += dotProductVector[5];
511 dotProduct += dotProductVector[6];
512 dotProduct += dotProductVector[7];
/* Scalar tail: finish the remaining num_points % 32 elements. */
514 number = thirtysecondPoints * 32;
515 for (; number < num_points; number++) {
516 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
519 *result = (short)dotProduct;
/* volk_32f_x2_dot_prod_16i_u_avx (fragment): unaligned AVX dot product of
 * `input` and `taps`: 32 floats per vector iteration using separate multiply
 * and add, a scalar loop for the remainder, and the sum narrowed to a 16-bit
 * integer in *result.
 * NOTE(review): this listing drops some source lines (leading signature
 * lines, braces, the pointer-advance statements inside the vector loop, and
 * the declaration of dotProductVector are not shown). */
530 unsigned int num_points)
533 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
534 const unsigned int thirtysecondPoints = num_points / 32;
536 float dotProduct = 0;
537 const float* aPtr = input;
538 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps, c*: element-wise products. */
540 __m256 a0Val, a1Val, a2Val, a3Val;
541 __m256 b0Val, b1Val, b2Val, b3Val;
542 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
544 __m256 dotProdVal0 = _mm256_setzero_ps();
545 __m256 dotProdVal1 = _mm256_setzero_ps();
546 __m256 dotProdVal2 = _mm256_setzero_ps();
547 __m256 dotProdVal3 = _mm256_setzero_ps();
549 for (; number < thirtysecondPoints; number++) {
/* _mm256_loadu_ps is the unaligned-load intrinsic, so no alignment is
 * required of the arrays. Four loads of 8 floats cover offsets 0..31. */
551 a0Val = _mm256_loadu_ps(aPtr);
552 a1Val = _mm256_loadu_ps(aPtr + 8);
553 a2Val = _mm256_loadu_ps(aPtr + 16);
554 a3Val = _mm256_loadu_ps(aPtr + 24);
555 b0Val = _mm256_loadu_ps(bPtr);
556 b1Val = _mm256_loadu_ps(bPtr + 8);
557 b2Val = _mm256_loadu_ps(bPtr + 16);
558 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Element-wise products of the matching input/taps vectors. */
560 c0Val = _mm256_mul_ps(a0Val, b0Val);
561 c1Val = _mm256_mul_ps(a1Val, b1Val);
562 c2Val = _mm256_mul_ps(a2Val, b2Val);
563 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
565 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
566 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
567 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
568 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 32. */
/* Fold the four partial accumulators into dotProdVal0. */
574 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
575 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
576 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is the aligned store even though the kernel is the
 * unaligned variant; it is only safe if dotProductVector is declared with
 * 32-byte alignment. That declaration is not shown here; confirm. The second
 * argument continues on a listing line that is not shown. */
580 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
583 dotProduct = dotProductVector[0];
584 dotProduct += dotProductVector[1];
585 dotProduct += dotProductVector[2];
586 dotProduct += dotProductVector[3];
587 dotProduct += dotProductVector[4];
588 dotProduct += dotProductVector[5];
589 dotProduct += dotProductVector[6];
590 dotProduct += dotProductVector[7];
/* Scalar tail: finish the remaining num_points % 32 elements. */
592 number = thirtysecondPoints * 32;
593 for (; number < num_points; number++) {
594 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
597 *result = (short)dotProduct;
602 #ifdef LV_HAVE_AVX512F
/* Unaligned AVX-512F dot product of `input` and `taps`: 64 floats per vector
 * iteration accumulated with fused multiply-add, a scalar loop for the
 * remainder, and the sum narrowed to a 16-bit integer in *result.
 * NOTE(review): this listing drops some source lines (two signature lines,
 * braces, the pointer-advance statements inside the vector loop, and the
 * declaration of dotProductVector are not shown). */
604 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
607 unsigned int num_points)
610 unsigned int number = 0;
/* Number of complete 64-float groups handled by the vector loop. */
611 const unsigned int sixtyfourthPoints = num_points / 64;
613 float dotProduct = 0;
614 const float* aPtr = input;
615 const float* bPtr = taps;
/* a*: vectors from input, b*: vectors from taps. */
617 __m512 a0Val, a1Val, a2Val, a3Val;
618 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, all starting at zero. */
620 __m512 dotProdVal0 = _mm512_setzero_ps();
621 __m512 dotProdVal1 = _mm512_setzero_ps();
622 __m512 dotProdVal2 = _mm512_setzero_ps();
623 __m512 dotProdVal3 = _mm512_setzero_ps();
625 for (; number < sixtyfourthPoints; number++) {
/* _mm512_loadu_ps is the unaligned-load intrinsic, so no alignment is
 * required of the arrays. Four loads of 16 floats cover offsets 0..63. */
627 a0Val = _mm512_loadu_ps(aPtr);
628 a1Val = _mm512_loadu_ps(aPtr + 16);
629 a2Val = _mm512_loadu_ps(aPtr + 32);
630 a3Val = _mm512_loadu_ps(aPtr + 48);
631 b0Val = _mm512_loadu_ps(bPtr);
632 b1Val = _mm512_loadu_ps(bPtr + 16);
633 b2Val = _mm512_loadu_ps(bPtr + 32);
634 b3Val = _mm512_loadu_ps(bPtr + 48);
/* accumulator = a * b + accumulator, one accumulator per load pair. */
636 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
637 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
638 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
639 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* NOTE(review): the statements advancing aPtr/bPtr and the closing brace of
 * the loop are not shown here; presumably each cursor moves by 64. */
/* Fold the four partial accumulators into dotProdVal0. */
645 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
646 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
647 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/* Unaligned store of the vector into dotProductVector; the second argument
 * continues on a listing line that is not shown. */
651 _mm512_storeu_ps(dotProductVector,
/* Horizontal sum of the sixteen lanes. */
654 dotProduct = dotProductVector[0];
655 dotProduct += dotProductVector[1];
656 dotProduct += dotProductVector[2];
657 dotProduct += dotProductVector[3];
658 dotProduct += dotProductVector[4];
659 dotProduct += dotProductVector[5];
660 dotProduct += dotProductVector[6];
661 dotProduct += dotProductVector[7];
662 dotProduct += dotProductVector[8];
663 dotProduct += dotProductVector[9];
664 dotProduct += dotProductVector[10];
665 dotProduct += dotProductVector[11];
666 dotProduct += dotProductVector[12];
667 dotProduct += dotProductVector[13];
668 dotProduct += dotProductVector[14];
669 dotProduct += dotProductVector[15];
/* Scalar tail: finish the remaining num_points % 64 elements. */
671 number = sixtyfourthPoints * 64;
672 for (; number < num_points; number++) {
673 dotProduct += ((*aPtr++) * (*bPtr++));
/* Float-to-integer conversion discards the fractional part.
 * NOTE(review): no clamping is visible; a sum outside the range of short is
 * undefined behavior for this cast in C. */
676 *result = (short)dotProduct;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:381
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:55
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:224
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:527
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:78
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65