45 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
52 #ifdef LV_HAVE_GENERIC
/*
 * Portable C kernel (fragment): sums taps[i] * (float)input[i] over
 * num_points points and writes the total to *result.
 *
 * NOTE(review): this listing is lossy. The start of the signature, the
 * declarations of i and acc0..acc3, and the closing braces are not visible
 * here; the accumulators are presumably zero-initialised -- confirm against
 * the full file.
 */
57 unsigned int num_points)
60 static const int N_UNROLL = 4;
/* n is num_points rounded down to a multiple of N_UNROLL. */
68 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
/* Main loop: N_UNROLL points per iteration, one accumulator per lane so the
 * four running sums do not depend on each other. */
70 for (
i = 0;
i < n;
i += N_UNROLL) {
71 acc0 += taps[
i + 0] * (float)input[
i + 0];
72 acc1 += taps[
i + 1] * (float)input[
i + 1];
73 acc2 += taps[
i + 2] * (float)input[
i + 2];
74 acc3 += taps[
i + 3] * (float)input[
i + 3];
/* Tail loop: continues from i == n and folds the leftover points
 * (fewer than N_UNROLL of them) into acc0. */
77 for (;
i < num_points;
i++) {
78 acc0 += taps[
i] * (float)input[
i];
/* Combine the four partial sums into the final result. */
81 *result = acc0 + acc1 + acc2 + acc3;
/*
 * ARM NEON kernel (fragment): accumulates input[i] (converted to float)
 * times taps[i], four points per vector iteration, and writes the sum to
 * *result.
 *
 * NOTE(review): this listing is lossy. The start of the signature, the
 * declarations of ii, tapsPtr, input16, input32 and accumulator_vec, the
 * per-iteration pointer advances inside the vector loop, and the closing
 * braces are not visible here.
 */
91 unsigned int num_points)
/* Number of full 4-point groups handled by the vector loop. */
95 unsigned quarter_points = num_points / 4;
/* The cast drops the const qualifier of input; the visible code only reads
 * through inputPtr. */
97 short* inputPtr = (
short*)input;
100 float32x4x2_t tapsVal, accumulator_val;
103 float32x4_t input_float, prod_re, prod_im;
/* Zero both running sums: val[0] is paired with prod_re, val[1] with
 * prod_im below. */
105 accumulator_val.val[0] = vdupq_n_f32(0.0);
106 accumulator_val.val[1] = vdupq_n_f32(0.0);
108 for (ii = 0; ii < quarter_points; ++ii) {
/* vld2q_f32 reads 8 floats and de-interleaves them: even-indexed floats go
 * to val[0], odd-indexed floats to val[1]. */
109 tapsVal = vld2q_f32((
float*)tapsPtr);
/* Load four 16-bit samples, widen them to 32-bit integers, then convert to
 * float. */
110 input16 = vld1_s16(inputPtr);
112 input32 = vmovl_s16(input16);
114 input_float = vcvtq_f32_s32(input32);
/* Scale both de-interleaved tap halves by the same four input samples. */
116 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
117 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
119 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
120 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
/* vst2q_f32 writes the two accumulator vectors back interleaved
 * (val[0][k], val[1][k], ...). NOTE(review): accumulator_vec is presumably
 * an array of four complex values, one per vector lane -- confirm. */
125 vst2q_f32((
float*)accumulator_vec, accumulator_val);
/* Horizontal reduction of the four lane sums into accumulator_vec[0]. */
126 accumulator_vec[0] += accumulator_vec[1];
127 accumulator_vec[2] += accumulator_vec[3];
128 accumulator_vec[0] += accumulator_vec[2];
/* Tail: process the points left over after the last full group of four,
 * advancing both pointers by one element per point. */
130 for (ii = quarter_points * 4; ii < num_points; ++ii) {
131 accumulator_vec[0] += *(tapsPtr++) * (
float)(*(inputPtr++));
134 *result = accumulator_vec[0];
139 #if LV_HAVE_SSE && LV_HAVE_MMX
/*
 * SSE/MMX kernel (fragment) for the 16-bit-input by complex-float-taps dot
 * product; handles eight points per vector iteration plus a scalar tail.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1 and dotProdVal0..3, the
 * convert/multiply/accumulate part of the loop body, the pointer advances,
 * the store into dotProductVector and the final write to *result are not
 * visible here.
 */
141 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(
lv_32fc_t* result,
144 unsigned int num_points)
147 unsigned int number = 0;
/* Despite its name this is the number of full 8-point groups
 * (num_points / 8); it is multiplied back by 8 before the tail loop. */
148 const unsigned int sixteenthPoints = num_points / 8;
/* realpt/imagpt address the first two elements of res, which is declared
 * outside the visible lines. */
151 float *realpt = &res[0], *imagpt = &res[1];
152 const short* aPtr = input;
/* View the complex taps as a flat float array: two floats per tap. */
153 const float* bPtr = (
float*)taps;
157 __m128 a0Val, a1Val, a2Val, a3Val;
158 __m128 b0Val, b1Val, b2Val, b3Val;
159 __m128 c0Val, c1Val, c2Val, c3Val;
166 for (; number < sixteenthPoints; number++) {
/* _mm_set_pi16 takes its arguments from the highest lane to the lowest, so
 * aPtr[0] ends up in lane 0 of m0 and aPtr[4] in lane 0 of m1. */
168 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
169 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
/* Fold the four vector accumulators into dotProdVal0. */
201 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
202 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
203 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
210 *realpt = dotProductVector[0];
211 *imagpt = dotProductVector[1];
212 *realpt += dotProductVector[2];
213 *imagpt += dotProductVector[3];
/* Scalar tail for the points left after the last full group of eight. Each
 * point consumes one short from aPtr and two floats from bPtr.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
215 number = sixteenthPoints * 8;
216 for (; number < num_points; number++) {
217 *realpt += ((*aPtr) * (*bPtr++));
218 *imagpt += ((*aPtr++) * (*bPtr++));
227 #if LV_HAVE_AVX2 && LV_HAVE_FMA
/*
 * AVX2 + FMA kernel (fragment) for the 16-bit-input by complex-float-taps
 * dot product; handles sixteen points per vector iteration plus a scalar
 * tail. Taps are read with _mm256_loadu_ps, which places no alignment
 * requirement on the address.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1, f0 and f1, the loads that
 * fill m0/m1, the pointer advances, and the final write to *result are not
 * visible here.
 */
229 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
232 unsigned int num_points)
235 unsigned int number = 0;
/* Number of full 16-point groups handled by the vector loop. */
236 const unsigned int sixteenthPoints = num_points / 16;
239 float *realpt = &res[0], *imagpt = &res[1];
240 const short* aPtr = input;
/* View the complex taps as a flat float array: two floats per tap. */
241 const float* bPtr = (
float*)taps;
245 __m256 g0, g1, h0, h1, h2, h3;
246 __m256 a0Val, a1Val, a2Val, a3Val;
247 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent vector accumulators, all starting at zero. */
249 __m256 dotProdVal0 = _mm256_setzero_ps();
250 __m256 dotProdVal1 = _mm256_setzero_ps();
251 __m256 dotProdVal2 = _mm256_setzero_ps();
252 __m256 dotProdVal3 = _mm256_setzero_ps();
254 for (; number < sixteenthPoints; number++) {
/* Sign-extend eight 16-bit values to 32-bit integers, then convert them to
 * float (once for m0, once for m1). */
259 f0 = _mm256_cvtepi16_epi32(m0);
260 g0 = _mm256_cvtepi32_ps(f0);
261 f1 = _mm256_cvtepi16_epi32(m1);
262 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a register with itself duplicates each element, but only
 * within each 128-bit half... */
264 h0 = _mm256_unpacklo_ps(g0, g0);
265 h1 = _mm256_unpackhi_ps(g0, g0);
266 h2 = _mm256_unpacklo_ps(g1, g1);
267 h3 = _mm256_unpackhi_ps(g1, g1);
/* ...so the halves are recombined (0x20 = low halves, 0x31 = high halves)
 * to restore sample order: a0Val = x0,x0,x1,x1,x2,x2,x3,x3 and a1Val
 * continues with x4..x7. Each sample now appears twice, matching the two
 * floats of the corresponding tap. */
269 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
270 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
271 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
272 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 4 x 8 floats = 32 floats = 16 taps. */
274 b0Val = _mm256_loadu_ps(bPtr);
275 b1Val = _mm256_loadu_ps(bPtr + 8);
276 b2Val = _mm256_loadu_ps(bPtr + 16);
277 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Fused multiply-add: accumulator = a * b + accumulator. */
279 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
280 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
281 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
282 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four vector accumulators into dotProdVal0. */
288 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
289 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
290 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store is cut off in the
 * listing; presumably dotProdVal0. _mm256_store_ps needs a 32-byte aligned
 * destination. */
294 _mm256_store_ps(dotProductVector,
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
297 *realpt = dotProductVector[0];
298 *imagpt = dotProductVector[1];
299 *realpt += dotProductVector[2];
300 *imagpt += dotProductVector[3];
301 *realpt += dotProductVector[4];
302 *imagpt += dotProductVector[5];
303 *realpt += dotProductVector[6];
304 *imagpt += dotProductVector[7];
/* Scalar tail for the points left after the last full group of sixteen.
 * Each point consumes one short from aPtr and two floats from bPtr.
 * NOTE(review): relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
306 number = sixteenthPoints * 16;
307 for (; number < num_points; number++) {
308 *realpt += ((*aPtr) * (*bPtr++));
309 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 kernel (fragment) for the 16-bit-input by complex-float-taps dot
 * product, using separate multiply and add instead of FMA. Sixteen points
 * per vector iteration plus a scalar tail; taps are read with
 * _mm256_loadu_ps, so no alignment is required of the taps address.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1, f0 and f1, the loads that
 * fill m0/m1, the pointer advances, and the final write to *result are not
 * visible here.
 */
320 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(
lv_32fc_t* result,
323 unsigned int num_points)
326 unsigned int number = 0;
/* Number of full 16-point groups handled by the vector loop. */
327 const unsigned int sixteenthPoints = num_points / 16;
330 float *realpt = &res[0], *imagpt = &res[1];
331 const short* aPtr = input;
/* Taps are walked as plain floats, two per complex tap. */
332 const float* bPtr = (
float*)taps;
336 __m256 g0, g1, h0, h1, h2, h3;
337 __m256 a0Val, a1Val, a2Val, a3Val;
338 __m256 b0Val, b1Val, b2Val, b3Val;
339 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent vector accumulators, all starting at zero. */
341 __m256 dotProdVal0 = _mm256_setzero_ps();
342 __m256 dotProdVal1 = _mm256_setzero_ps();
343 __m256 dotProdVal2 = _mm256_setzero_ps();
344 __m256 dotProdVal3 = _mm256_setzero_ps();
346 for (; number < sixteenthPoints; number++) {
/* int16 -> int32 (sign-extended) -> float, eight samples per register. */
351 f0 = _mm256_cvtepi16_epi32(m0);
352 g0 = _mm256_cvtepi32_ps(f0);
353 f1 = _mm256_cvtepi16_epi32(m1);
354 g1 = _mm256_cvtepi32_ps(f1);
/* Duplicate every sample (the unpack works per 128-bit half)... */
356 h0 = _mm256_unpacklo_ps(g0, g0);
357 h1 = _mm256_unpackhi_ps(g0, g0);
358 h2 = _mm256_unpacklo_ps(g1, g1);
359 h3 = _mm256_unpackhi_ps(g1, g1);
/* ...then regroup the halves (0x20 = both low halves, 0x31 = both high
 * halves) so the duplicated samples are back in sequential order and line
 * up with the float pairs of the taps. */
361 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
362 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
363 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
364 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 consecutive floats, i.e. 16 taps. */
366 b0Val = _mm256_loadu_ps(bPtr);
367 b1Val = _mm256_loadu_ps(bPtr + 8);
368 b2Val = _mm256_loadu_ps(bPtr + 16);
369 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Element-wise products of duplicated samples and tap floats. */
371 c0Val = _mm256_mul_ps(a0Val, b0Val);
372 c1Val = _mm256_mul_ps(a1Val, b1Val);
373 c2Val = _mm256_mul_ps(a2Val, b2Val);
374 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products to the running sums. */
376 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
377 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
378 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
379 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four vector accumulators into dotProdVal0. */
385 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
386 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
387 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store is cut off in the
 * listing; presumably dotProdVal0. _mm256_store_ps needs a 32-byte aligned
 * destination. */
391 _mm256_store_ps(dotProductVector,
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
394 *realpt = dotProductVector[0];
395 *imagpt = dotProductVector[1];
396 *realpt += dotProductVector[2];
397 *imagpt += dotProductVector[3];
398 *realpt += dotProductVector[4];
399 *imagpt += dotProductVector[5];
400 *realpt += dotProductVector[6];
401 *imagpt += dotProductVector[7];
/* Scalar tail for the points left after the last full group of sixteen;
 * one short and two floats are consumed per point.
 * NOTE(review): relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
403 number = sixteenthPoints * 16;
404 for (; number < num_points; number++) {
405 *realpt += ((*aPtr) * (*bPtr++));
406 *imagpt += ((*aPtr++) * (*bPtr++));
415 #if LV_HAVE_SSE && LV_HAVE_MMX
/*
 * SSE/MMX kernel (fragment), "_a_" variant, for the 16-bit-input by
 * complex-float-taps dot product; eight points per vector iteration plus a
 * scalar tail.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1 and dotProdVal0..3, the
 * convert/load/multiply/accumulate part of the loop body, the pointer
 * advances, the store into dotProductVector and the final write to *result
 * are not visible here, so any alignment requirement on the taps cannot be
 * confirmed from this view.
 */
418 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
421 unsigned int num_points)
424 unsigned int number = 0;
/* Misleading name: this counts full groups of 8 points (num_points / 8),
 * and is scaled back by 8 before the tail loop. */
425 const unsigned int sixteenthPoints = num_points / 8;
/* realpt/imagpt address the first two elements of res, which is declared
 * outside the visible lines. */
428 float *realpt = &res[0], *imagpt = &res[1];
429 const short* aPtr = input;
/* Taps are walked as plain floats, two per complex tap. */
430 const float* bPtr = (
float*)taps;
434 __m128 a0Val, a1Val, a2Val, a3Val;
435 __m128 b0Val, b1Val, b2Val, b3Val;
436 __m128 c0Val, c1Val, c2Val, c3Val;
443 for (; number < sixteenthPoints; number++) {
/* Pack eight consecutive samples into two MMX registers. _mm_set_pi16
 * lists lanes from highest to lowest, so the lowest-addressed sample lands
 * in lane 0. */
445 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
446 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
/* Fold the four vector accumulators into dotProdVal0. */
478 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
479 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
480 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
487 *realpt = dotProductVector[0];
488 *imagpt = dotProductVector[1];
489 *realpt += dotProductVector[2];
490 *imagpt += dotProductVector[3];
/* Scalar tail for the points left after the last full group of eight; one
 * short and two floats are consumed per point.
 * NOTE(review): relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
492 number = sixteenthPoints * 8;
493 for (; number < num_points; number++) {
494 *realpt += ((*aPtr) * (*bPtr++));
495 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 kernel (fragment), aligned-taps variant, for the 16-bit-input by
 * complex-float-taps dot product, using separate multiply and add. Sixteen
 * points per vector iteration plus a scalar tail. Taps are read with
 * _mm256_load_ps, which requires each load address to be 32-byte aligned.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1, f0 and f1, the loads that
 * fill m0/m1, the pointer advances, and the final write to *result are not
 * visible here.
 */
505 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(
lv_32fc_t* result,
508 unsigned int num_points)
511 unsigned int number = 0;
/* Number of full 16-point groups handled by the vector loop. */
512 const unsigned int sixteenthPoints = num_points / 16;
515 float *realpt = &res[0], *imagpt = &res[1];
516 const short* aPtr = input;
/* Taps are walked as plain floats, two per complex tap. */
517 const float* bPtr = (
float*)taps;
521 __m256 g0, g1, h0, h1, h2, h3;
522 __m256 a0Val, a1Val, a2Val, a3Val;
523 __m256 b0Val, b1Val, b2Val, b3Val;
524 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent vector accumulators, all starting at zero. */
526 __m256 dotProdVal0 = _mm256_setzero_ps();
527 __m256 dotProdVal1 = _mm256_setzero_ps();
528 __m256 dotProdVal2 = _mm256_setzero_ps();
529 __m256 dotProdVal3 = _mm256_setzero_ps();
531 for (; number < sixteenthPoints; number++) {
/* int16 -> int32 (sign-extended) -> float, eight samples per register. */
536 f0 = _mm256_cvtepi16_epi32(m0);
537 g0 = _mm256_cvtepi32_ps(f0);
538 f1 = _mm256_cvtepi16_epi32(m1);
539 g1 = _mm256_cvtepi32_ps(f1);
/* Self-unpack doubles each sample inside each 128-bit half. */
541 h0 = _mm256_unpacklo_ps(g0, g0);
542 h1 = _mm256_unpackhi_ps(g0, g0);
543 h2 = _mm256_unpacklo_ps(g1, g1);
544 h3 = _mm256_unpackhi_ps(g1, g1);
/* Cross-half permute puts the doubled samples back in order: 0x20 joins
 * the two low halves (samples 0..3 of the register), 0x31 the two high
 * halves (samples 4..7). */
546 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
547 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
548 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
549 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Aligned loads of 32 consecutive floats (16 taps). */
551 b0Val = _mm256_load_ps(bPtr);
552 b1Val = _mm256_load_ps(bPtr + 8);
553 b2Val = _mm256_load_ps(bPtr + 16);
554 b3Val = _mm256_load_ps(bPtr + 24);
/* Element-wise products of doubled samples and tap floats. */
556 c0Val = _mm256_mul_ps(a0Val, b0Val);
557 c1Val = _mm256_mul_ps(a1Val, b1Val);
558 c2Val = _mm256_mul_ps(a2Val, b2Val);
559 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products to the running sums. */
561 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
562 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
563 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
564 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four vector accumulators into dotProdVal0. */
570 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
571 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
572 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store is cut off in the
 * listing; presumably dotProdVal0. */
576 _mm256_store_ps(dotProductVector,
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
579 *realpt = dotProductVector[0];
580 *imagpt = dotProductVector[1];
581 *realpt += dotProductVector[2];
582 *imagpt += dotProductVector[3];
583 *realpt += dotProductVector[4];
584 *imagpt += dotProductVector[5];
585 *realpt += dotProductVector[6];
586 *imagpt += dotProductVector[7];
/* Scalar tail for the points left after the last full group of sixteen;
 * one short and two floats are consumed per point.
 * NOTE(review): relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
588 number = sixteenthPoints * 16;
589 for (; number < num_points; number++) {
590 *realpt += ((*aPtr) * (*bPtr++));
591 *imagpt += ((*aPtr++) * (*bPtr++));
600 #if LV_HAVE_AVX2 && LV_HAVE_FMA
/*
 * AVX2 + FMA kernel (fragment), aligned-taps variant, for the 16-bit-input
 * by complex-float-taps dot product. Sixteen points per vector iteration
 * plus a scalar tail. Taps are read with _mm256_load_ps, which requires
 * each load address to be 32-byte aligned.
 *
 * NOTE(review): this listing is lossy. The input/taps parameters, the
 * declarations of res, dotProductVector, m0, m1, f0 and f1, the loads that
 * fill m0/m1, the pointer advances, and the final write to *result are not
 * visible here.
 */
602 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
605 unsigned int num_points)
608 unsigned int number = 0;
/* Number of full 16-point groups handled by the vector loop. */
609 const unsigned int sixteenthPoints = num_points / 16;
612 float *realpt = &res[0], *imagpt = &res[1];
613 const short* aPtr = input;
/* Taps are walked as plain floats, two per complex tap. */
614 const float* bPtr = (
float*)taps;
618 __m256 g0, g1, h0, h1, h2, h3;
619 __m256 a0Val, a1Val, a2Val, a3Val;
620 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent vector accumulators, all starting at zero. */
622 __m256 dotProdVal0 = _mm256_setzero_ps();
623 __m256 dotProdVal1 = _mm256_setzero_ps();
624 __m256 dotProdVal2 = _mm256_setzero_ps();
625 __m256 dotProdVal3 = _mm256_setzero_ps();
627 for (; number < sixteenthPoints; number++) {
/* int16 -> int32 (sign-extended) -> float, eight samples per register. */
632 f0 = _mm256_cvtepi16_epi32(m0);
633 g0 = _mm256_cvtepi32_ps(f0);
634 f1 = _mm256_cvtepi16_epi32(m1);
635 g1 = _mm256_cvtepi32_ps(f1);
/* Self-unpack doubles each sample inside each 128-bit half. */
637 h0 = _mm256_unpacklo_ps(g0, g0);
638 h1 = _mm256_unpackhi_ps(g0, g0);
639 h2 = _mm256_unpacklo_ps(g1, g1);
640 h3 = _mm256_unpackhi_ps(g1, g1);
/* Cross-half permute restores sample order (0x20 = low halves, 0x31 = high
 * halves), giving x0,x0,x1,x1,... so every sample faces both floats of its
 * tap. */
642 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
643 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
644 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
645 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* Aligned loads of 32 consecutive floats (16 taps). */
647 b0Val = _mm256_load_ps(bPtr);
648 b1Val = _mm256_load_ps(bPtr + 8);
649 b2Val = _mm256_load_ps(bPtr + 16);
650 b3Val = _mm256_load_ps(bPtr + 24);
/* Fused multiply-add: accumulator = a * b + accumulator. */
652 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
653 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
654 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
655 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four vector accumulators into dotProdVal0. */
661 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
662 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
663 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): the second argument of this store is cut off in the
 * listing; presumably dotProdVal0. */
667 _mm256_store_ps(dotProductVector,
/* Even-indexed lanes are summed into *realpt, odd-indexed lanes into
 * *imagpt. */
670 *realpt = dotProductVector[0];
671 *imagpt = dotProductVector[1];
672 *realpt += dotProductVector[2];
673 *imagpt += dotProductVector[3];
674 *realpt += dotProductVector[4];
675 *imagpt += dotProductVector[5];
676 *realpt += dotProductVector[6];
677 *imagpt += dotProductVector[7];
/* Scalar tail for the points left after the last full group of sixteen;
 * one short and two floats are consumed per point.
 * NOTE(review): relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing. */
679 number = sixteenthPoints * 16;
680 for (; number < num_points; number++) {
681 *realpt += ((*aPtr) * (*bPtr++));
682 *imagpt += ((*aPtr++) * (*bPtr++));
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
Definition: sse2neon.h:1459
FORCE_INLINE void _mm_empty(void)
Definition: sse2neon.h:1027
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
int64x1_t __m64
Definition: sse2neon.h:234
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:88
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:54
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13