50 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
56 #ifdef LV_HAVE_GENERIC
/* Portable scalar kernel (volk_32fc_32f_dot_prod_32fc_generic according to the
 * cross-reference index at the end of this listing): dot product of a complex
 * float vector `input` with a real float vector `taps` over `num_points` points.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the function header, the declaration/initialisation of `res` and
 * the final store to `result` are not visible here. */
61 unsigned int num_points)
/* realpt/imagpt alias the two slots of the local accumulator `res`. */
65 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
66 const float* aPtr = (
float*)input;
67 const float* bPtr = taps;
68 unsigned int number = 0;
73 for (number = 0; number < num_points; number++) {
/* First float of the pair times the tap; the tap pointer is not advanced yet. */
74 *realpt += ((*aPtr++) * (*bPtr));
/* Second float of the pair times the same tap, then move on to the next tap. */
75 *imagpt += ((*aPtr++) * (*bPtr++));
83 #if LV_HAVE_AVX2 && LV_HAVE_FMA
85 #include <immintrin.h>
/* AVX2+FMA kernel using aligned loads: dot product of the complex float vector
 * `input` with the real float vector `taps`.
 * Both `input` and `taps` are read with _mm256_load_ps, which requires 32-byte
 * aligned addresses.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the `input`/`taps` parameters, the declarations of `res` and
 * `dotProductVector`, the per-iteration pointer advance and the final store to
 * `result` are not visible here. */
87 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
90 unsigned int num_points)
93 unsigned int number = 0;
/* Number of vector iterations; each one covers 16 points. */
94 const unsigned int sixteenthPoints = num_points / 16;
97 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
98 const float* aPtr = (
float*)input;
99 const float* bPtr = taps;
101 __m256 a0Val, a1Val, a2Val, a3Val;
102 __m256 b0Val, b1Val, b2Val, b3Val;
103 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, summed together after the loop. */
105 __m256 dotProdVal0 = _mm256_setzero_ps();
106 __m256 dotProdVal1 = _mm256_setzero_ps();
107 __m256 dotProdVal2 = _mm256_setzero_ps();
108 __m256 dotProdVal3 = _mm256_setzero_ps();
110 for (; number < sixteenthPoints; number++) {
/* 32 input floats = 16 interleaved complex points. */
112 a0Val = _mm256_load_ps(aPtr);
113 a1Val = _mm256_load_ps(aPtr + 8);
114 a2Val = _mm256_load_ps(aPtr + 16);
115 a3Val = _mm256_load_ps(aPtr + 24);
/* 16 taps, one per point. */
117 x0Val = _mm256_load_ps(bPtr);
118 x1Val = _mm256_load_ps(bPtr + 8);
/* Unpacking a register with itself duplicates each tap inside its 128-bit lane. */
119 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
120 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
121 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
122 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 gathers the low lanes and 0x31 the high lanes, so b0..b3 hold taps
 * 0-3, 4-7, 8-11 and 12-15, each repeated twice to match the two floats of
 * the corresponding complex point. */
125 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
126 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
127 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
128 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Fused multiply-add: acc = a * b + acc. */
130 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
131 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
132 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
133 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
139 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store; the second argument is on a line omitted from this listing. */
145 _mm256_store_ps(dotProductVector,
/* Horizontal reduction: even slots go to the first accumulator, odd slots to the second. */
148 *realpt = dotProductVector[0];
149 *imagpt = dotProductVector[1];
150 *realpt += dotProductVector[2];
151 *imagpt += dotProductVector[3];
152 *realpt += dotProductVector[4];
153 *imagpt += dotProductVector[5];
154 *realpt += dotProductVector[6];
155 *imagpt += dotProductVector[7];
/* Scalar tail for the points left over after the 16-point vector iterations. */
157 number = sixteenthPoints * 16;
158 for (; number < num_points; number++) {
159 *realpt += ((*aPtr++) * (*bPtr));
160 *imagpt += ((*aPtr++) * (*bPtr++));
170 #include <immintrin.h>
/* AVX kernel using aligned loads (volk_32fc_32f_dot_prod_32fc_a_avx according to
 * the cross-reference index at the end of this listing): dot product of the
 * complex float vector `input` with the real float vector `taps`.
 * Uses a separate multiply and add where the FMA kernel uses one fused
 * instruction. Both inputs are read with _mm256_load_ps, which requires
 * 32-byte aligned addresses.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the function header, the declarations of `res` and
 * `dotProductVector`, the per-iteration pointer advance and the final store to
 * `result` are not visible here. */
175 unsigned int num_points)
178 unsigned int number = 0;
/* Number of vector iterations; each one covers 16 points. */
179 const unsigned int sixteenthPoints = num_points / 16;
182 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
183 const float* aPtr = (
float*)input;
184 const float* bPtr = taps;
186 __m256 a0Val, a1Val, a2Val, a3Val;
187 __m256 b0Val, b1Val, b2Val, b3Val;
188 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
189 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, summed together after the loop. */
191 __m256 dotProdVal0 = _mm256_setzero_ps();
192 __m256 dotProdVal1 = _mm256_setzero_ps();
193 __m256 dotProdVal2 = _mm256_setzero_ps();
194 __m256 dotProdVal3 = _mm256_setzero_ps();
196 for (; number < sixteenthPoints; number++) {
/* 32 input floats = 16 interleaved complex points. */
198 a0Val = _mm256_load_ps(aPtr);
199 a1Val = _mm256_load_ps(aPtr + 8);
200 a2Val = _mm256_load_ps(aPtr + 16);
201 a3Val = _mm256_load_ps(aPtr + 24);
/* 16 taps, one per point. */
203 x0Val = _mm256_load_ps(bPtr);
204 x1Val = _mm256_load_ps(bPtr + 8);
/* Unpacking a register with itself duplicates each tap inside its 128-bit lane. */
205 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
206 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
207 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
208 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 gathers the low lanes and 0x31 the high lanes, so b0..b3 hold taps
 * 0-3, 4-7, 8-11 and 12-15, each repeated twice. */
211 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
212 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
213 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
214 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and duplicated taps. */
216 c0Val = _mm256_mul_ps(a0Val, b0Val);
217 c1Val = _mm256_mul_ps(a1Val, b1Val);
218 c2Val = _mm256_mul_ps(a2Val, b2Val);
219 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
221 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
222 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
223 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
224 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
230 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
231 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
232 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store; the second argument is on a line omitted from this listing. */
236 _mm256_store_ps(dotProductVector,
/* Horizontal reduction: even slots go to the first accumulator, odd slots to the second. */
239 *realpt = dotProductVector[0];
240 *imagpt = dotProductVector[1];
241 *realpt += dotProductVector[2];
242 *imagpt += dotProductVector[3];
243 *realpt += dotProductVector[4];
244 *imagpt += dotProductVector[5];
245 *realpt += dotProductVector[6];
246 *imagpt += dotProductVector[7];
/* Scalar tail for the points left over after the 16-point vector iterations. */
248 number = sixteenthPoints * 16;
249 for (; number < num_points; number++) {
250 *realpt += ((*aPtr++) * (*bPtr));
251 *imagpt += ((*aPtr++) * (*bPtr++));
/* SSE kernel (volk_32fc_32f_dot_prod_32fc_a_sse according to the
 * cross-reference index at the end of this listing): dot product of the
 * complex float vector `input` with the real float vector `taps`.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the function header, the accumulator declarations, the entire
 * vector loop body, the store into `dotProductVector` and the final store to
 * `result` are not visible here, so the load alignment cannot be confirmed
 * from this view. */
266 unsigned int num_points)
269 unsigned int number = 0;
/* Despite its name this counts groups of 8 points (num_points / 8); the tail
 * loop below restarts at sixteenthPoints * 8 accordingly. */
270 const unsigned int sixteenthPoints = num_points / 8;
273 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
274 const float* aPtr = (
float*)input;
275 const float* bPtr = taps;
277 __m128 a0Val, a1Val, a2Val, a3Val;
278 __m128 b0Val, b1Val, b2Val, b3Val;
279 __m128 x0Val, x1Val, x2Val, x3Val;
280 __m128 c0Val, c1Val, c2Val, c3Val;
287 for (; number < sixteenthPoints; number++) {
/* Fold the four accumulators into dotProdVal0. */
317 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
318 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
319 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Horizontal reduction of the 4-float result: even slots go to the first
 * accumulator, odd slots to the second. */
326 *realpt = dotProductVector[0];
327 *imagpt = dotProductVector[1];
328 *realpt += dotProductVector[2];
329 *imagpt += dotProductVector[3];
/* Scalar tail for the points left over after the 8-point vector iterations. */
331 number = sixteenthPoints * 8;
332 for (; number < num_points; number++) {
333 *realpt += ((*aPtr++) * (*bPtr));
334 *imagpt += ((*aPtr++) * (*bPtr++));
342 #if LV_HAVE_AVX2 && LV_HAVE_FMA
344 #include <immintrin.h>
/* AVX2+FMA kernel for unaligned buffers: dot product of the complex float
 * vector `input` with the real float vector `taps`.
 * Every read of `input` and `taps` uses _mm256_loadu_ps, so neither pointer
 * needs 32-byte alignment.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the `input`/`taps` parameters, the declarations of `res` and
 * `dotProductVector`, the per-iteration pointer advance and the final store to
 * `result` are not visible here. */
346 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
349 unsigned int num_points)
352 unsigned int number = 0;
/* Number of vector iterations; each one covers 16 points. */
353 const unsigned int sixteenthPoints = num_points / 16;
356 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
357 const float* aPtr = (
float*)input;
358 const float* bPtr = taps;
360 __m256 a0Val, a1Val, a2Val, a3Val;
361 __m256 b0Val, b1Val, b2Val, b3Val;
362 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, summed together after the loop. */
364 __m256 dotProdVal0 = _mm256_setzero_ps();
365 __m256 dotProdVal1 = _mm256_setzero_ps();
366 __m256 dotProdVal2 = _mm256_setzero_ps();
367 __m256 dotProdVal3 = _mm256_setzero_ps();
369 for (; number < sixteenthPoints; number++) {
/* 32 input floats = 16 interleaved complex points. */
371 a0Val = _mm256_loadu_ps(aPtr);
372 a1Val = _mm256_loadu_ps(aPtr + 8);
373 a2Val = _mm256_loadu_ps(aPtr + 16);
374 a3Val = _mm256_loadu_ps(aPtr + 24);
/* 16 taps, one per point. These must be unaligned loads as well: this is the
 * unaligned kernel, and an aligned load would fault on a taps pointer that
 * is not 32-byte aligned. */
376 x0Val = _mm256_loadu_ps(bPtr);
377 x1Val = _mm256_loadu_ps(bPtr + 8);
/* Unpacking a register with itself duplicates each tap inside its 128-bit lane. */
378 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
379 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
380 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
381 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 gathers the low lanes and 0x31 the high lanes, so b0..b3 hold taps
 * 0-3, 4-7, 8-11 and 12-15, each repeated twice to match the two floats of
 * the corresponding complex point. */
384 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
385 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
386 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
387 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Fused multiply-add: acc = a * b + acc. */
389 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
390 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
391 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
392 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
398 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
399 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
400 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into the local scratch buffer; the second argument is on a
 * line omitted from this listing.
 * NOTE(review): the aligned store relies on dotProductVector being declared
 * with 32-byte alignment; its declaration is not visible here, so confirm. */
404 _mm256_store_ps(dotProductVector,
/* Horizontal reduction: even slots go to the first accumulator, odd slots to the second. */
407 *realpt = dotProductVector[0];
408 *imagpt = dotProductVector[1];
409 *realpt += dotProductVector[2];
410 *imagpt += dotProductVector[3];
411 *realpt += dotProductVector[4];
412 *imagpt += dotProductVector[5];
413 *realpt += dotProductVector[6];
414 *imagpt += dotProductVector[7];
/* Scalar tail for the points left over after the 16-point vector iterations. */
416 number = sixteenthPoints * 16;
417 for (; number < num_points; number++) {
418 *realpt += ((*aPtr++) * (*bPtr));
419 *imagpt += ((*aPtr++) * (*bPtr++));
429 #include <immintrin.h>
/* AVX kernel for unaligned buffers (volk_32fc_32f_dot_prod_32fc_u_avx according
 * to the cross-reference index at the end of this listing): dot product of the
 * complex float vector `input` with the real float vector `taps`.
 * Every read of `input` and `taps` uses _mm256_loadu_ps, so neither pointer
 * needs 32-byte alignment.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the function header, the declarations of `res` and
 * `dotProductVector`, the per-iteration pointer advance and the final store to
 * `result` are not visible here. */
434 unsigned int num_points)
437 unsigned int number = 0;
/* Number of vector iterations; each one covers 16 points. */
438 const unsigned int sixteenthPoints = num_points / 16;
441 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
442 const float* aPtr = (
float*)input;
443 const float* bPtr = taps;
445 __m256 a0Val, a1Val, a2Val, a3Val;
446 __m256 b0Val, b1Val, b2Val, b3Val;
447 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
448 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, summed together after the loop. */
450 __m256 dotProdVal0 = _mm256_setzero_ps();
451 __m256 dotProdVal1 = _mm256_setzero_ps();
452 __m256 dotProdVal2 = _mm256_setzero_ps();
453 __m256 dotProdVal3 = _mm256_setzero_ps();
455 for (; number < sixteenthPoints; number++) {
/* 32 input floats = 16 interleaved complex points. */
457 a0Val = _mm256_loadu_ps(aPtr);
458 a1Val = _mm256_loadu_ps(aPtr + 8);
459 a2Val = _mm256_loadu_ps(aPtr + 16);
460 a3Val = _mm256_loadu_ps(aPtr + 24);
/* 16 taps, one per point. */
462 x0Val = _mm256_loadu_ps(bPtr);
463 x1Val = _mm256_loadu_ps(bPtr + 8);
/* Unpacking a register with itself duplicates each tap inside its 128-bit lane. */
464 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
465 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
466 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
467 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 gathers the low lanes and 0x31 the high lanes, so b0..b3 hold taps
 * 0-3, 4-7, 8-11 and 12-15, each repeated twice. */
470 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
471 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
472 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
473 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and duplicated taps. */
475 c0Val = _mm256_mul_ps(a0Val, b0Val);
476 c1Val = _mm256_mul_ps(a1Val, b1Val);
477 c2Val = _mm256_mul_ps(a2Val, b2Val);
478 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
480 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
481 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
482 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
483 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
489 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
490 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
491 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store into the local scratch buffer; the second argument is on a
 * line omitted from this listing.
 * NOTE(review): the aligned store relies on dotProductVector being declared
 * with 32-byte alignment; its declaration is not visible here, so confirm. */
495 _mm256_store_ps(dotProductVector,
/* Horizontal reduction: even slots go to the first accumulator, odd slots to the second. */
498 *realpt = dotProductVector[0];
499 *imagpt = dotProductVector[1];
500 *realpt += dotProductVector[2];
501 *imagpt += dotProductVector[3];
502 *realpt += dotProductVector[4];
503 *imagpt += dotProductVector[5];
504 *realpt += dotProductVector[6];
505 *imagpt += dotProductVector[7];
/* Scalar tail for the points left over after the 16-point vector iterations. */
507 number = sixteenthPoints * 16;
508 for (; number < num_points; number++) {
509 *realpt += ((*aPtr++) * (*bPtr));
510 *imagpt += ((*aPtr++) * (*bPtr++));
518 #include <arm_neon.h>
/* NEON kernel unrolled by two (volk_32fc_32f_dot_prod_32fc_neon_unroll
 * according to the cross-reference index at the end of this listing): dot
 * product of the complex float vector `input` with the real float vector
 * `taps`.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the start of the function header, the declarations of `number`
 * and `res`, the per-iteration pointer advance, the left-hand sides of the two
 * horizontal sums and the final store to `result` are not visible here. */
523 const float* __restrict taps,
524 unsigned int num_points)
/* Despite its name this counts groups of 8 points (num_points / 8); the tail
 * loop below restarts at quarterPoints * 8 accordingly. */
528 const unsigned int quarterPoints = num_points / 8;
531 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
532 const float* inputPtr = (
float*)input;
533 const float* tapsPtr = taps;
/* Zero source used to initialise the vector accumulators. */
534 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the accumulators are spilled into for the horizontal sum. */
535 float accVector_real[4];
536 float accVector_imag[4];
538 float32x4x2_t inputVector0, inputVector1;
539 float32x4_t tapsVector0, tapsVector1;
540 float32x4_t tmp_real0, tmp_imag0;
541 float32x4_t tmp_real1, tmp_imag1;
542 float32x4_t real_accumulator0, imag_accumulator0;
543 float32x4_t real_accumulator1, imag_accumulator1;
547 real_accumulator0 = vld1q_f32(zero);
548 imag_accumulator0 = vld1q_f32(zero);
549 real_accumulator1 = vld1q_f32(zero);
550 imag_accumulator1 = vld1q_f32(zero);
552 for (number = 0; number < quarterPoints; number++) {
/* 8 taps, one per point. */
554 tapsVector0 = vld1q_f32(tapsPtr);
555 tapsVector1 = vld1q_f32(tapsPtr + 4);
/* vld2q_f32 de-interleaves 8 floats: val[0] holds the even-indexed floats and
 * val[1] the odd-indexed ones of 4 complex points. */
558 inputVector0 = vld2q_f32(inputPtr);
559 inputVector1 = vld2q_f32(inputPtr + 8);
562 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
563 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
565 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
566 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
568 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
569 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
571 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
572 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
/* Merge the two unrolled accumulator pairs. */
578 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
579 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
582 vst1q_f32(accVector_real, real_accumulator0);
583 vst1q_f32(accVector_imag, imag_accumulator0);
/* Horizontal sums. The assignment targets are on lines omitted from this
 * listing; presumably *realpt and *imagpt, to be confirmed against the full
 * source. */
585 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
588 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
/* Scalar tail for the points left over after the 8-point vector iterations. */
591 for (number = quarterPoints * 8; number < num_points; number++) {
592 *realpt += ((*inputPtr++) * (*tapsPtr));
593 *imagpt += ((*inputPtr++) * (*tapsPtr++));
602 #include <arm_neon.h>
/* NEON kernel (volk_32fc_32f_dot_prod_32fc_a_neon according to the
 * cross-reference index at the end of this listing): dot product of the
 * complex float vector `input` with the real float vector `taps`, 4 points per
 * vector iteration.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the start of the function header, the declarations of `number`
 * and `res`, the per-iteration pointer advance, the left-hand sides of the two
 * horizontal sums and the final store to `result` are not visible here. */
606 const float* __restrict taps,
607 unsigned int num_points)
/* Number of vector iterations; each one covers 4 points. */
611 const unsigned int quarterPoints = num_points / 4;
614 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
615 const float* inputPtr = (
float*)input;
616 const float* tapsPtr = taps;
/* Zero source used to initialise the vector accumulators. */
617 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the accumulators are spilled into for the horizontal sum. */
618 float accVector_real[4];
619 float accVector_imag[4];
621 float32x4x2_t inputVector;
622 float32x4_t tapsVector;
623 float32x4_t tmp_real, tmp_imag;
624 float32x4_t real_accumulator, imag_accumulator;
629 real_accumulator = vld1q_f32(zero);
630 imag_accumulator = vld1q_f32(zero);
632 for (number = 0; number < quarterPoints; number++) {
/* 4 taps, one per point. */
635 tapsVector = vld1q_f32(tapsPtr);
/* vld2q_f32 de-interleaves 8 floats: val[0] holds the even-indexed floats and
 * val[1] the odd-indexed ones of 4 complex points. */
638 inputVector = vld2q_f32(inputPtr);
640 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
641 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
643 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
644 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
652 vst1q_f32(accVector_real, real_accumulator);
653 vst1q_f32(accVector_imag, imag_accumulator);
/* Horizontal sums. The assignment targets are on lines omitted from this
 * listing; presumably *realpt and *imagpt, to be confirmed against the full
 * source. */
655 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
658 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
/* Scalar tail for the points left over after the 4-point vector iterations. */
661 for (number = quarterPoints * 4; number < num_points; number++) {
662 *realpt += ((*inputPtr++) * (*tapsPtr));
663 *imagpt += ((*inputPtr++) * (*tapsPtr++));
/* Prototypes of three kernels that are only declared here (extern) and are
 * compiled in when LV_HAVE_NEONV7 is defined; their definitions are not in this
 * listing (presumably hand-written assembly, judging by the names; confirm).
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the middle parameters of each prototype and the matching #endif
 * lines are not visible here. */
671 #ifdef LV_HAVE_NEONV7
672 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(
lv_32fc_t* result,
675 unsigned int num_points);
678 #ifdef LV_HAVE_NEONV7
679 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(
lv_32fc_t* result,
682 unsigned int num_points);
685 #ifdef LV_HAVE_NEONV7
686 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(
lv_32fc_t* result,
689 unsigned int num_points);
/* SSE kernel (volk_32fc_32f_dot_prod_32fc_u_sse according to the
 * cross-reference index at the end of this listing): dot product of the
 * complex float vector `input` with the real float vector `taps`.
 * NOTE(review): this listing omits source lines (see the gaps in the embedded
 * numbering); the function header, the accumulator declarations, the entire
 * vector loop body, the store into `dotProductVector` and the final store to
 * `result` are not visible here, so the load alignment cannot be confirmed
 * from this view. */
697 unsigned int num_points)
700 unsigned int number = 0;
/* Despite its name this counts groups of 8 points (num_points / 8); the tail
 * loop below restarts at sixteenthPoints * 8 accordingly. */
701 const unsigned int sixteenthPoints = num_points / 8;
704 float *realpt = &res[0], *imagpt = &res[1];
/* The complex input is walked as a flat float stream, two floats per point. */
705 const float* aPtr = (
float*)input;
706 const float* bPtr = taps;
708 __m128 a0Val, a1Val, a2Val, a3Val;
709 __m128 b0Val, b1Val, b2Val, b3Val;
710 __m128 x0Val, x1Val, x2Val, x3Val;
711 __m128 c0Val, c1Val, c2Val, c3Val;
718 for (; number < sixteenthPoints; number++) {
/* Fold the four accumulators into dotProdVal0. */
748 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal1);
749 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal2);
750 dotProdVal0 =
_mm_add_ps(dotProdVal0, dotProdVal3);
/* Horizontal reduction of the 4-float result: even slots go to the first
 * accumulator, odd slots to the second. */
757 *realpt = dotProductVector[0];
758 *imagpt = dotProductVector[1];
759 *realpt += dotProductVector[2];
760 *imagpt += dotProductVector[3];
/* Scalar tail for the points left over after the 8-point vector iterations. */
762 number = sixteenthPoints * 8;
763 for (; number < num_points; number++) {
764 *realpt += ((*aPtr++) * (*bPtr));
765 *imagpt += ((*aPtr++) * (*bPtr++));
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:431
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:263
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:521
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:604
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:58
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:172
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:694
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74