Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
50 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52 
53 #include <stdio.h>
54 #include <volk/volk_common.h>
55 
#ifdef LV_HAVE_GENERIC

/*!
 * Scalar reference kernel: *result = sum_k input[k] * taps[k].
 * input is interleaved re,im pairs; each real tap scales both parts.
 */
static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input;
    const float* bPtr = taps;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_GENERIC*/
82 
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

/*!
 * Aligned AVX2+FMA kernel: *result = sum_k input[k] * taps[k].
 * Processes 16 complex samples (32 floats) per loop iteration; input and
 * taps are expected to be 32-byte aligned (aligned loads are used).
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const float* taps,
                                                          unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    __m256 accum0 = _mm256_setzero_ps();
    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    __m256 accum3 = _mm256_setzero_ps();

    for (number = 0; number < sixteenthPoints; number++) {
        const __m256 in0 = _mm256_load_ps(aPtr);
        const __m256 in1 = _mm256_load_ps(aPtr + 8);
        const __m256 in2 = _mm256_load_ps(aPtr + 16);
        const __m256 in3 = _mm256_load_ps(aPtr + 24);

        /* Duplicate each real tap so it multiplies both re and im lanes. */
        const __m256 t0 = _mm256_load_ps(bPtr);     // t0|t1|t2|t3|t4|t5|t6|t7
        const __m256 t1 = _mm256_load_ps(bPtr + 8);
        const __m256 t0lo = _mm256_unpacklo_ps(t0, t0); // t0|t0|t1|t1|t4|t4|t5|t5
        const __m256 t0hi = _mm256_unpackhi_ps(t0, t0); // t2|t2|t3|t3|t6|t6|t7|t7
        const __m256 t1lo = _mm256_unpacklo_ps(t1, t1);
        const __m256 t1hi = _mm256_unpackhi_ps(t1, t1);

        /* Stitch 128-bit halves back into sequential tap order. */
        const __m256 tap0 = _mm256_permute2f128_ps(t0lo, t0hi, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        const __m256 tap1 = _mm256_permute2f128_ps(t0lo, t0hi, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        const __m256 tap2 = _mm256_permute2f128_ps(t1lo, t1hi, 0x20);
        const __m256 tap3 = _mm256_permute2f128_ps(t1lo, t1hi, 0x31);

        accum0 = _mm256_fmadd_ps(in0, tap0, accum0);
        accum1 = _mm256_fmadd_ps(in1, tap1, accum1);
        accum2 = _mm256_fmadd_ps(in2, tap2, accum2);
        accum3 = _mm256_fmadd_ps(in3, tap3, accum3);

        aPtr += 32;
        bPtr += 16;
    }

    accum0 = _mm256_add_ps(accum0, accum1);
    accum0 = _mm256_add_ps(accum0, accum2);
    accum0 = _mm256_add_ps(accum0, accum3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
    _mm256_store_ps(dotProductVector, accum0);

    /* Horizontal sum: even lanes hold real products, odd lanes imaginary. */
    res[0] = dotProductVector[0] + dotProductVector[2] + dotProductVector[4] +
             dotProductVector[6];
    res[1] = dotProductVector[1] + dotProductVector[3] + dotProductVector[5] +
             dotProductVector[7];

    /* Scalar tail for the remaining num_points % 16 samples. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
167 
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * Aligned AVX kernel: *result = sum_k input[k] * taps[k].
 * Processes 16 complex samples (32 floats) per loop iteration; input and
 * taps are expected to be 32-byte aligned (aligned loads are used).
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    __m256 acc2 = _mm256_setzero_ps();
    __m256 acc3 = _mm256_setzero_ps();

    for (number = 0; number < sixteenthPoints; number++) {
        const __m256 in0 = _mm256_load_ps(aPtr);
        const __m256 in1 = _mm256_load_ps(aPtr + 8);
        const __m256 in2 = _mm256_load_ps(aPtr + 16);
        const __m256 in3 = _mm256_load_ps(aPtr + 24);

        /* Duplicate each real tap so it multiplies both re and im lanes. */
        const __m256 t0 = _mm256_load_ps(bPtr);     // t0|t1|t2|t3|t4|t5|t6|t7
        const __m256 t1 = _mm256_load_ps(bPtr + 8);
        const __m256 lo0 = _mm256_unpacklo_ps(t0, t0); // t0|t0|t1|t1|t4|t4|t5|t5
        const __m256 hi0 = _mm256_unpackhi_ps(t0, t0); // t2|t2|t3|t3|t6|t6|t7|t7
        const __m256 lo1 = _mm256_unpacklo_ps(t1, t1);
        const __m256 hi1 = _mm256_unpackhi_ps(t1, t1);

        /* Stitch 128-bit halves back into sequential tap order. */
        const __m256 tap0 = _mm256_permute2f128_ps(lo0, hi0, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        const __m256 tap1 = _mm256_permute2f128_ps(lo0, hi0, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        const __m256 tap2 = _mm256_permute2f128_ps(lo1, hi1, 0x20);
        const __m256 tap3 = _mm256_permute2f128_ps(lo1, hi1, 0x31);

        acc0 = _mm256_add_ps(_mm256_mul_ps(in0, tap0), acc0);
        acc1 = _mm256_add_ps(_mm256_mul_ps(in1, tap1), acc1);
        acc2 = _mm256_add_ps(_mm256_mul_ps(in2, tap2), acc2);
        acc3 = _mm256_add_ps(_mm256_mul_ps(in3, tap3), acc3);

        aPtr += 32;
        bPtr += 16;
    }

    acc0 = _mm256_add_ps(acc0, acc1);
    acc0 = _mm256_add_ps(acc0, acc2);
    acc0 = _mm256_add_ps(acc0, acc3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
    _mm256_store_ps(dotProductVector, acc0);

    /* Horizontal sum: even lanes hold real products, odd lanes imaginary. */
    res[0] = dotProductVector[0] + dotProductVector[2] + dotProductVector[4] +
             dotProductVector[6];
    res[1] = dotProductVector[1] + dotProductVector[3] + dotProductVector[5] +
             dotProductVector[7];

    /* Scalar tail for the remaining num_points % 16 samples. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_AVX*/
258 
259 
#ifdef LV_HAVE_SSE

/*!
 * Aligned SSE kernel: *result = sum_k input[k] * taps[k].
 * Processes 8 complex samples (16 floats) per loop iteration; input and
 * taps are expected to be 16-byte aligned (aligned loads are used).
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;
    const unsigned int eighthPoints = num_points / 8; /* 8 complex per pass */
    unsigned int number;

    __m128 acc0 = _mm_setzero_ps();
    __m128 acc1 = _mm_setzero_ps();
    __m128 acc2 = _mm_setzero_ps();
    __m128 acc3 = _mm_setzero_ps();

    for (number = 0; number < eighthPoints; number++) {
        const __m128 in0 = _mm_load_ps(aPtr);
        const __m128 in1 = _mm_load_ps(aPtr + 4);
        const __m128 in2 = _mm_load_ps(aPtr + 8);
        const __m128 in3 = _mm_load_ps(aPtr + 12);

        /* Duplicate each real tap across a re/im pair. */
        const __m128 tLo = _mm_load_ps(bPtr);
        const __m128 tHi = _mm_load_ps(bPtr + 4);
        const __m128 tap0 = _mm_unpacklo_ps(tLo, tLo); // t0|t0|t1|t1
        const __m128 tap1 = _mm_unpackhi_ps(tLo, tLo); // t2|t2|t3|t3
        const __m128 tap2 = _mm_unpacklo_ps(tHi, tHi);
        const __m128 tap3 = _mm_unpackhi_ps(tHi, tHi);

        acc0 = _mm_add_ps(_mm_mul_ps(in0, tap0), acc0);
        acc1 = _mm_add_ps(_mm_mul_ps(in1, tap1), acc1);
        acc2 = _mm_add_ps(_mm_mul_ps(in2, tap2), acc2);
        acc3 = _mm_add_ps(_mm_mul_ps(in3, tap3), acc3);

        aPtr += 16;
        bPtr += 8;
    }

    acc0 = _mm_add_ps(acc0, acc1);
    acc0 = _mm_add_ps(acc0, acc2);
    acc0 = _mm_add_ps(acc0, acc3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, acc0);

    /* Horizontal sum: even lanes are real products, odd lanes imaginary. */
    res[0] = dotProductVector[0] + dotProductVector[2];
    res[1] = dotProductVector[1] + dotProductVector[3];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_SSE*/
341 
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

/*!
 * Unaligned AVX2+FMA kernel: *result = sum_k input[k] * taps[k].
 * Processes 16 complex samples (32 floats) per loop iteration.  No
 * alignment requirements on input or taps.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const float* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2]; /* res[0] = real accumulator, res[1] = imag accumulator */
    float *realpt = &res[0], *imagpt = &res[1];
    const float* aPtr = (float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        a2Val = _mm256_loadu_ps(aPtr + 16);
        a3Val = _mm256_loadu_ps(aPtr + 24);

        /* BUG FIX: this is the unaligned ("u_") kernel, so the taps must be
         * read with unaligned loads as well.  The previous _mm256_load_ps
         * faulted whenever taps was not 32-byte aligned. */
        x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_loadu_ps(bPtr + 8);
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        /* Rebuild sequential tap order from the unpacked 128-bit lanes. */
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    /* Horizontal sum: even lanes are real products, odd lanes imaginary. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    /* Scalar tail for the remaining num_points % 16 samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr++) * (*bPtr));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
426 
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * Unaligned AVX kernel: *result = sum_k input[k] * taps[k].
 * Processes 16 complex samples (32 floats) per loop iteration.  No
 * alignment requirements on input or taps.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    __m256 acc2 = _mm256_setzero_ps();
    __m256 acc3 = _mm256_setzero_ps();

    for (number = 0; number < sixteenthPoints; number++) {
        const __m256 in0 = _mm256_loadu_ps(aPtr);
        const __m256 in1 = _mm256_loadu_ps(aPtr + 8);
        const __m256 in2 = _mm256_loadu_ps(aPtr + 16);
        const __m256 in3 = _mm256_loadu_ps(aPtr + 24);

        /* Duplicate each real tap so it multiplies both re and im lanes. */
        const __m256 t0 = _mm256_loadu_ps(bPtr);     // t0|t1|t2|t3|t4|t5|t6|t7
        const __m256 t1 = _mm256_loadu_ps(bPtr + 8);
        const __m256 lo0 = _mm256_unpacklo_ps(t0, t0); // t0|t0|t1|t1|t4|t4|t5|t5
        const __m256 hi0 = _mm256_unpackhi_ps(t0, t0); // t2|t2|t3|t3|t6|t6|t7|t7
        const __m256 lo1 = _mm256_unpacklo_ps(t1, t1);
        const __m256 hi1 = _mm256_unpackhi_ps(t1, t1);

        /* Stitch 128-bit halves back into sequential tap order. */
        const __m256 tap0 = _mm256_permute2f128_ps(lo0, hi0, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        const __m256 tap1 = _mm256_permute2f128_ps(lo0, hi0, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        const __m256 tap2 = _mm256_permute2f128_ps(lo1, hi1, 0x20);
        const __m256 tap3 = _mm256_permute2f128_ps(lo1, hi1, 0x31);

        acc0 = _mm256_add_ps(_mm256_mul_ps(in0, tap0), acc0);
        acc1 = _mm256_add_ps(_mm256_mul_ps(in1, tap1), acc1);
        acc2 = _mm256_add_ps(_mm256_mul_ps(in2, tap2), acc2);
        acc3 = _mm256_add_ps(_mm256_mul_ps(in3, tap3), acc3);

        aPtr += 32;
        bPtr += 16;
    }

    acc0 = _mm256_add_ps(acc0, acc1);
    acc0 = _mm256_add_ps(acc0, acc2);
    acc0 = _mm256_add_ps(acc0, acc3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
    _mm256_store_ps(dotProductVector, acc0);

    /* Horizontal sum: even lanes hold real products, odd lanes imaginary. */
    res[0] = dotProductVector[0] + dotProductVector[2] + dotProductVector[4] +
             dotProductVector[6];
    res[1] = dotProductVector[1] + dotProductVector[3] + dotProductVector[5] +
             dotProductVector[7];

    /* Scalar tail for the remaining num_points % 16 samples. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}
#endif /*LV_HAVE_AVX*/
516 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * NEON kernel, 2x unrolled: 8 complex samples per loop iteration.
 * vld2q deinterleaves the input so each real tap multiplies both the
 * real and imaginary vectors.
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{
    const float* inPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* tapPtr = taps;
    const unsigned int eighthPoints = num_points / 8;
    unsigned int n;

    float32x4_t accRe0 = vdupq_n_f32(0.0f);
    float32x4_t accIm0 = vdupq_n_f32(0.0f);
    float32x4_t accRe1 = vdupq_n_f32(0.0f);
    float32x4_t accIm1 = vdupq_n_f32(0.0f);

    for (n = 0; n < eighthPoints; n++) {
        const float32x4_t t0 = vld1q_f32(tapPtr);
        const float32x4_t t1 = vld1q_f32(tapPtr + 4);

        /* val[0] gathers the reals, val[1] the imaginaries. */
        const float32x4x2_t s0 = vld2q_f32(inPtr);
        const float32x4x2_t s1 = vld2q_f32(inPtr + 8);

        accRe0 = vaddq_f32(accRe0, vmulq_f32(t0, s0.val[0]));
        accIm0 = vaddq_f32(accIm0, vmulq_f32(t0, s0.val[1]));
        accRe1 = vaddq_f32(accRe1, vmulq_f32(t1, s1.val[0]));
        accIm1 = vaddq_f32(accIm1, vmulq_f32(t1, s1.val[1]));

        tapPtr += 8;
        inPtr += 16;
    }

    accRe0 = vaddq_f32(accRe0, accRe1);
    accIm0 = vaddq_f32(accIm0, accIm1);

    /* Spill the accumulators and reduce the four lanes of each. */
    float lanesRe[4];
    float lanesIm[4];
    vst1q_f32(lanesRe, accRe0);
    vst1q_f32(lanesIm, accIm0);

    float res[2]; /* res[0] = real, res[1] = imag */
    res[0] = lanesRe[0] + lanesRe[1] + lanesRe[2] + lanesRe[3];
    res[1] = lanesIm[0] + lanesIm[1] + lanesIm[2] + lanesIm[3];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (n = eighthPoints * 8; n < num_points; n++) {
        res[0] += (*inPtr++) * (*tapPtr);
        res[1] += (*inPtr++) * (*tapPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_NEON*/
600 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * NEON kernel: 4 complex samples per loop iteration.  vld2q deinterleaves
 * the input so each real tap multiplies both the real and imaginary vectors.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    const float* inPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* tapPtr = taps;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int n;

    float32x4_t accRe = vdupq_n_f32(0.0f);
    float32x4_t accIm = vdupq_n_f32(0.0f);

    for (n = 0; n < quarterPoints; n++) {
        const float32x4_t t = vld1q_f32(tapPtr);

        /* val[0] gathers the reals, val[1] the imaginaries. */
        const float32x4x2_t s = vld2q_f32(inPtr);

        accRe = vaddq_f32(accRe, vmulq_f32(t, s.val[0]));
        accIm = vaddq_f32(accIm, vmulq_f32(t, s.val[1]));

        tapPtr += 4;
        inPtr += 8;
    }

    /* Spill the accumulators and reduce the four lanes of each. */
    float lanesRe[4];
    float lanesIm[4];
    vst1q_f32(lanesRe, accRe);
    vst1q_f32(lanesIm, accIm);

    float res[2]; /* res[0] = real, res[1] = imag */
    res[0] = lanesRe[0] + lanesRe[1] + lanesRe[2] + lanesRe[3];
    res[1] = lanesIm[0] + lanesIm[1] + lanesIm[2] + lanesIm[3];

    /* Scalar tail for the remaining num_points % 4 samples. */
    for (n = quarterPoints * 4; n < num_points; n++) {
        res[0] += (*inPtr++) * (*tapPtr);
        res[1] += (*inPtr++) * (*tapPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_NEON*/
670 
#ifdef LV_HAVE_NEONV7
/* ARMv7 kernel implemented in assembly; defined in a separate source file. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* ARMv7 assembly kernel; defined in a separate source file.
 * NOTE(review): name suggests a vmla-based variant — confirm in the asm. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* ARMv7 assembly kernel; defined in a separate source file.
 * NOTE(review): name suggests a software-pipelined variant — confirm in the asm. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
691 
#ifdef LV_HAVE_SSE

/*!
 * Unaligned SSE kernel: *result = sum_k input[k] * taps[k].
 * Processes 8 complex samples (16 floats) per loop iteration.  No
 * alignment requirements on input or taps.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    float res[2] = { 0.0f, 0.0f }; /* res[0] = real acc, res[1] = imag acc */
    const float* aPtr = (const float*)input; /* interleaved re,im,re,im,... */
    const float* bPtr = taps;
    const unsigned int eighthPoints = num_points / 8; /* 8 complex per pass */
    unsigned int number;

    __m128 acc0 = _mm_setzero_ps();
    __m128 acc1 = _mm_setzero_ps();
    __m128 acc2 = _mm_setzero_ps();
    __m128 acc3 = _mm_setzero_ps();

    for (number = 0; number < eighthPoints; number++) {
        const __m128 in0 = _mm_loadu_ps(aPtr);
        const __m128 in1 = _mm_loadu_ps(aPtr + 4);
        const __m128 in2 = _mm_loadu_ps(aPtr + 8);
        const __m128 in3 = _mm_loadu_ps(aPtr + 12);

        /* Duplicate each real tap across a re/im pair. */
        const __m128 tLo = _mm_loadu_ps(bPtr);
        const __m128 tHi = _mm_loadu_ps(bPtr + 4);
        const __m128 tap0 = _mm_unpacklo_ps(tLo, tLo); // t0|t0|t1|t1
        const __m128 tap1 = _mm_unpackhi_ps(tLo, tLo); // t2|t2|t3|t3
        const __m128 tap2 = _mm_unpacklo_ps(tHi, tHi);
        const __m128 tap3 = _mm_unpackhi_ps(tHi, tHi);

        acc0 = _mm_add_ps(_mm_mul_ps(in0, tap0), acc0);
        acc1 = _mm_add_ps(_mm_mul_ps(in1, tap1), acc1);
        acc2 = _mm_add_ps(_mm_mul_ps(in2, tap2), acc2);
        acc3 = _mm_add_ps(_mm_mul_ps(in3, tap3), acc3);

        aPtr += 16;
        bPtr += 8;
    }

    acc0 = _mm_add_ps(acc0, acc1);
    acc0 = _mm_add_ps(acc0, acc2);
    acc0 = _mm_add_ps(acc0, acc3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, acc0);

    /* Horizontal sum: even lanes are real products, odd lanes imaginary. */
    res[0] = dotProductVector[0] + dotProductVector[2];
    res[1] = dotProductVector[1] + dotProductVector[3];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        res[0] += (*aPtr++) * (*bPtr);
        res[1] += (*aPtr++) * (*bPtr++);
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_SSE*/
772 
773 
774 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:431
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:263
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:521
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:604
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:58
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:172
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:694
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74