Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_dot_prod_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
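
/* Usage sketch (illustrative only, not part of this header): computing the dot
 * product sum(input[k] * taps[k]) of two short complex vectors with the generic
 * protokernel. The length of 8 and the values are arbitrary; lv_cmake(),
 * lv_creal() and lv_cimag() come from <volk/volk_complex.h>. Real code would
 * normally call the dispatched volk_32fc_x2_dot_prod_32fc() instead of naming a
 * protokernel directly (see the sketch at the end of this file).
 *
 *   lv_32fc_t input[8], taps[8], result;
 *   for (unsigned int k = 0; k < 8; ++k) {
 *       input[k] = lv_cmake((float)k, (float)k); // k + k*i
 *       taps[k] = lv_cmake(1.0f, 0.0f);          // unity taps
 *   }
 *   volk_32fc_x2_dot_prod_32fc_generic(&result, input, taps, 8);
 *   printf("%f + %fi\n", lv_creal(result), lv_cimag(result));
 */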


#if LV_HAVE_SSE && LV_HAVE_64

static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM(
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movups (%%r9), %%xmmA\n\t"
        "# movups (%%r10), %%xmmB\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movups %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movups 0(%%r9), %%xmm0\n\t"
        " movups 16(%%r9), %%xmm1\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " movups 0(%%r10), %%xmm2\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movups 16(%%r10), %%xmm3\n\t"
        " movups %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's see if the original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " movups 0(%%r9), %%xmm0\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " movups 0(%%r10), %%xmm2\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movups %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movups %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
        "to memory\n\t"
        :
        : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
        : "rax", "r8", "r9", "r10");


    if (isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}

#endif /* LV_HAVE_SSE && LV_HAVE_64 */
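
/* A scalar sketch of what the asm above accumulates (an illustration derived
 * from its lane comments, not code from VOLK). For each pair of complexes,
 * xmm6 gathers the straight products and xmm7 the swapped (cross) products;
 * the "pfpnacc" replacement then applies a 0,-0,0,-0 sign mask and folds the
 * lanes into one complex result:
 *
 *   // in = (ar, ai, br, bi), tp = (cr, ci, dr, di)
 *   float p[4] = { ar * cr, ai * ci, br * dr, bi * di }; // xmm6 lanes
 *   float q[4] = { ai * cr, ar * ci, bi * dr, br * di }; // xmm7 lanes
 *   float re = (p[0] - p[1]) + (p[2] - p[3]); // sign flip via the mask
 *   float im = (q[0] + q[1]) + (q[2] + q[3]);
 *   // result = re + im * I
 */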


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector,
                  dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
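
/* The SSE3 kernel applies the standard complex-multiply identity
 * (ar + ai*I) * (cr + ci*I) = (ar*cr - ai*ci) + (ai*cr + ar*ci)*I
 * two complexes at a time: moveldup/movehdup broadcast the real and imaginary
 * tap parts, and addsub applies the -/+ pattern in one step. A scalar model of
 * one complex lane pair (an illustration, not code from VOLK):
 *
 *   float tmp1_re = ar * cr, tmp1_im = ai * cr; // x * yl
 *   float tmp2_re = ai * ci, tmp2_im = ar * ci; // swapped x * yh
 *   float z_re = tmp1_re - tmp2_re;             // addsub, even lane
 *   float z_im = tmp1_im + tmp2_im;             // addsub, odd lane
 */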

// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_loadu_ps(p_input);
//         xmm1 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_loadu_ps(p_input);
//         xmm3 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
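
/* The FMA variant folds the first multiply and the addsub into a single
 * _mm256_fmaddsub_ps: z = tmp1 * yl -/+ tmp2, so even lanes compute
 * ar*cr - ai*ci and odd lanes ai*cr + ar*ci in one instruction, with one
 * rounding step instead of two. A scalar model of one lane pair (an
 * illustration, not code from VOLK):
 *
 *   float z_re = ar * cr - (ai * ci); // fmaddsub, even lane: a*b - c
 *   float z_im = ai * cr + (ar * ci); // fmaddsub, odd lane:  a*b + c
 */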

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_bytes >> 4;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/


#if LV_HAVE_SSE && LV_HAVE_64


static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM(
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movaps (%%r9), %%xmmA\n\t"
        "# movaps (%%r10), %%xmmB\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movaps %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movaps 0(%%r9), %%xmm0\n\t"
        " movaps 16(%%r9), %%xmm1\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " movaps 0(%%r10), %%xmm2\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movaps 16(%%r10), %%xmm3\n\t"
        " movaps %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's see if the original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " movaps 0(%%r9), %%xmm0\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " movaps 0(%%r10), %%xmm2\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movaps %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movaps %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
        "to memory\n\t"
        :
        : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
        : "rax", "r8", "r9", "r10");


    if (isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}

#endif /* LV_HAVE_SSE && LV_HAVE_64 */

#if LV_HAVE_SSE && LV_HAVE_32

static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);

#if 0
    const unsigned int num_bytes = num_points*8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM __VOLK_VOLATILE
        (
        " #pushl %%ebp\n\t"
        " #movl %%esp, %%ebp\n\t"
        " movl 12(%%ebp), %%eax # input\n\t"
        " movl 16(%%ebp), %%edx # taps\n\t"
        " movl 20(%%ebp), %%ecx # n_bytes\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movaps 0(%%eax), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movaps 0(%%edx), %%xmm2\n\t"
        " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movaps (%%eax), %%xmmA\n\t"
        "# movaps (%%edx), %%xmmB\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movaps %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movaps 16(%%eax), %%xmm1\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movaps 16(%%edx), %%xmm3\n\t"
        " movaps %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movaps 32(%%eax), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " addl $32, %%eax\n\t"
        " movaps 32(%%edx), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " addl $32, %%edx\n\t"
        ".%=L1_test:\n\t"
        " decl %%ecx\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's see if the original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
        " shrl $4, %%ecx\n\t"
        " andl $1, %%ecx\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " movl 8(%%ebp), %%eax \n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " movl $0x80000000, (%%eax)\n\t"
        " movss (%%eax), %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movaps %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movaps %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " #movl 8(%%ebp), %%eax # @result\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
        " #popl %%ebp\n\t"
        :
        :
        : "eax", "ecx", "edx"
        );


    int getem = num_bytes % 16;

    if(isodd) {
        *result += (input[num_points - 1] * taps[num_points - 1]);
    }

    return;
#endif
}

#endif /*LV_HAVE_SSE && LV_HAVE_32*/

#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,
                 dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/


// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_load_ps(p_input);
//         xmm1 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_load_ps(p_input);
//         xmm3 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
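
/* The NEON kernels work on de-interleaved (structure-of-arrays) data:
 * vld2q_f32 splits four complexes into one vector of reals and one of
 * imaginaries, so the partial products can be formed with plain vector
 * multiplies here, or fused into vmlaq/vmlsq in the variants below.
 * Scalar model per element (an illustration, not code from VOLK):
 *
 *   float re = a_re * b_re - a_im * b_im; // c_val.val[0] lane
 *   float im = a_re * b_im + a_im * b_re; // c_val.val[1] lane
 */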

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int quarter_points = num_points / 8; // 8 complexes per iteration
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < quarter_points; ++number) {
        // vld4q de-interleaves by 4: val[0]/val[1] hold the reals/imags of the
        // even-indexed complexes, val[2]/val[3] those of the odd-indexed ones
        a_val = vld4q_f32((float*)a_ptr);
        b_val = vld4q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/


#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/


#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
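
/* Typical use goes through the VOLK dispatcher, which picks the fastest of the
 * protokernels above for the host CPU and the alignment of the buffers. A
 * sketch (the length of 1024 is arbitrary; volk_malloc/volk_free and
 * volk_get_alignment are the standard VOLK allocation helpers declared in
 * <volk/volk.h>):
 *
 *   unsigned int n = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t res;
 *   // ... fill in[] and taps[] ...
 *   volk_32fc_x2_dot_prod_32fc(&res, in, taps, n);
 *   volk_free(in);
 *   volk_free(taps);
 */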