Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
45 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
47 
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50 
51 
52 #ifdef LV_HAVE_GENERIC
53 
55  const short* input,
56  const lv_32fc_t* taps,
57  unsigned int num_points)
58 {
59 
60  static const int N_UNROLL = 4;
61 
62  lv_32fc_t acc0 = 0;
63  lv_32fc_t acc1 = 0;
64  lv_32fc_t acc2 = 0;
65  lv_32fc_t acc3 = 0;
66 
67  unsigned i = 0;
68  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
69 
70  for (i = 0; i < n; i += N_UNROLL) {
71  acc0 += taps[i + 0] * (float)input[i + 0];
72  acc1 += taps[i + 1] * (float)input[i + 1];
73  acc2 += taps[i + 2] * (float)input[i + 2];
74  acc3 += taps[i + 3] * (float)input[i + 3];
75  }
76 
77  for (; i < num_points; i++) {
78  acc0 += taps[i] * (float)input[i];
79  }
80 
81  *result = acc0 + acc1 + acc2 + acc3;
82 }
83 
84 #endif /*LV_HAVE_GENERIC*/
85 
86 #ifdef LV_HAVE_NEON
87 #include <arm_neon.h>
88 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
89  const short* input,
90  const lv_32fc_t* taps,
91  unsigned int num_points)
92 {
93 
94  unsigned ii;
95  unsigned quarter_points = num_points / 4;
96  lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
97  short* inputPtr = (short*)input;
98  lv_32fc_t accumulator_vec[4];
99 
100  float32x4x2_t tapsVal, accumulator_val;
101  int16x4_t input16;
102  int32x4_t input32;
103  float32x4_t input_float, prod_re, prod_im;
104 
105  accumulator_val.val[0] = vdupq_n_f32(0.0);
106  accumulator_val.val[1] = vdupq_n_f32(0.0);
107 
108  for (ii = 0; ii < quarter_points; ++ii) {
109  tapsVal = vld2q_f32((float*)tapsPtr);
110  input16 = vld1_s16(inputPtr);
111  // widen 16-bit int to 32-bit int
112  input32 = vmovl_s16(input16);
113  // convert 32-bit int to float with scale
114  input_float = vcvtq_f32_s32(input32);
115 
116  prod_re = vmulq_f32(input_float, tapsVal.val[0]);
117  prod_im = vmulq_f32(input_float, tapsVal.val[1]);
118 
119  accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
120  accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
121 
122  tapsPtr += 4;
123  inputPtr += 4;
124  }
125  vst2q_f32((float*)accumulator_vec, accumulator_val);
126  accumulator_vec[0] += accumulator_vec[1];
127  accumulator_vec[2] += accumulator_vec[3];
128  accumulator_vec[0] += accumulator_vec[2];
129 
130  for (ii = quarter_points * 4; ii < num_points; ++ii) {
131  accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
132  }
133 
134  *result = accumulator_vec[0];
135 }
136 
137 #endif /*LV_HAVE_NEON*/
138 
139 #if LV_HAVE_SSE && LV_HAVE_MMX
140 
141 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
142  const short* input,
143  const lv_32fc_t* taps,
144  unsigned int num_points)
145 {
146 
147  unsigned int number = 0;
148  const unsigned int sixteenthPoints = num_points / 8;
149 
150  float res[2];
151  float *realpt = &res[0], *imagpt = &res[1];
152  const short* aPtr = input;
153  const float* bPtr = (float*)taps;
154 
155  __m64 m0, m1;
156  __m128 f0, f1, f2, f3;
157  __m128 a0Val, a1Val, a2Val, a3Val;
158  __m128 b0Val, b1Val, b2Val, b3Val;
159  __m128 c0Val, c1Val, c2Val, c3Val;
160 
161  __m128 dotProdVal0 = _mm_setzero_ps();
162  __m128 dotProdVal1 = _mm_setzero_ps();
163  __m128 dotProdVal2 = _mm_setzero_ps();
164  __m128 dotProdVal3 = _mm_setzero_ps();
165 
166  for (; number < sixteenthPoints; number++) {
167 
168  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
169  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
170  f0 = _mm_cvtpi16_ps(m0);
171  f1 = _mm_cvtpi16_ps(m0);
172  f2 = _mm_cvtpi16_ps(m1);
173  f3 = _mm_cvtpi16_ps(m1);
174 
175  a0Val = _mm_unpacklo_ps(f0, f1);
176  a1Val = _mm_unpackhi_ps(f0, f1);
177  a2Val = _mm_unpacklo_ps(f2, f3);
178  a3Val = _mm_unpackhi_ps(f2, f3);
179 
180  b0Val = _mm_loadu_ps(bPtr);
181  b1Val = _mm_loadu_ps(bPtr + 4);
182  b2Val = _mm_loadu_ps(bPtr + 8);
183  b3Val = _mm_loadu_ps(bPtr + 12);
184 
185  c0Val = _mm_mul_ps(a0Val, b0Val);
186  c1Val = _mm_mul_ps(a1Val, b1Val);
187  c2Val = _mm_mul_ps(a2Val, b2Val);
188  c3Val = _mm_mul_ps(a3Val, b3Val);
189 
190  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
191  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
192  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
193  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
194 
195  aPtr += 8;
196  bPtr += 16;
197  }
198 
199  _mm_empty(); // clear the mmx technology state
200 
201  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
202  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
203  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
204 
205  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
206 
207  _mm_store_ps(dotProductVector,
208  dotProdVal0); // Store the results back into the dot product vector
209 
210  *realpt = dotProductVector[0];
211  *imagpt = dotProductVector[1];
212  *realpt += dotProductVector[2];
213  *imagpt += dotProductVector[3];
214 
215  number = sixteenthPoints * 8;
216  for (; number < num_points; number++) {
217  *realpt += ((*aPtr) * (*bPtr++));
218  *imagpt += ((*aPtr++) * (*bPtr++));
219  }
220 
221  *result = *(lv_32fc_t*)(&res[0]);
222 }
223 
224 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
225 
226 
227 #if LV_HAVE_AVX2 && LV_HAVE_FMA
228 
229 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
230  const short* input,
231  const lv_32fc_t* taps,
232  unsigned int num_points)
233 {
234 
235  unsigned int number = 0;
236  const unsigned int sixteenthPoints = num_points / 16;
237 
238  float res[2];
239  float *realpt = &res[0], *imagpt = &res[1];
240  const short* aPtr = input;
241  const float* bPtr = (float*)taps;
242 
243  __m128i m0, m1;
244  __m256i f0, f1;
245  __m256 g0, g1, h0, h1, h2, h3;
246  __m256 a0Val, a1Val, a2Val, a3Val;
247  __m256 b0Val, b1Val, b2Val, b3Val;
248 
249  __m256 dotProdVal0 = _mm256_setzero_ps();
250  __m256 dotProdVal1 = _mm256_setzero_ps();
251  __m256 dotProdVal2 = _mm256_setzero_ps();
252  __m256 dotProdVal3 = _mm256_setzero_ps();
253 
254  for (; number < sixteenthPoints; number++) {
255 
256  m0 = _mm_loadu_si128((__m128i const*)aPtr);
257  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
258 
259  f0 = _mm256_cvtepi16_epi32(m0);
260  g0 = _mm256_cvtepi32_ps(f0);
261  f1 = _mm256_cvtepi16_epi32(m1);
262  g1 = _mm256_cvtepi32_ps(f1);
263 
264  h0 = _mm256_unpacklo_ps(g0, g0);
265  h1 = _mm256_unpackhi_ps(g0, g0);
266  h2 = _mm256_unpacklo_ps(g1, g1);
267  h3 = _mm256_unpackhi_ps(g1, g1);
268 
269  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
270  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
271  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
272  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
273 
274  b0Val = _mm256_loadu_ps(bPtr);
275  b1Val = _mm256_loadu_ps(bPtr + 8);
276  b2Val = _mm256_loadu_ps(bPtr + 16);
277  b3Val = _mm256_loadu_ps(bPtr + 24);
278 
279  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
280  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
281  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
282  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
283 
284  aPtr += 16;
285  bPtr += 32;
286  }
287 
288  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
289  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
290  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
291 
292  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
293 
294  _mm256_store_ps(dotProductVector,
295  dotProdVal0); // Store the results back into the dot product vector
296 
297  *realpt = dotProductVector[0];
298  *imagpt = dotProductVector[1];
299  *realpt += dotProductVector[2];
300  *imagpt += dotProductVector[3];
301  *realpt += dotProductVector[4];
302  *imagpt += dotProductVector[5];
303  *realpt += dotProductVector[6];
304  *imagpt += dotProductVector[7];
305 
306  number = sixteenthPoints * 16;
307  for (; number < num_points; number++) {
308  *realpt += ((*aPtr) * (*bPtr++));
309  *imagpt += ((*aPtr++) * (*bPtr++));
310  }
311 
312  *result = *(lv_32fc_t*)(&res[0]);
313 }
314 
315 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
316 
317 
318 #ifdef LV_HAVE_AVX2
319 
320 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
321  const short* input,
322  const lv_32fc_t* taps,
323  unsigned int num_points)
324 {
325 
326  unsigned int number = 0;
327  const unsigned int sixteenthPoints = num_points / 16;
328 
329  float res[2];
330  float *realpt = &res[0], *imagpt = &res[1];
331  const short* aPtr = input;
332  const float* bPtr = (float*)taps;
333 
334  __m128i m0, m1;
335  __m256i f0, f1;
336  __m256 g0, g1, h0, h1, h2, h3;
337  __m256 a0Val, a1Val, a2Val, a3Val;
338  __m256 b0Val, b1Val, b2Val, b3Val;
339  __m256 c0Val, c1Val, c2Val, c3Val;
340 
341  __m256 dotProdVal0 = _mm256_setzero_ps();
342  __m256 dotProdVal1 = _mm256_setzero_ps();
343  __m256 dotProdVal2 = _mm256_setzero_ps();
344  __m256 dotProdVal3 = _mm256_setzero_ps();
345 
346  for (; number < sixteenthPoints; number++) {
347 
348  m0 = _mm_loadu_si128((__m128i const*)aPtr);
349  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
350 
351  f0 = _mm256_cvtepi16_epi32(m0);
352  g0 = _mm256_cvtepi32_ps(f0);
353  f1 = _mm256_cvtepi16_epi32(m1);
354  g1 = _mm256_cvtepi32_ps(f1);
355 
356  h0 = _mm256_unpacklo_ps(g0, g0);
357  h1 = _mm256_unpackhi_ps(g0, g0);
358  h2 = _mm256_unpacklo_ps(g1, g1);
359  h3 = _mm256_unpackhi_ps(g1, g1);
360 
361  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
362  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
363  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
364  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
365 
366  b0Val = _mm256_loadu_ps(bPtr);
367  b1Val = _mm256_loadu_ps(bPtr + 8);
368  b2Val = _mm256_loadu_ps(bPtr + 16);
369  b3Val = _mm256_loadu_ps(bPtr + 24);
370 
371  c0Val = _mm256_mul_ps(a0Val, b0Val);
372  c1Val = _mm256_mul_ps(a1Val, b1Val);
373  c2Val = _mm256_mul_ps(a2Val, b2Val);
374  c3Val = _mm256_mul_ps(a3Val, b3Val);
375 
376  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
377  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
378  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
379  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
380 
381  aPtr += 16;
382  bPtr += 32;
383  }
384 
385  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
386  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
387  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
388 
389  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
390 
391  _mm256_store_ps(dotProductVector,
392  dotProdVal0); // Store the results back into the dot product vector
393 
394  *realpt = dotProductVector[0];
395  *imagpt = dotProductVector[1];
396  *realpt += dotProductVector[2];
397  *imagpt += dotProductVector[3];
398  *realpt += dotProductVector[4];
399  *imagpt += dotProductVector[5];
400  *realpt += dotProductVector[6];
401  *imagpt += dotProductVector[7];
402 
403  number = sixteenthPoints * 16;
404  for (; number < num_points; number++) {
405  *realpt += ((*aPtr) * (*bPtr++));
406  *imagpt += ((*aPtr++) * (*bPtr++));
407  }
408 
409  *result = *(lv_32fc_t*)(&res[0]);
410 }
411 
412 #endif /*LV_HAVE_AVX2*/
413 
414 
415 #if LV_HAVE_SSE && LV_HAVE_MMX
416 
417 
418 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
419  const short* input,
420  const lv_32fc_t* taps,
421  unsigned int num_points)
422 {
423 
424  unsigned int number = 0;
425  const unsigned int sixteenthPoints = num_points / 8;
426 
427  float res[2];
428  float *realpt = &res[0], *imagpt = &res[1];
429  const short* aPtr = input;
430  const float* bPtr = (float*)taps;
431 
432  __m64 m0, m1;
433  __m128 f0, f1, f2, f3;
434  __m128 a0Val, a1Val, a2Val, a3Val;
435  __m128 b0Val, b1Val, b2Val, b3Val;
436  __m128 c0Val, c1Val, c2Val, c3Val;
437 
438  __m128 dotProdVal0 = _mm_setzero_ps();
439  __m128 dotProdVal1 = _mm_setzero_ps();
440  __m128 dotProdVal2 = _mm_setzero_ps();
441  __m128 dotProdVal3 = _mm_setzero_ps();
442 
443  for (; number < sixteenthPoints; number++) {
444 
445  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
446  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
447  f0 = _mm_cvtpi16_ps(m0);
448  f1 = _mm_cvtpi16_ps(m0);
449  f2 = _mm_cvtpi16_ps(m1);
450  f3 = _mm_cvtpi16_ps(m1);
451 
452  a0Val = _mm_unpacklo_ps(f0, f1);
453  a1Val = _mm_unpackhi_ps(f0, f1);
454  a2Val = _mm_unpacklo_ps(f2, f3);
455  a3Val = _mm_unpackhi_ps(f2, f3);
456 
457  b0Val = _mm_load_ps(bPtr);
458  b1Val = _mm_load_ps(bPtr + 4);
459  b2Val = _mm_load_ps(bPtr + 8);
460  b3Val = _mm_load_ps(bPtr + 12);
461 
462  c0Val = _mm_mul_ps(a0Val, b0Val);
463  c1Val = _mm_mul_ps(a1Val, b1Val);
464  c2Val = _mm_mul_ps(a2Val, b2Val);
465  c3Val = _mm_mul_ps(a3Val, b3Val);
466 
467  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
468  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
469  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
470  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
471 
472  aPtr += 8;
473  bPtr += 16;
474  }
475 
476  _mm_empty(); // clear the mmx technology state
477 
478  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
479  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
480  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
481 
482  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
483 
484  _mm_store_ps(dotProductVector,
485  dotProdVal0); // Store the results back into the dot product vector
486 
487  *realpt = dotProductVector[0];
488  *imagpt = dotProductVector[1];
489  *realpt += dotProductVector[2];
490  *imagpt += dotProductVector[3];
491 
492  number = sixteenthPoints * 8;
493  for (; number < num_points; number++) {
494  *realpt += ((*aPtr) * (*bPtr++));
495  *imagpt += ((*aPtr++) * (*bPtr++));
496  }
497 
498  *result = *(lv_32fc_t*)(&res[0]);
499 }
500 
501 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
502 
503 #ifdef LV_HAVE_AVX2
504 
505 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
506  const short* input,
507  const lv_32fc_t* taps,
508  unsigned int num_points)
509 {
510 
511  unsigned int number = 0;
512  const unsigned int sixteenthPoints = num_points / 16;
513 
514  float res[2];
515  float *realpt = &res[0], *imagpt = &res[1];
516  const short* aPtr = input;
517  const float* bPtr = (float*)taps;
518 
519  __m128i m0, m1;
520  __m256i f0, f1;
521  __m256 g0, g1, h0, h1, h2, h3;
522  __m256 a0Val, a1Val, a2Val, a3Val;
523  __m256 b0Val, b1Val, b2Val, b3Val;
524  __m256 c0Val, c1Val, c2Val, c3Val;
525 
526  __m256 dotProdVal0 = _mm256_setzero_ps();
527  __m256 dotProdVal1 = _mm256_setzero_ps();
528  __m256 dotProdVal2 = _mm256_setzero_ps();
529  __m256 dotProdVal3 = _mm256_setzero_ps();
530 
531  for (; number < sixteenthPoints; number++) {
532 
533  m0 = _mm_load_si128((__m128i const*)aPtr);
534  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
535 
536  f0 = _mm256_cvtepi16_epi32(m0);
537  g0 = _mm256_cvtepi32_ps(f0);
538  f1 = _mm256_cvtepi16_epi32(m1);
539  g1 = _mm256_cvtepi32_ps(f1);
540 
541  h0 = _mm256_unpacklo_ps(g0, g0);
542  h1 = _mm256_unpackhi_ps(g0, g0);
543  h2 = _mm256_unpacklo_ps(g1, g1);
544  h3 = _mm256_unpackhi_ps(g1, g1);
545 
546  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
547  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
548  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
549  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
550 
551  b0Val = _mm256_load_ps(bPtr);
552  b1Val = _mm256_load_ps(bPtr + 8);
553  b2Val = _mm256_load_ps(bPtr + 16);
554  b3Val = _mm256_load_ps(bPtr + 24);
555 
556  c0Val = _mm256_mul_ps(a0Val, b0Val);
557  c1Val = _mm256_mul_ps(a1Val, b1Val);
558  c2Val = _mm256_mul_ps(a2Val, b2Val);
559  c3Val = _mm256_mul_ps(a3Val, b3Val);
560 
561  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
562  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
563  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
564  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
565 
566  aPtr += 16;
567  bPtr += 32;
568  }
569 
570  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
571  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
572  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
573 
574  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
575 
576  _mm256_store_ps(dotProductVector,
577  dotProdVal0); // Store the results back into the dot product vector
578 
579  *realpt = dotProductVector[0];
580  *imagpt = dotProductVector[1];
581  *realpt += dotProductVector[2];
582  *imagpt += dotProductVector[3];
583  *realpt += dotProductVector[4];
584  *imagpt += dotProductVector[5];
585  *realpt += dotProductVector[6];
586  *imagpt += dotProductVector[7];
587 
588  number = sixteenthPoints * 16;
589  for (; number < num_points; number++) {
590  *realpt += ((*aPtr) * (*bPtr++));
591  *imagpt += ((*aPtr++) * (*bPtr++));
592  }
593 
594  *result = *(lv_32fc_t*)(&res[0]);
595 }
596 
597 
598 #endif /*LV_HAVE_AVX2*/
599 
600 #if LV_HAVE_AVX2 && LV_HAVE_FMA
601 
602 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
603  const short* input,
604  const lv_32fc_t* taps,
605  unsigned int num_points)
606 {
607 
608  unsigned int number = 0;
609  const unsigned int sixteenthPoints = num_points / 16;
610 
611  float res[2];
612  float *realpt = &res[0], *imagpt = &res[1];
613  const short* aPtr = input;
614  const float* bPtr = (float*)taps;
615 
616  __m128i m0, m1;
617  __m256i f0, f1;
618  __m256 g0, g1, h0, h1, h2, h3;
619  __m256 a0Val, a1Val, a2Val, a3Val;
620  __m256 b0Val, b1Val, b2Val, b3Val;
621 
622  __m256 dotProdVal0 = _mm256_setzero_ps();
623  __m256 dotProdVal1 = _mm256_setzero_ps();
624  __m256 dotProdVal2 = _mm256_setzero_ps();
625  __m256 dotProdVal3 = _mm256_setzero_ps();
626 
627  for (; number < sixteenthPoints; number++) {
628 
629  m0 = _mm_load_si128((__m128i const*)aPtr);
630  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
631 
632  f0 = _mm256_cvtepi16_epi32(m0);
633  g0 = _mm256_cvtepi32_ps(f0);
634  f1 = _mm256_cvtepi16_epi32(m1);
635  g1 = _mm256_cvtepi32_ps(f1);
636 
637  h0 = _mm256_unpacklo_ps(g0, g0);
638  h1 = _mm256_unpackhi_ps(g0, g0);
639  h2 = _mm256_unpacklo_ps(g1, g1);
640  h3 = _mm256_unpackhi_ps(g1, g1);
641 
642  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
643  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
644  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
645  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
646 
647  b0Val = _mm256_load_ps(bPtr);
648  b1Val = _mm256_load_ps(bPtr + 8);
649  b2Val = _mm256_load_ps(bPtr + 16);
650  b3Val = _mm256_load_ps(bPtr + 24);
651 
652  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
653  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
654  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
655  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
656 
657  aPtr += 16;
658  bPtr += 32;
659  }
660 
661  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
662  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
663  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
664 
665  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
666 
667  _mm256_store_ps(dotProductVector,
668  dotProdVal0); // Store the results back into the dot product vector
669 
670  *realpt = dotProductVector[0];
671  *imagpt = dotProductVector[1];
672  *realpt += dotProductVector[2];
673  *imagpt += dotProductVector[3];
674  *realpt += dotProductVector[4];
675  *imagpt += dotProductVector[5];
676  *realpt += dotProductVector[6];
677  *imagpt += dotProductVector[7];
678 
679  number = sixteenthPoints * 16;
680  for (; number < num_points; number++) {
681  *realpt += ((*aPtr) * (*bPtr++));
682  *imagpt += ((*aPtr++) * (*bPtr++));
683  }
684 
685  *result = *(lv_32fc_t*)(&res[0]);
686 }
687 
688 
689 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
690 
691 
692 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
Definition: sse2neon.h:1459
FORCE_INLINE void _mm_empty(void)
Definition: sse2neon.h:1027
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
int64x1_t __m64
Definition: sse2neon.h:234
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:88
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:54
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13