volk_16ic_x2_dot_prod_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

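/*
 * Illustrative usage sketch: it assumes the generated VOLK dispatcher
 * volk_16ic_x2_dot_prod_16ic() declared in <volk/volk.h>, plus
 * volk_get_alignment(), volk_malloc() and volk_free(); the buffer length and
 * sample values are made up for the example.
 */
#if 0 /* example only, not compiled as part of this header */
#include <volk/volk.h>

void example_usage(void)
{
    unsigned int num_points = 1024;
    size_t alignment = volk_get_alignment();
    lv_16sc_t* a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t* b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t result;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        a[i] = lv_cmake((int16_t)1, (int16_t)2);
        b[i] = lv_cmake((int16_t)3, (int16_t)-1);
    }

    /* result = sum over n of a[n] * b[n], accumulated with 16-bit saturation */
    volk_16ic_x2_dot_prod_16ic(&result, a, b, num_points);

    volk_free(a);
    volk_free(b);
}
#endif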
#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/
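
/*
 * The generic kernel accumulates with sat_adds16i(), so each partial sum clamps
 * at the int16_t limits instead of wrapping. The sketch below only illustrates
 * that clamping behaviour; it is not the library's implementation from
 * <volk/saturation_arithmetic.h>, and the helper name example_sat_adds16i is
 * invented for this example.
 */
#if 0 /* example only */
#include <stdint.h>

static int16_t example_sat_adds16i(int16_t x, int16_t y)
{
    int32_t sum = (int32_t)x + (int32_t)y; /* widen so the true sum is visible */
    if (sum > INT16_MAX) {
        return INT16_MAX; /* e.g. example_sat_adds16i(30000, 10000) == 32767 */
    }
    if (sum < INT16_MIN) {
        return INT16_MIN;
    }
    return (int16_t)sum;
}
#endif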


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
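
/*
 * Per-sample arithmetic behind the lane shuffling above: with a = ar + j*ai and
 * b = br + j*bi stored as interleaved int16_t pairs, the even lanes of
 * _mm_subs_epi16(c, c_sr) carry ar*br - ai*bi and the odd lanes of
 * _mm_adds_epi16(imag1, imag2) carry ar*bi + ai*br. The scalar sketch below is
 * illustrative only (plain wrapping arithmetic, unlike the kernel's saturating
 * subtract/add), and the function name is invented for the example.
 */
#if 0 /* example only */
#include <stdint.h>

static void example_complex_mul_s16(int16_t ar, int16_t ai,
                                    int16_t br, int16_t bi,
                                    int16_t* re, int16_t* im)
{
    *re = (int16_t)(ar * br - ai * bi); /* what the even (real) lanes accumulate */
    *im = (int16_t)(ar * bi + ai * br); /* what the odd (imag) lanes accumulate  */
}
#endif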


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_loadu_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector,
                         result); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */
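
/*
 * volk_16ic_x2_dot_prod_16ic_a_sse2() uses aligned loads (_mm_load_si128) and
 * therefore needs 16-byte-aligned inputs, while the _u_ variant above uses the
 * unaligned forms. The VOLK dispatcher normally makes this choice; the manual
 * selection sketched below is only an illustration, and the helper name is
 * invented for the example.
 */
#if 0 /* example only */
#include <stdint.h>

static void example_pick_sse2_kernel(lv_16sc_t* out,
                                     const lv_16sc_t* in_a,
                                     const lv_16sc_t* in_b,
                                     unsigned int num_points)
{
    /* both input buffers 16-byte aligned -> aligned kernel, else unaligned */
    if ((((uintptr_t)in_a | (uintptr_t)in_b) & 0xF) == 0) {
        volk_16ic_x2_dot_prod_16ic_a_sse2(out, in_a, in_b, num_points);
    } else {
        volk_16ic_x2_dot_prod_16ic_u_sse2(out, in_a, in_b, num_points);
    }
}
#endif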


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector,
                            result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */
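
/*
 * mask_real keeps the even 16-bit lanes (running real sums) and mask_imag keeps
 * the odd lanes (running imaginary sums), so OR-ing the two masked accumulators
 * re-interleaves them into (real, imag) pairs before the store above. A scalar
 * model of that AND/OR recombination for one 32-bit lane pair is sketched
 * below; illustrative only, with an invented function name.
 */
#if 0 /* example only */
#include <stdint.h>

static uint32_t example_recombine_pair(uint32_t realcacc_pair, uint32_t imagcacc_pair)
{
    uint32_t real_kept = realcacc_pair & 0x0000FFFFu; /* low lane  = real sum */
    uint32_t imag_kept = imagcacc_pair & 0xFFFF0000u; /* high lane = imag sum */
    return real_kept | imag_kept;                     /* interleaved (re, im) */
}
#endif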


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector,
                           result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */
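
/*
 * A quick way to sanity-check a SIMD variant is to compare it against the
 * generic kernel on the same data. The sketch below does that for the aligned
 * AVX2 path; it assumes volk_malloc()/volk_free()/volk_get_alignment() from
 * <volk/volk.h>, a build with LV_HAVE_GENERIC and LV_HAVE_AVX2 defined, and
 * uses rand() for small test values so no intermediate sum saturates (the
 * saturating accumulation order differs between the paths). The function name
 * is invented for the example.
 */
#if 0 /* example only */
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk.h>

static int example_check_a_avx2(unsigned int num_points)
{
    size_t alignment = volk_get_alignment();
    lv_16sc_t* a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t* b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
    lv_16sc_t ref, simd;
    unsigned int i;
    int ok;

    for (i = 0; i < num_points; i++) {
        a[i] = lv_cmake((int16_t)(rand() % 17 - 8), (int16_t)(rand() % 17 - 8));
        b[i] = lv_cmake((int16_t)(rand() % 17 - 8), (int16_t)(rand() % 17 - 8));
    }

    volk_16ic_x2_dot_prod_16ic_generic(&ref, a, b, num_points);
    volk_16ic_x2_dot_prod_16ic_a_avx2(&simd, a, b, num_points);

    ok = (lv_creal(ref) == lv_creal(simd)) && (lv_cimag(ref) == lv_cimag(simd));
    printf("a_avx2 %s generic\n", ok ? "matches" : "differs from");

    volk_free(a);
    volk_free(b);
    return ok;
}
#endif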


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // for 2-lane vectors, 1st lane holds the real part,
        // 2nd lane holds the imaginary part
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // multiply the real*real and imag*imag to get real result
            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // Multiply cross terms to get the imaginary result
            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
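
/*
 * vld2_s16() de-interleaves on load: val[0] receives the four real parts and
 * val[1] the four imaginary parts of four consecutive lv_16sc_t samples, which
 * is what lets the kernel operate on "all real" and "all imag" vectors. A
 * scalar model of that split is sketched below; illustrative only, with an
 * invented function name.
 */
#if 0 /* example only */
#include <stdint.h>

static void example_deinterleave4(const int16_t* interleaved, /* r0,i0,r1,i1,... */
                                  int16_t real_out[4],
                                  int16_t imag_out[4])
{
    int k;
    for (k = 0; k < 4; k++) {
        real_out[k] = interleaved[2 * k];     /* lane order of a_val.val[0] */
        imag_out[k] = interleaved[2 * k + 1]; /* lane order of a_val.val[1] */
    }
}
#endif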


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
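
/*
 * The _vma variant folds the four multiplies into two multiplies plus a
 * multiply-subtract (vmls_s16) and a multiply-accumulate (vmla_s16):
 * real = ar*br - ai*bi and imag = ai*br + ar*bi per lane. Note that these lane
 * products wrap on overflow; only the running sums use vqadd_s16. The scalar
 * equivalent of one lane is sketched below; illustrative only, with an
 * invented function name.
 */
#if 0 /* example only */
#include <stdint.h>

static void example_vma_lane(int16_t ar, int16_t ai, int16_t br, int16_t bi,
                             int16_t* re, int16_t* im)
{
    int16_t t0 = (int16_t)(ar * br); /* vmul_s16(a_val.val[0], b_val.val[0]) */
    int16_t t1 = (int16_t)(ai * br); /* vmul_s16(a_val.val[1], b_val.val[0]) */
    *re = (int16_t)(t0 - ai * bi);   /* vmls_s16: t0 - ai*bi */
    *im = (int16_t)(t1 + ar * bi);   /* vmla_s16: t1 + ar*bi */
}
#endif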


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                          const lv_16sc_t* in_a,
                                                          const lv_16sc_t* in_b,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;

    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
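
/*
 * The _optvma variant keeps two independent accumulator pairs so the
 * multiply-accumulate of one iteration does not have to wait for the result of
 * the previous one, and only merges them (with saturation) after the loop. The
 * same idea in scalar form is sketched below; illustrative only, with an
 * invented function name.
 */
#if 0 /* example only */
#include <stdint.h>

static int32_t example_two_accumulator_sum(const int16_t* x, unsigned int n)
{
    int32_t acc1 = 0;
    int32_t acc2 = 0;
    unsigned int k;
    /* even/odd elements go to independent accumulators, breaking the chain */
    for (k = 0; k + 1 < n; k += 2) {
        acc1 += x[k];
        acc2 += x[k + 1];
    }
    if (k < n) {
        acc1 += x[k]; /* tail element when n is odd */
    }
    return acc1 + acc2; /* merged once at the end */
}
#endif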

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/