Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_x2_multiply_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

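/* Reference implementation: multiplies two vectors of 16-bit complex samples
 * element by element, out[n] = in_a[n] * in_b[n], i.e.
 * (a + bi)(c + di) = (ac - bd) + (ad + bc)i. Note that lv_16sc_t stores the
 * real and imaginary parts as 16-bit integers, so results outside the int16
 * range are truncated on store. */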
static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

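    /* Each 128-bit register holds four complex samples as interleaved 16-bit
     * (real, imag) lanes. _mm_mullo_epi16 gives the lane-wise r*r and i*i
     * products; shifting that result right by two bytes lines each i*i product
     * up under its r*r neighbour, so a subtract yields the real parts. Shifting
     * one input left by two bytes before a second multiply produces the r*i and
     * i*r cross products, whose sum gives the imaginary parts. The masks keep
     * only the lane belonging to each component before the halves are OR-ed. */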
    for (number = 0; number < sse_iters; number++) {
        a = _mm_load_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                     // zeros, and store the results in dst.
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real,
                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

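    /* Same lane-shuffle algorithm as the aligned kernel above; the only
     * difference is the use of unaligned loads and stores (_mm_loadu_si128 /
     * _mm_storeu_si128), so the buffers need not be 16-byte aligned. */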
    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                     // zeros, and store the results in dst.
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real,
                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

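    /* Same approach as the SSE2 kernels, widened to eight complex samples per
     * iteration. Note that _mm256_slli_si256 / _mm256_srli_si256 shift each
     * 128-bit lane independently, which is harmless here because every
     * (real, imag) pair stays within its own 128-bit lane. */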
    for (; number < avx2_points; number++) {
        a = _mm256_loadu_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_loadu_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
                                        // zeros, and store the results in dst.
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

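    /* Aligned variant of the AVX2 kernel above: identical arithmetic, but the
     * 32-byte loads and stores (_mm256_load_si256 / _mm256_store_si256) require
     * the input and output buffers to be 32-byte aligned. */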
    for (; number < avx2_points; number++) {
        a = _mm256_load_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_load_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
                                        // zeros, and store the results in dst.
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

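    /* vld2_s16 de-interleaves each group of four complex samples into separate
     * real and imaginary vectors, so the textbook formulas apply directly.
     * Note that vmul_s16 / vadd_s16 / vsub_s16 wrap on overflow, whereas the
     * SSE2/AVX2 kernels use saturating adds and subtracts. */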
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // store the results
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
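
For context, a minimal usage sketch (not part of this header): applications normally call the kernel through the VOLK dispatcher of the same name, which selects the best implementation for the host CPU at runtime. The allocation helpers and complex accessors used below (volk_get_alignment, volk_malloc, volk_free, lv_cmake, lv_creal, lv_cimag) follow standard VOLK conventions; the surrounding program is illustrative only.

// Illustrative sketch of calling this kernel via the VOLK dispatcher.
#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 16;
    size_t alignment = volk_get_alignment();

    // Aligned buffers so the dispatcher may pick an aligned (a_) kernel.
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* out = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);

    for (unsigned int n = 0; n < N; n++) {
        in_a[n] = lv_cmake((int16_t)n, (int16_t)1);    // n + 1j
        in_b[n] = lv_cmake((int16_t)2, (int16_t)(-1)); // 2 - 1j
    }

    // Dispatches to generic, SSE2, AVX2, or NEON depending on the CPU.
    volk_16ic_x2_multiply_16ic(out, in_a, in_b, N);

    for (unsigned int n = 0; n < N; n++) {
        printf("out[%u] = %d + %dj\n", n, (int)lv_creal(out[n]), (int)lv_cimag(out[n]));
    }

    volk_free(in_a);
    volk_free(in_b);
    volk_free(out);
    return 0;
}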