/*
 * Vector Optimized Library of Kernels (VOLK) 3.0.0
 * Architecture-tuned implementations of math kernels
 * volk_32fc_s32fc_multiply_32fc.h
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Unaligned AVX+FMA version: 4 complex points (8 floats)
 * per iteration, scalar tail for the remaining 0-3 points. */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    /* Broadcast the scalar's real and imaginary parts across all lanes. */
    const __m256 s_re = _mm256_set1_ps(lv_creal(scalar));
    const __m256 s_im = _mm256_set1_ps(lv_cimag(scalar));

    for (number = 0; number < quarterPoints; number++) {
        /* a = ar0,ai0,ar1,ai1,... */
        __m256 a = _mm256_loadu_ps((float*)in);
        /* swapped = ai0,ar0,ai1,ar1,... */
        __m256 swapped = _mm256_shuffle_ps(a, a, 0xB1);
        /* cross = ai*si, ar*si, ... */
        __m256 cross = _mm256_mul_ps(swapped, s_im);
        /* fmaddsub: even lanes a*s_re - cross, odd lanes a*s_re + cross,
         * i.e. ar*sr - ai*si (real part), ai*sr + ar*si (imag part). */
        __m256 prod = _mm256_fmaddsub_ps(a, s_re, cross);
        _mm256_storeu_ps((float*)out, prod);

        in += 4;
        out += 4;
    }

    /* Scalar tail for the leftover points. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Unaligned AVX version: 4 complex points (8 floats)
 * per iteration, scalar tail for the remaining 0-3 points. */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    /* Broadcast the scalar's real and imaginary parts across all lanes. */
    const __m256 s_re = _mm256_set1_ps(lv_creal(scalar));
    const __m256 s_im = _mm256_set1_ps(lv_cimag(scalar));

    for (number = 0; number < quarterPoints; number++) {
        /* a = ar0,ai0,ar1,ai1,... */
        __m256 a = _mm256_loadu_ps((float*)in);
        /* direct = ar*sr, ai*sr, ... */
        __m256 direct = _mm256_mul_ps(a, s_re);
        /* swapped = ai0,ar0,ai1,ar1,... */
        __m256 swapped = _mm256_shuffle_ps(a, a, 0xB1);
        /* cross = ai*si, ar*si, ... */
        __m256 cross = _mm256_mul_ps(swapped, s_im);
        /* addsub: even lanes subtract, odd lanes add, giving
         * ar*sr - ai*si (real part), ai*sr + ar*si (imag part). */
        _mm256_storeu_ps((float*)out, _mm256_addsub_ps(direct, cross));

        in += 4;
        out += 4;
    }

    /* Scalar tail for the leftover points. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Unaligned SSE3 version: 2 complex points (4 floats)
 * per iteration, scalar handling for an odd final point. */
static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Broadcast the scalar's real and imaginary parts.
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        // addsub: even lanes subtract, odd lanes add ->
        // ar*sr - ai*si, ai*sr + ar*si (complex products)
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    // If num_points is odd, one point remains.
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_GENERIC

/* Portable fallback: multiply every complex float in aVector by the complex
 * scalar and write the products to cVector. Manually unrolled by 8. */
static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t scalar,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0)
        *cPtr++ = *aPtr++ * scalar;
}
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Aligned AVX+FMA version: both pointers must be 32-byte
 * aligned; 4 complex points per iteration, scalar tail for the rest. */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    /* Broadcast the scalar's real and imaginary parts across all lanes. */
    const __m256 s_re = _mm256_set1_ps(lv_creal(scalar));
    const __m256 s_im = _mm256_set1_ps(lv_cimag(scalar));

    for (number = 0; number < quarterPoints; number++) {
        /* a = ar0,ai0,ar1,ai1,... */
        __m256 a = _mm256_load_ps((float*)in);
        /* swapped = ai0,ar0,ai1,ar1,... */
        __m256 swapped = _mm256_shuffle_ps(a, a, 0xB1);
        /* cross = ai*si, ar*si, ... */
        __m256 cross = _mm256_mul_ps(swapped, s_im);
        /* fmaddsub: even lanes a*s_re - cross, odd lanes a*s_re + cross,
         * i.e. ar*sr - ai*si (real part), ai*sr + ar*si (imag part). */
        __m256 prod = _mm256_fmaddsub_ps(a, s_re, cross);
        _mm256_store_ps((float*)out, prod);

        in += 4;
        out += 4;
    }

    /* Scalar tail for the leftover points. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Aligned AVX version: both pointers must be 32-byte
 * aligned; 4 complex points per iteration, scalar tail for the rest. */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    /* Broadcast the scalar's real and imaginary parts across all lanes. */
    const __m256 s_re = _mm256_set1_ps(lv_creal(scalar));
    const __m256 s_im = _mm256_set1_ps(lv_cimag(scalar));

    for (number = 0; number < quarterPoints; number++) {
        /* a = ar0,ai0,ar1,ai1,... */
        __m256 a = _mm256_load_ps((float*)in);
        /* direct = ar*sr, ai*sr, ... */
        __m256 direct = _mm256_mul_ps(a, s_re);
        /* swapped = ai0,ar0,ai1,ar1,... */
        __m256 swapped = _mm256_shuffle_ps(a, a, 0xB1);
        /* cross = ai*si, ar*si, ... */
        __m256 cross = _mm256_mul_ps(swapped, s_im);
        /* addsub: even lanes subtract, odd lanes add, giving
         * ar*sr - ai*si (real part), ai*sr + ar*si (imag part). */
        _mm256_store_ps((float*)out, _mm256_addsub_ps(direct, cross));

        in += 4;
        out += 4;
    }

    /* Scalar tail for the leftover points. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Aligned SSE3 version: both pointers must be 16-byte
 * aligned; 2 complex points per iteration, scalar handling for an odd point. */
static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Broadcast the scalar's real and imaginary parts.
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        // addsub: even lanes subtract, odd lanes add ->
        // ar*sr - ai*si, ai*sr + ar*si (complex products)
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    // If num_points is odd, one point remains.
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* Multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. NEON version: vld2q deinterleaves 4 complex points into
 * separate real/imag vectors; scalar tail handles the remaining 0-3 points. */
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    const unsigned int quarter_points = num_points / 4;
    unsigned int idx;

    /* Broadcast the scalar's real part (first float) and imaginary part
     * (second float) into their own vectors. */
    float32x4_t s_re = vld1q_dup_f32((const float*)&scalar);
    float32x4_t s_im = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (idx = 0; idx < quarter_points; ++idx) {
        /* vld2q deinterleaves: val[0] holds four reals, val[1] four imags. */
        float32x4x2_t a = vld2q_f32((float*)in);
        float32x4x2_t prod;

        /* (ar + j*ai)*(sr + j*si):
         * real = ar*sr - ai*si, imag = ai*sr + ar*si */
        prod.val[1] = vmulq_f32(a.val[1], s_re);
        prod.val[0] = vmulq_f32(a.val[0], s_re);

        prod.val[1] = vmlaq_f32(prod.val[1], a.val[0], s_im);
        prod.val[0] = vmlsq_f32(prod.val[0], a.val[1], s_im);

        vst2q_f32((float*)out, prod); /* re-interleave on store */
        in += 4;
        out += 4;
    }

    /* Scalar tail. */
    for (idx = quarter_points * 4; idx < num_points; ++idx) {
        *out++ = *in++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

/* Portable fallback (aligned variant; identical to the unaligned generic):
 * multiply every complex float in aVector by the complex scalar and write the
 * products to cVector. Manually unrolled by 8. */
static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0)
        *cPtr++ = *aPtr++ * scalar;
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
/*
 * Doxygen-generated cross-reference index (retained from the extracted page,
 * commented out so the header remains compilable):
 *
 * float32x4_t __m128                                  — sse2neon.h:235
 * FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) — sse2neon.h:6496
 * #define _mm_shuffle_ps(a, b, imm)                   — sse2neon.h:2586
 * FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) — sse2neon.h:2787
 * FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)  — sse2neon.h:2205
 * FORCE_INLINE __m128 _mm_set_ps1(float)              — sse2neon.h:2437
 * FORCE_INLINE __m128 _mm_loadu_ps(const float *p)    — sse2neon.h:1941
 * FORCE_INLINE __m128 _mm_load_ps(const float *p)     — sse2neon.h:1858
 * FORCE_INLINE void _mm_store_ps(float *p, __m128 a)  — sse2neon.h:2704
 * volk_32fc_s32fc_multiply_32fc_a_generic             — this file, line 415
 * volk_32fc_s32fc_multiply_32fc_generic               — this file, line 205
 * volk_32fc_s32fc_multiply_32fc_u_avx                 — this file, line 118
 * volk_32fc_s32fc_multiply_32fc_u_sse3                — this file, line 162
 * volk_32fc_s32fc_multiply_32fc_a_avx                 — this file, line 291
 * volk_32fc_s32fc_multiply_32fc_a_sse3                — this file, line 335
 * volk_32fc_s32fc_multiply_32fc_neon                  — this file, line 379
 * #define lv_cimag(x)                                 — volk_complex.h:98
 * #define lv_creal(x)                                 — volk_complex.h:96
 * float complex lv_32fc_t                             — volk_complex.h:74
 */