Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
73 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
74 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
75 
76 #include <float.h>
77 #include <inttypes.h>
78 #include <stdio.h>
79 #include <volk/volk_complex.h>
80 
81 
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Computes c[i] = a[i] + conj(b[i]) * scalar for each complex sample.
 *
 * Portable reference implementation (no SIMD).
 *
 * \param cVector    Output buffer of num_points complex results.
 * \param aVector    Input buffer of complex addends.
 * \param bVector    Input buffer of complex values to be conjugated and scaled.
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
115 
116 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX implementation of c[i] = a[i] + conj(b[i]) * scalar (unaligned loads/stores).
 *
 * Processes 4 complex floats (8 floats, one __m256) per iteration; any
 * remaining 1-3 samples are handled scalar-wise.
 *
 * \param cVector    Output buffer of num_points complex results.
 * \param aVector    Input buffer of complex addends.
 * \param bVector    Input buffer of complex values to be conjugated and scaled.
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover samples after the SIMD loop

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
160 
161 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 implementation of c[i] = a[i] + conj(b[i]) * scalar (unaligned loads/stores).
 *
 * Processes 2 complex floats (4 floats, one __m128) per iteration; a single
 * leftover sample is handled scalar-wise.
 *
 * \param cVector    Output buffer of num_points complex results.
 * \param aVector    Input buffer of complex addends.
 * \param bVector    Input buffer of complex values to be conjugated and scaled.
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // odd num_points: one final scalar sample
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
203 
204 
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX implementation of c[i] = a[i] + conj(b[i]) * scalar (aligned loads/stores).
 *
 * Requires aVector, bVector and cVector to be 32-byte aligned. Processes 4
 * complex floats per iteration; any remaining 1-3 samples are handled
 * scalar-wise.
 *
 * \param cVector    Output buffer of num_points complex results (32-byte aligned).
 * \param aVector    Input buffer of complex addends (32-byte aligned).
 * \param bVector    Input buffer of complex values to be conjugated and scaled (32-byte aligned).
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover samples after the SIMD loop

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
248 
249 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 implementation of c[i] = a[i] + conj(b[i]) * scalar (aligned loads/stores).
 *
 * Requires aVector, bVector and cVector to be 16-byte aligned. Processes 2
 * complex floats per iteration; a single leftover sample is handled
 * scalar-wise.
 *
 * \param cVector    Output buffer of num_points complex results (16-byte aligned).
 * \param aVector    Input buffer of complex addends (16-byte aligned).
 * \param bVector    Input buffer of complex values to be conjugated and scaled (16-byte aligned).
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // odd num_points: one final scalar sample
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
291 
292 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON implementation of c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Uses de-interleaved loads (vld2q_f32) so val[0] holds real parts and
 * val[1] holds imaginary parts; processes 4 complex floats per iteration and
 * finishes any remainder scalar-wise.
 *
 * \param cVector    Output buffer of num_points complex results.
 * \param aVector    Input buffer of complex addends.
 * \param bVector    Input buffer of complex values to be conjugated and scaled.
 * \param scalar     Complex scalar multiplied with each conjugated b sample.
 * \param num_points Number of complex samples to process.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t scalar,
                                                    unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast the scalar's real (val[0]) and imaginary (val[1]) parts.
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        // Conjugate b by negating its imaginary lane.
        b_val.val[1] = vnegq_f32(b_val.val[1]);
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        // Complex multiply: (b_r + j*b_i') * (s_r + j*s_i)
        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        // Add a.
        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Finish the 0-3 leftover samples.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_NEON */
343 
344 #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:210
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:85
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:167
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:255
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:122
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:297
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_conj(x)
Definition: volk_complex.h:100
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31