Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
59 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
60 
61 #include <float.h>
62 #include <inttypes.h>
63 #include <stdio.h>
64 #include <volk/volk_complex.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 #include <volk/volk_avx_intrinsics.h>
69 
71  const lv_32fc_t* aVector,
72  const lv_32fc_t* bVector,
73  unsigned int num_points)
74 {
75  unsigned int number = 0;
76  const unsigned int quarterPoints = num_points / 4;
77 
78  __m256 x, y, z;
79  lv_32fc_t* c = cVector;
80  const lv_32fc_t* a = aVector;
81  const lv_32fc_t* b = bVector;
82 
83  for (; number < quarterPoints; number++) {
84  x = _mm256_loadu_ps(
85  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
86  y = _mm256_loadu_ps(
87  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
89  _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
90 
91  a += 4;
92  b += 4;
93  c += 4;
94  }
95 
96  number = quarterPoints * 4;
97 
98  for (; number < num_points; number++) {
99  *c++ = (*a++) * lv_conj(*b++);
100  }
101 }
102 #endif /* LV_HAVE_AVX */
103 
104 
105 #ifdef LV_HAVE_SSE3
106 #include <pmmintrin.h>
108 
110  const lv_32fc_t* aVector,
111  const lv_32fc_t* bVector,
112  unsigned int num_points)
113 {
114  unsigned int number = 0;
115  const unsigned int halfPoints = num_points / 2;
116 
117  __m128 x, y, z;
118  lv_32fc_t* c = cVector;
119  const lv_32fc_t* a = aVector;
120  const lv_32fc_t* b = bVector;
121 
122  for (; number < halfPoints; number++) {
123  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
124  y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
125  z = _mm_complexconjugatemul_ps(x, y);
126  _mm_storeu_ps((float*)c, z); // Store the results back into the C container
127 
128  a += 2;
129  b += 2;
130  c += 2;
131  }
132 
133  if ((num_points % 2) != 0) {
134  *c = (*a) * lv_conj(*b);
135  }
136 }
137 #endif /* LV_HAVE_SSE3 */
138 
139 
140 #ifdef LV_HAVE_GENERIC
141 
143  const lv_32fc_t* aVector,
144  const lv_32fc_t* bVector,
145  unsigned int num_points)
146 {
147  lv_32fc_t* cPtr = cVector;
148  const lv_32fc_t* aPtr = aVector;
149  const lv_32fc_t* bPtr = bVector;
150  unsigned int number = 0;
151 
152  for (number = 0; number < num_points; number++) {
153  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
154  }
155 }
156 #endif /* LV_HAVE_GENERIC */
157 
158 
159 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
160 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
161 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
162 
163 #include <float.h>
164 #include <inttypes.h>
165 #include <stdio.h>
166 #include <volk/volk_complex.h>
167 
168 #ifdef LV_HAVE_AVX
169 #include <immintrin.h>
171 
173  const lv_32fc_t* aVector,
174  const lv_32fc_t* bVector,
175  unsigned int num_points)
176 {
177  unsigned int number = 0;
178  const unsigned int quarterPoints = num_points / 4;
179 
180  __m256 x, y, z;
181  lv_32fc_t* c = cVector;
182  const lv_32fc_t* a = aVector;
183  const lv_32fc_t* b = bVector;
184 
185  for (; number < quarterPoints; number++) {
186  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
187  y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
189  _mm256_store_ps((float*)c, z); // Store the results back into the C container
190 
191  a += 4;
192  b += 4;
193  c += 4;
194  }
195 
196  number = quarterPoints * 4;
197 
198  for (; number < num_points; number++) {
199  *c++ = (*a++) * lv_conj(*b++);
200  }
201 }
202 #endif /* LV_HAVE_AVX */
203 
204 
205 #ifdef LV_HAVE_SSE3
206 #include <pmmintrin.h>
208 
210  const lv_32fc_t* aVector,
211  const lv_32fc_t* bVector,
212  unsigned int num_points)
213 {
214  unsigned int number = 0;
215  const unsigned int halfPoints = num_points / 2;
216 
217  __m128 x, y, z;
218  lv_32fc_t* c = cVector;
219  const lv_32fc_t* a = aVector;
220  const lv_32fc_t* b = bVector;
221 
222  for (; number < halfPoints; number++) {
223  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
224  y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
225  z = _mm_complexconjugatemul_ps(x, y);
226  _mm_store_ps((float*)c, z); // Store the results back into the C container
227 
228  a += 2;
229  b += 2;
230  c += 2;
231  }
232 
233  if ((num_points % 2) != 0) {
234  *c = (*a) * lv_conj(*b);
235  }
236 }
237 #endif /* LV_HAVE_SSE3 */
238 
239 
240 #ifdef LV_HAVE_NEON
241 #include <arm_neon.h>
242 
/*
 * NEON kernel: cVector[i] = aVector[i] * conj(bVector[i]).
 *
 * cVector    - output buffer; receives num_points complex products
 * aVector    - first input
 * bVector    - second input; each element is conjugated before the multiply
 * num_points - number of complex samples to process
 *
 * Four samples are processed per iteration using de-interleaving loads
 * (vld2q_f32), so real and imaginary parts land in separate registers.
 */
static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
                                                             const lv_32fc_t* aVector,
                                                             const lv_32fc_t* bVector,
                                                             unsigned int num_points)
{
    /* Keep the inputs const-qualified; nothing here writes through them. */
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    lv_32fc_t* c_ptr = cVector;
    const unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((const float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((const float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        // Conjugate b by flipping the sign of its imaginary lanes; the rest
        // is then an ordinary complex multiply.
        b_val.val[1] = vnegq_f32(b_val.val[1]);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i (b imaginary already negated)
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        // combine and store the results (re-interleaved by vst2q_f32)
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)c_ptr, c_val);

        a_ptr += 4;
        b_ptr += 4;
        c_ptr += 4;
    }

    // Scalar tail. Use lv_conj like every other kernel in this file; plain
    // conj() is the double-precision C function and is not the project's
    // portable conjugate wrapper.
    for (number = quarter_points * 4; number < num_points; number++) {
        *c_ptr++ = (*a_ptr++) * lv_conj(*b_ptr++);
    }
}
288 #endif /* LV_HAVE_NEON */
289 
290 
291 #ifdef LV_HAVE_GENERIC
292 
293 static inline void
295  const lv_32fc_t* aVector,
296  const lv_32fc_t* bVector,
297  unsigned int num_points)
298 {
299  lv_32fc_t* cPtr = cVector;
300  const lv_32fc_t* aPtr = aVector;
301  const lv_32fc_t* bPtr = bVector;
302  unsigned int number = 0;
303 
304  for (number = 0; number < num_points; number++) {
305  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
306  }
307 }
308 #endif /* LV_HAVE_GENERIC */
309 
310 
311 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:109
static void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:209
static void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:142
static void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:243
static void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:70
static void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:294
static void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:172
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_conj(x)
Definition: volk_complex.h:100
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31