Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
61 
62 #include <float.h>
63 #include <inttypes.h>
64 #include <volk/volk_complex.h>
65 
66 
67 #ifdef LV_HAVE_GENERIC
68 
69 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
70  const lv_32fc_t* aVector,
71  const lv_32fc_t* bVector,
72  unsigned int num_points)
73 {
74  lv_32fc_t* cPtr = cVector;
75  const lv_32fc_t* aPtr = aVector;
76  const lv_32fc_t* bPtr = bVector;
77 
78  for (unsigned int number = 0; number < num_points; number++) {
79  *cPtr++ = (*aPtr++) / (*bPtr++);
80  }
81 }
82 #endif /* LV_HAVE_GENERIC */
83 
84 
85 #ifdef LV_HAVE_SSE3
86 #include <pmmintrin.h>
88 
89 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
90  const lv_32fc_t* numeratorVector,
91  const lv_32fc_t* denumeratorVector,
92  unsigned int num_points)
93 {
94  /*
95  * we'll do the "classical"
96  * a a b*
97  * --- = -------
98  * b |b|^2
99  * */
100  unsigned int number = 0;
101  const unsigned int quarterPoints = num_points / 4;
102 
103  __m128 num01, num23, den01, den23, norm, result;
104  lv_32fc_t* c = cVector;
105  const lv_32fc_t* a = numeratorVector;
106  const lv_32fc_t* b = denumeratorVector;
107 
108  for (; number < quarterPoints; number++) {
109  num01 = _mm_loadu_ps((float*)a); // first pair
110  den01 = _mm_loadu_ps((float*)b); // first pair
111  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
112  a += 2;
113  b += 2;
114 
115  num23 = _mm_loadu_ps((float*)a); // second pair
116  den23 = _mm_loadu_ps((float*)b); // second pair
117  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
118  a += 2;
119  b += 2;
120 
121  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
122  den01 = _mm_unpacklo_ps(norm, norm);
123  den23 = _mm_unpackhi_ps(norm, norm);
124 
125  result = _mm_div_ps(num01, den01);
126  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
127  c += 2;
128  result = _mm_div_ps(num23, den23);
129  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
130  c += 2;
131  }
132 
133  number *= 4;
134  for (; number < num_points; number++) {
135  *c = (*a) / (*b);
136  a++;
137  b++;
138  c++;
139  }
140 }
141 #endif /* LV_HAVE_SSE3 */
142 
143 
144 #ifdef LV_HAVE_AVX
145 #include <immintrin.h>
147 
/*
 * Unaligned AVX kernel: complex division via a/b = (a * conj(b)) / |b|^2,
 * four complex floats (one 256-bit register) per iteration.
 */
static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     * */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps(
            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        denum = _mm256_loadu_ps(
            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        mul_conj = _mm256_complexconjugatemul_ps(num, denum); // a * conj(b)
        sq = _mm256_mul_ps(denum, denum); // Square the values
        // Horizontal add of adjacent re^2/im^2 pairs gives |b|^2 per element,
        // but hadd interleaves results from its two operands within each lane,
        // so the four magnitudes come out of order.
        mag_sq_un = _mm256_hadd_ps(
            sq, sq); // obtain the actual squared magnitude, although out of order
        // Permute with control 0xd8 (= 2,1,3,0 per 128-bit lane) restores the
        // order so each |b|^2 sits next to its duplicate, aligned with the
        // re/im pair of mul_conj it must divide.
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
        // best guide I found on using these functions:
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements.
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
194 #endif /* LV_HAVE_AVX */
195 
196 
197 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
198 
199 
200 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
202 
203 #include <float.h>
204 #include <inttypes.h>
205 #include <stdio.h>
206 #include <volk/volk_complex.h>
207 
208 #ifdef LV_HAVE_SSE3
209 #include <pmmintrin.h>
211 
212 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
213  const lv_32fc_t* numeratorVector,
214  const lv_32fc_t* denumeratorVector,
215  unsigned int num_points)
216 {
217  /*
218  * we'll do the "classical"
219  * a a b*
220  * --- = -------
221  * b |b|^2
222  * */
223  unsigned int number = 0;
224  const unsigned int quarterPoints = num_points / 4;
225 
226  __m128 num01, num23, den01, den23, norm, result;
227  lv_32fc_t* c = cVector;
228  const lv_32fc_t* a = numeratorVector;
229  const lv_32fc_t* b = denumeratorVector;
230 
231  for (; number < quarterPoints; number++) {
232  num01 = _mm_load_ps((float*)a); // first pair
233  den01 = _mm_load_ps((float*)b); // first pair
234  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
235  a += 2;
236  b += 2;
237 
238  num23 = _mm_load_ps((float*)a); // second pair
239  den23 = _mm_load_ps((float*)b); // second pair
240  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
241  a += 2;
242  b += 2;
243 
244  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
245 
246  den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
247  den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
248 
249  result = _mm_div_ps(num01, den01);
250  _mm_store_ps((float*)c, result); // Store the results back into the C container
251  c += 2;
252  result = _mm_div_ps(num23, den23);
253  _mm_store_ps((float*)c, result); // Store the results back into the C container
254  c += 2;
255  }
256 
257  number *= 4;
258  for (; number < num_points; number++) {
259  *c = (*a) / (*b);
260  a++;
261  b++;
262  c++;
263  }
264 }
#endif /* LV_HAVE_SSE3 */
266 
267 #ifdef LV_HAVE_AVX
268 #include <immintrin.h>
270 
/*
 * Aligned AVX kernel: complex division via a/b = (a * conj(b)) / |b|^2,
 * eight complex floats (two 256-bit registers) per iteration.
 */
static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * Guide to AVX intrisics:
     * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
     *
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     *
     */
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eigthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eigthPoints; number++) {
        // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);

        num01 = _mm256_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23); // a * conj(b)
        a += 4;
        b += 4;

        // hadd of the two squared registers packs all eight |b|^2 values into
        // one register: per 128-bit lane, the low half holds magnitudes from
        // denum01 and the high half those from denum23.
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // Shuffle 0x50 (= 0,0,1,1 per lane) duplicates the denum01 magnitudes
        // so each sits next to its copy, aligned with a re/im pair of num01;
        // 0xfa (= 2,2,3,3) does the same for the denum23 magnitudes.
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    // Delegate the (num_points % 8) tail to the generic kernel.
    // NOTE(review): this relies on volk_32fc_x2_divide_32fc_generic being
    // compiled in (LV_HAVE_GENERIC defined) — true in normal VOLK builds.
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
}
326 #endif /* LV_HAVE_AVX */
327 
328 
329 #ifdef LV_HAVE_NEON
330 #include <arm_neon.h>
331 
/*
 * NEON kernel: complex division via a/b = (a * conj(b)) * (1/|b|^2), four
 * complex floats per iteration using de-interleaved (split re/im) loads.
 *
 * NOTE(review): the reciprocal of |b|^2 is computed with vrecpeq_f32 plus two
 * Newton-Raphson refinement steps rather than an exact divide, so results may
 * differ from the scalar path in the last ulps — presumably acceptable for
 * this kernel's accuracy contract; confirm against the VOLK test tolerances.
 */
static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal; // .val[0] = real parts, .val[1] = imag parts
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        // vld2q de-interleaves: four reals into val[0], four imags into val[1].
        aVal = vld2q_f32((const float*)(aPtr));
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // |b|^2 = re(b)^2 + im(b)^2
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // 1/|b|^2: initial estimate refined by two Newton-Raphson steps
        // (each vrecpsq/vmulq pair roughly doubles the precision).
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // re(a * conj(b)) = re(a)re(b) + im(a)im(b), scaled by 1/|b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // im(a * conj(b)) = im(a)re(b) - re(a)im(b), scaled by 1/|b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        // vst2q re-interleaves back to re,im,re,im ... in memory.
        vst2q_f32((float*)(cPtr), cVal);
        cPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements (exact divide).
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
377 #endif /* LV_HAVE_NEON */
378 
379 
380 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:212
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:69
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:332
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:148
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:271
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:89
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31