Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
61 
62 #include <float.h>
63 #include <inttypes.h>
64 #include <volk/volk_complex.h>
65 
66 
67 #ifdef LV_HAVE_GENERIC
68 
69 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
70  const lv_32fc_t* aVector,
71  const lv_32fc_t* bVector,
72  unsigned int num_points)
73 {
74  lv_32fc_t* cPtr = cVector;
75  const lv_32fc_t* aPtr = aVector;
76  const lv_32fc_t* bPtr = bVector;
77 
78  for (unsigned int number = 0; number < num_points; number++) {
79  *cPtr++ = (*aPtr++) / (*bPtr++);
80  }
81 }
82 #endif /* LV_HAVE_GENERIC */
83 
84 
85 #ifdef LV_HAVE_SSE3
86 #include <pmmintrin.h>
88 
89 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
90  const lv_32fc_t* numeratorVector,
91  const lv_32fc_t* denumeratorVector,
92  unsigned int num_points)
93 {
94  /*
95  * we'll do the "classical"
96  * a a b*
97  * --- = -------
98  * b |b|^2
99  * */
100  unsigned int number = 0;
101  const unsigned int quarterPoints = num_points / 4;
102 
103  __m128 num01, num23, den01, den23, norm, result;
104  lv_32fc_t* c = cVector;
105  const lv_32fc_t* a = numeratorVector;
106  const lv_32fc_t* b = denumeratorVector;
107 
108  for (; number < quarterPoints; number++) {
109  num01 = _mm_loadu_ps((float*)a); // first pair
110  den01 = _mm_loadu_ps((float*)b); // first pair
111  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
112  a += 2;
113  b += 2;
114 
115  num23 = _mm_loadu_ps((float*)a); // second pair
116  den23 = _mm_loadu_ps((float*)b); // second pair
117  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
118  a += 2;
119  b += 2;
120 
121  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
122  den01 = _mm_unpacklo_ps(norm, norm);
123  den23 = _mm_unpackhi_ps(norm, norm);
124 
125  result = _mm_div_ps(num01, den01);
126  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
127  c += 2;
128  result = _mm_div_ps(num23, den23);
129  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
130  c += 2;
131  }
132 
133  number *= 4;
134  for (; number < num_points; number++) {
135  *c = (*a) / (*b);
136  a++;
137  b++;
138  c++;
139  }
140 }
141 #endif /* LV_HAVE_SSE3 */
142 
143 
144 #ifdef LV_HAVE_AVX
145 #include <immintrin.h>
147 
/*
 * Unaligned AVX kernel: complex division via a/b = (a * conj(b)) / |b|^2,
 * four complex floats (one 256-bit register) per iteration.
 */
static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     * */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps(
            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        denum = _mm256_loadu_ps(
            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        mul_conj = _mm256_complexconjugatemul_ps(num, denum); // a * conj(b)
        sq = _mm256_mul_ps(denum, denum); // Square the values
        // Horizontal add of adjacent re^2/im^2 pairs gives |b|^2 per element,
        // but hadd interleaves results from its two operands within each lane,
        // so the four magnitudes come out of order.
        mag_sq_un = _mm256_hadd_ps(
            sq, sq); // obtain the actual squared magnitude, although out of order
        // Permute with control 0xd8 (= 2,1,3,0 per 128-bit lane) restores the
        // order so each |b|^2 sits next to its duplicate, aligned with the
        // re/im pair of mul_conj it must divide.
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
        // best guide I found on using these functions:
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements.
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
194 #endif /* LV_HAVE_AVX */
195 
196 
197 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
198 
199 
200 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
202 
203 #include <float.h>
204 #include <inttypes.h>
205 #include <stdio.h>
206 #include <volk/volk_complex.h>
207 
208 #ifdef LV_HAVE_SSE3
209 #include <pmmintrin.h>
211 
212 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
213  const lv_32fc_t* numeratorVector,
214  const lv_32fc_t* denumeratorVector,
215  unsigned int num_points)
216 {
217  /*
218  * we'll do the "classical"
219  * a a b*
220  * --- = -------
221  * b |b|^2
222  * */
223  unsigned int number = 0;
224  const unsigned int quarterPoints = num_points / 4;
225 
226  __m128 num01, num23, den01, den23, norm, result;
227  lv_32fc_t* c = cVector;
228  const lv_32fc_t* a = numeratorVector;
229  const lv_32fc_t* b = denumeratorVector;
230 
231  for (; number < quarterPoints; number++) {
232  num01 = _mm_load_ps((float*)a); // first pair
233  den01 = _mm_load_ps((float*)b); // first pair
234  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
235  a += 2;
236  b += 2;
237 
238  num23 = _mm_load_ps((float*)a); // second pair
239  den23 = _mm_load_ps((float*)b); // second pair
240  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
241  a += 2;
242  b += 2;
243 
244  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
245 
246  den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
247  den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
248 
249  result = _mm_div_ps(num01, den01);
250  _mm_store_ps((float*)c, result); // Store the results back into the C container
251  c += 2;
252  result = _mm_div_ps(num23, den23);
253  _mm_store_ps((float*)c, result); // Store the results back into the C container
254  c += 2;
255  }
256 
257  number *= 4;
258  for (; number < num_points; number++) {
259  *c = (*a) / (*b);
260  a++;
261  b++;
262  c++;
263  }
264 }
#endif /* LV_HAVE_SSE3 */
266 
267 #ifdef LV_HAVE_AVX
268 #include <immintrin.h>
270 
/*
 * Aligned AVX kernel: complex division via a/b = (a * conj(b)) / |b|^2,
 * eight complex floats (two 256-bit registers) per iteration.
 */
static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * Guide to AVX intrisics:
     * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
     *
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     *
     */
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eigthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eigthPoints; number++) {
        // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);

        num01 = _mm256_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23); // a * conj(b)
        a += 4;
        b += 4;

        // hadd of the two squared registers packs all eight |b|^2 values into
        // one register: per 128-bit lane, the low half holds magnitudes from
        // denum01 and the high half those from denum23.
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // Shuffle 0x50 (= 0,0,1,1 per lane) duplicates the denum01 magnitudes
        // so each sits next to its copy, aligned with a re/im pair of num01;
        // 0xfa (= 2,2,3,3) does the same for the denum23 magnitudes.
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    // Delegate the (num_points % 8) tail to the generic kernel.
    // NOTE(review): this relies on volk_32fc_x2_divide_32fc_generic being
    // compiled in (LV_HAVE_GENERIC defined) — true in normal VOLK builds.
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
}
326 #endif /* LV_HAVE_AVX */
327 
328 
329 #ifdef LV_HAVE_NEON
330 #include <arm_neon.h>
331 
/*
 * NEON kernel: complex division via a/b = (a * conj(b)) * (1/|b|^2), four
 * complex floats per iteration using de-interleaved (split re/im) loads.
 *
 * NOTE(review): the reciprocal of |b|^2 is computed with vrecpeq_f32 plus two
 * Newton-Raphson refinement steps rather than an exact divide, so results may
 * differ from the scalar path in the last ulps — presumably acceptable for
 * this kernel's accuracy contract; confirm against the VOLK test tolerances.
 */
static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal; // .val[0] = real parts, .val[1] = imag parts
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        // vld2q de-interleaves: four reals into val[0], four imags into val[1].
        aVal = vld2q_f32((const float*)(aPtr));
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // |b|^2 = re(b)^2 + im(b)^2
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // 1/|b|^2: initial estimate refined by two Newton-Raphson steps
        // (each vrecpsq/vmulq pair roughly doubles the precision).
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // re(a * conj(b)) = re(a)re(b) + im(a)im(b), scaled by 1/|b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // im(a * conj(b)) = im(a)re(b) - re(a)im(b), scaled by 1/|b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        // vst2q re-interleaves back to re,im,re,im ... in memory.
        vst2q_f32((float*)(cPtr), cVal);
        cPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements (exact divide).
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
377 #endif /* LV_HAVE_NEON */
378 
379 
380 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:212
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:69
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:332
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:148
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:271
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:89
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31