doxygen/volk__32f__64f__multiply__64f_8h_source.html

 /* -*- c++ -*- */

 /*

  * Copyright 2018 Free Software Foundation, Inc.

  *

  * This file is part of VOLK

  *

  * SPDX-License-Identifier: LGPL-3.0-or-later

  */


 #ifndef INCLUDED_volk_32f_64f_multiply_64f_H

 #define INCLUDED_volk_32f_64f_multiply_64f_H


 #include <inttypes.h>


 #ifdef LV_HAVE_GENERIC


 /*
  * Portable scalar kernel: element-wise product of a float vector and a
  * double vector, written as doubles.
  *
  *   cVector    - output; receives num_points doubles
  *   aVector    - first input; num_points floats, each widened to double
  *                before the multiplication
  *   bVector    - second input; num_points doubles
  *   num_points - number of elements to process (0 leaves cVector untouched)
  */
 static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
                                                      const float* aVector,
                                                      const double* bVector,
                                                      unsigned int num_points)
 {
     for (unsigned int idx = 0; idx < num_points; idx++) {
         /* Widen the float operand first so the product is formed in double. */
         const double widened = (double)aVector[idx];
         cVector[idx] = widened * bVector[idx];
     }
 }


 #endif /* LV_HAVE_GENERIC */


 /*

  * Unaligned versions

  */


 #ifdef LV_HAVE_AVX


 #include <immintrin.h>

 #include <xmmintrin.h>


 /*
  * AVX kernel using unaligned loads/stores: element-wise product of a float
  * vector and a double vector, written as doubles.
  *
  *   cVector    - output; receives num_points doubles
  *   aVector    - first input; num_points floats
  *   bVector    - second input; num_points doubles
  *   num_points - number of elements to process
  *
  * Elements are handled eight at a time in the vector loop; whatever is left
  * over (num_points % 8 elements) is finished by a scalar loop.
  */
 static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
                                                    const float* aVector,
                                                    const double* bVector,
                                                    unsigned int num_points)
 {
     /* Index one past the last element covered by a full group of eight. */
     const unsigned int vector_end = (num_points / 8) * 8;
     unsigned int idx = 0;

     for (; idx < vector_end; idx += 8) {
         /* Perform every load of this iteration before any store. */
         const __m256 floats = _mm256_loadu_ps(aVector + idx);
         const __m256d b_low = _mm256_loadu_pd(bVector + idx);
         const __m256d b_high = _mm256_loadu_pd(bVector + idx + 4);

         /* Split the eight floats into two halves and widen each to 4 doubles. */
         const __m256d a_low = _mm256_cvtps_pd(_mm256_extractf128_ps(floats, 0));
         const __m256d a_high = _mm256_cvtps_pd(_mm256_extractf128_ps(floats, 1));

         const __m256d prod_low = _mm256_mul_pd(a_low, b_low);
         const __m256d prod_high = _mm256_mul_pd(a_high, b_high);

         _mm256_storeu_pd(cVector + idx, prod_low);
         _mm256_storeu_pd(cVector + idx + 4, prod_high);
     }

     /* Scalar tail for the elements that did not fill a group of eight. */
     for (; idx < num_points; idx++) {
         cVector[idx] = ((double)aVector[idx]) * bVector[idx];
     }
 }


 #endif /* LV_HAVE_AVX */


 #ifdef LV_HAVE_AVX


 #include <immintrin.h>

 #include <xmmintrin.h>


 /*
  * AVX kernel using aligned loads/stores: element-wise product of a float
  * vector and a double vector, written as doubles.
  *
  *   cVector    - output; receives num_points doubles
  *   aVector    - first input; num_points floats
  *   bVector    - second input; num_points doubles
  *   num_points - number of elements to process
  *
  * The vector loop uses _mm256_load_ps / _mm256_load_pd / _mm256_store_pd,
  * i.e. the aligned forms of the intrinsics, so the buffers are expected to
  * satisfy the alignment those intrinsics require. Elements are handled eight
  * at a time; the remaining num_points % 8 elements are finished by a scalar
  * loop.
  */
 static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
                                                    const float* aVector,
                                                    const double* bVector,
                                                    unsigned int num_points)
 {
     /* Index one past the last element covered by a full group of eight. */
     const unsigned int vector_end = (num_points / 8) * 8;
     unsigned int idx = 0;

     for (; idx < vector_end; idx += 8) {
         /* Perform every load of this iteration before any store. */
         const __m256 floats = _mm256_load_ps(aVector + idx);
         const __m256d b_low = _mm256_load_pd(bVector + idx);
         const __m256d b_high = _mm256_load_pd(bVector + idx + 4);

         /* Split the eight floats into two halves and widen each to 4 doubles. */
         const __m256d a_low = _mm256_cvtps_pd(_mm256_extractf128_ps(floats, 0));
         const __m256d a_high = _mm256_cvtps_pd(_mm256_extractf128_ps(floats, 1));

         const __m256d prod_low = _mm256_mul_pd(a_low, b_low);
         const __m256d prod_high = _mm256_mul_pd(a_high, b_high);

         _mm256_store_pd(cVector + idx, prod_low);
         _mm256_store_pd(cVector + idx + 4, prod_high);
     }

     /* Scalar tail for the elements that did not fill a group of eight. */
     for (; idx < num_points; idx++) {
         cVector[idx] = ((double)aVector[idx]) * bVector[idx];
     }
 }


 #endif /* LV_HAVE_AVX */


 #endif /* INCLUDED_volk_32f_64f_multiply_64f_H */

__m128
float32x4_t __m128
Definition: sse2neon.h:235

volk_32f_64f_multiply_64f_generic
static void volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:66

volk_32f_64f_multiply_64f_u_avx
static void volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:93

volk_32f_64f_multiply_64f_a_avx
static void volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:145