58 #ifndef INCLUDED_volk_32f_64f_multiply_64f_H
59 #define INCLUDED_volk_32f_64f_multiply_64f_H
64 #ifdef LV_HAVE_GENERIC
68 const double* bVector,
69 unsigned int num_points)
71 double* cPtr = cVector;
72 const float* aPtr = aVector;
73 const double* bPtr = bVector;
74 unsigned int number = 0;
76 for (number = 0; number < num_points; number++) {
77 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
90 #include <immintrin.h>
91 #include <xmmintrin.h>
95 const double* bVector,
96 unsigned int num_points)
98 unsigned int number = 0;
99 const unsigned int eighth_points = num_points / 8;
101 double* cPtr = cVector;
102 const float* aPtr = aVector;
103 const double* bPtr = bVector;
107 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
108 for (; number < eighth_points; number++) {
110 aVal = _mm256_loadu_ps(aPtr);
111 bVal1 = _mm256_loadu_pd(bPtr);
112 bVal2 = _mm256_loadu_pd(bPtr + 4);
114 aVal1 = _mm256_extractf128_ps(aVal, 0);
115 aVal2 = _mm256_extractf128_ps(aVal, 1);
117 aDbl1 = _mm256_cvtps_pd(aVal1);
118 aDbl2 = _mm256_cvtps_pd(aVal2);
120 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
121 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
123 _mm256_storeu_pd(cPtr, cVal1);
124 _mm256_storeu_pd(cPtr + 4, cVal2);
131 number = eighth_points * 8;
132 for (; number < num_points; number++) {
133 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
142 #include <immintrin.h>
143 #include <xmmintrin.h>
146 const float* aVector,
147 const double* bVector,
148 unsigned int num_points)
150 unsigned int number = 0;
151 const unsigned int eighth_points = num_points / 8;
153 double* cPtr = cVector;
154 const float* aPtr = aVector;
155 const double* bPtr = bVector;
159 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
160 for (; number < eighth_points; number++) {
162 aVal = _mm256_load_ps(aPtr);
163 bVal1 = _mm256_load_pd(bPtr);
164 bVal2 = _mm256_load_pd(bPtr + 4);
166 aVal1 = _mm256_extractf128_ps(aVal, 0);
167 aVal2 = _mm256_extractf128_ps(aVal, 1);
169 aDbl1 = _mm256_cvtps_pd(aVal1);
170 aDbl2 = _mm256_cvtps_pd(aVal2);
172 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
173 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
175 _mm256_store_pd(cPtr, cVal1);
176 _mm256_store_pd(cPtr + 4, cVal2);
183 number = eighth_points * 8;
184 for (; number < num_points; number++) {
185 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
static void volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:66
static void volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:93
static void volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:145