60 #ifndef INCLUDED_volk_32f_64f_add_64f_H
61 #define INCLUDED_volk_32f_64f_add_64f_H
65 #ifdef LV_HAVE_GENERIC
/*
 * Add a single-precision vector to a double-precision vector (portable C).
 *
 * Computes cVector[i] = (double)aVector[i] + bVector[i] for every i in
 * [0, num_points). Each float is widened to double before the addition, so
 * the sum is carried out in double precision.
 *
 * cVector     output buffer; num_points doubles are written
 * aVector     float input; num_points floats are read
 * bVector     double input; num_points doubles are read
 * num_points  number of elements to process (0 leaves cVector untouched)
 */
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
/*
 * Add a single-precision vector to a double-precision vector (NEON).
 *
 * Computes cVector[i] = (double)aVector[i] + bVector[i] for every i in
 * [0, num_points). Elements are handled two at a time with 128-bit
 * double-precision vectors; an odd trailing element is handled in scalar code.
 *
 * cVector     output buffer; num_points doubles are written
 * aVector     float input; num_points floats are read
 * bVector     double input; num_points doubles are read
 * num_points  number of elements to process
 *
 * NOTE(review): the listing this kernel was restored from references
 * __VOLK_PREFETCH, but no prefetch call was visible in this loop. Prefetch
 * hints do not change results; confirm against the upstream file whether
 * they belong here.
 */
static inline void volk_32f_64f_add_64f_neon(double* cVector,
                                             const float* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    float64x2_t aVal, bVal, cVal;
    float32x2_t aVal1;

    for (number = 0; number < half_points; number++) {
        /* Load two floats and the two matching doubles. */
        aVal1 = vld1_f32(aPtr);
        bVal = vld1q_f64(bPtr);
        aPtr += 2;
        bPtr += 2;

        /* Widen the floats so both operands are double precision. */
        aVal = vcvt_f64_f32(aVal1);

        cVal = vaddq_f64(aVal, bVal);

        vst1q_f64(cPtr, cVal);
        cPtr += 2;
    }

    /* Scalar tail: at most one element is left when num_points is odd. */
    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
#include <immintrin.h>
#include <xmmintrin.h>

/*
 * Add a single-precision vector to a double-precision vector (AVX, no
 * alignment requirement).
 *
 * Computes cVector[i] = (double)aVector[i] + bVector[i] for every i in
 * [0, num_points). Elements are handled eight at a time using unaligned
 * 256-bit loads and stores; the remaining (num_points % 8) elements are
 * handled in scalar code.
 *
 * cVector     output buffer; num_points doubles are written
 * aVector     float input; num_points floats are read
 * bVector     double input; num_points doubles are read
 * num_points  number of elements to process
 *
 * NOTE(review): the listing this kernel was restored from references
 * __VOLK_PREFETCH, but no prefetch call was visible in this loop. Prefetch
 * hints do not change results; confirm against the upstream file whether
 * they belong here.
 */
static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {
        /* Eight floats fill one 256-bit register; eight doubles need two. */
        aVal = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_pd(bPtr);
        bVal2 = _mm256_loadu_pd(bPtr + 4);

        /* Split the floats into low/high halves of four ... */
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        /* ... and widen each half to four doubles. */
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_storeu_pd(cPtr, cVal1);
        _mm256_storeu_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a group of eight. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
#include <immintrin.h>
#include <xmmintrin.h>

/*
 * Add a single-precision vector to a double-precision vector (AVX, aligned).
 *
 * Computes cVector[i] = (double)aVector[i] + bVector[i] for every i in
 * [0, num_points). Elements are handled eight at a time; the remaining
 * (num_points % 8) elements are handled in scalar code.
 *
 * The vector loop uses _mm256_load_ps, _mm256_load_pd and _mm256_store_pd,
 * which require 32-byte-aligned addresses. When num_points >= 8, cVector,
 * aVector and bVector must therefore each be 32-byte aligned. Every
 * iteration advances each pointer by a multiple of 32 bytes, so alignment
 * of the base pointers is sufficient.
 *
 * cVector     output buffer; num_points doubles are written
 * aVector     float input; num_points floats are read
 * bVector     double input; num_points doubles are read
 * num_points  number of elements to process
 *
 * NOTE(review): the listing this kernel was restored from references
 * __VOLK_PREFETCH, but no prefetch call was visible in this loop. Prefetch
 * hints do not change results; confirm against the upstream file whether
 * they belong here.
 */
static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {
        /* Eight floats fill one 256-bit register; eight doubles need two. */
        aVal = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_pd(bPtr);
        bVal2 = _mm256_load_pd(bPtr + 4);

        /* Split the floats into low/high halves of four ... */
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        /* ... and widen each half to four doubles. */
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_store_pd(cPtr, cVal1);
        _mm256_store_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a group of eight. */
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
float32x4_t __m128
Definition: sse2neon.h:235
static void volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:186
static void volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:133
static void volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:67
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71