58 #ifndef INCLUDED_volk_64f_x2_add_64f_H
59 #define INCLUDED_volk_64f_x2_add_64f_H
64 #ifdef LV_HAVE_GENERIC
67 const double* aVector,
68 const double* bVector,
69 unsigned int num_points)
71 double* cPtr = cVector;
72 const double* aPtr = aVector;
73 const double* bPtr = bVector;
74 unsigned int number = 0;
76 for (number = 0; number < num_points; number++) {
77 *cPtr++ = (*aPtr++) + (*bPtr++);
89 #include <emmintrin.h>
92 const double* aVector,
93 const double* bVector,
94 unsigned int num_points)
96 unsigned int number = 0;
97 const unsigned int half_points = num_points / 2;
99 double* cPtr = cVector;
100 const double* aPtr = aVector;
101 const double* bPtr = bVector;
104 for (; number < half_points; number++) {
117 number = half_points * 2;
118 for (; number < num_points; number++) {
119 *cPtr++ = (*aPtr++) + (*bPtr++);
128 #include <immintrin.h>
131 const double* aVector,
132 const double* bVector,
133 unsigned int num_points)
135 unsigned int number = 0;
136 const unsigned int quarter_points = num_points / 4;
138 double* cPtr = cVector;
139 const double* aPtr = aVector;
140 const double* bPtr = bVector;
142 __m256d aVal, bVal, cVal;
143 for (; number < quarter_points; number++) {
145 aVal = _mm256_loadu_pd(aPtr);
146 bVal = _mm256_loadu_pd(bPtr);
148 cVal = _mm256_add_pd(aVal, bVal);
150 _mm256_storeu_pd(cPtr, cVal);
157 number = quarter_points * 4;
158 for (; number < num_points; number++) {
159 *cPtr++ = (*aPtr++) + (*bPtr++);
171 #include <emmintrin.h>
174 const double* aVector,
175 const double* bVector,
176 unsigned int num_points)
178 unsigned int number = 0;
179 const unsigned int half_points = num_points / 2;
181 double* cPtr = cVector;
182 const double* aPtr = aVector;
183 const double* bPtr = bVector;
186 for (; number < half_points; number++) {
199 number = half_points * 2;
200 for (; number < num_points; number++) {
201 *cPtr++ = (*aPtr++) + (*bPtr++);
210 #include <immintrin.h>
213 const double* aVector,
214 const double* bVector,
215 unsigned int num_points)
217 unsigned int number = 0;
218 const unsigned int quarter_points = num_points / 4;
220 double* cPtr = cVector;
221 const double* aPtr = aVector;
222 const double* bPtr = bVector;
224 __m256d aVal, bVal, cVal;
225 for (; number < quarter_points; number++) {
227 aVal = _mm256_load_pd(aPtr);
228 bVal = _mm256_load_pd(bPtr);
230 cVal = _mm256_add_pd(aVal, bVal);
232 _mm256_store_pd(cPtr, cVal);
239 number = quarter_points * 4;
240 for (; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);
FORCE_INLINE __m128d _mm_load_pd(const double *p)
Definition: sse2neon.h:4430
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
Definition: sse2neon.h:4563
float32x4_t __m128d
Definition: sse2neon.h:242
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:6003
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5897
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3011
static void volk_64f_x2_add_64f_u_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_add_64f.h:91
static void volk_64f_x2_add_64f_generic(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_add_64f.h:66
static void volk_64f_x2_add_64f_a_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_add_64f.h:173
static void volk_64f_x2_add_64f_a_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_add_64f.h:212
static void volk_64f_x2_add_64f_u_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_add_64f.h:130