Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_64f_x2_multiply_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_64f_x2_multiply_64f_H
59 #define INCLUDED_volk_64f_x2_multiply_64f_H
60 
61 #include <inttypes.h>
62 
63 
64 #ifdef LV_HAVE_GENERIC
65 
/*
 * Portable element-wise product of two arrays of doubles.
 *
 * For every i in [0, num_points), stores aVector[i] * bVector[i] in
 * cVector[i]. Elements are processed in increasing index order.
 *
 * cVector    - destination array, at least num_points elements
 * aVector    - first factor array, at least num_points elements
 * bVector    - second factor array, at least num_points elements
 * num_points - number of elements to multiply; 0 leaves cVector untouched
 */
static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
                                                    const double* aVector,
                                                    const double* bVector,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
80 
81 #endif /* LV_HAVE_GENERIC */
82 
83 /*
84  * Unaligned versions
85  */
86 
87 #ifdef LV_HAVE_SSE2
88 
89 #include <emmintrin.h>
90 
/*
 * SSE2 element-wise product of two arrays of doubles, using unaligned
 * loads and stores.
 *
 * For every i in [0, num_points), stores aVector[i] * bVector[i] in
 * cVector[i]. Pairs of elements are multiplied with 128-bit vector
 * operations; an odd trailing element is handled by a scalar multiply.
 *
 * cVector    - destination array, at least num_points elements
 * aVector    - first factor array, at least num_points elements
 * bVector    - second factor array, at least num_points elements
 * num_points - number of elements to multiply
 */
static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    /* Largest multiple of 2 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / 2) * 2;
    unsigned int idx = 0;

    /* Vector part: two doubles per iteration. */
    for (; idx < vector_end; idx += 2) {
        const __m128d lhs = _mm_loadu_pd(aVector + idx);
        const __m128d rhs = _mm_loadu_pd(bVector + idx);

        _mm_storeu_pd(cVector + idx, _mm_mul_pd(lhs, rhs));
    }

    /* Scalar tail: idx already equals vector_end here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
122 
123 #endif /* LV_HAVE_SSE2 */
124 
125 
126 #ifdef LV_HAVE_AVX
127 
128 #include <immintrin.h>
129 
/*
 * AVX element-wise product of two arrays of doubles, using unaligned
 * loads and stores.
 *
 * For every i in [0, num_points), stores aVector[i] * bVector[i] in
 * cVector[i]. Groups of four elements are multiplied with 256-bit vector
 * operations; up to three trailing elements are handled by scalar
 * multiplies.
 *
 * cVector    - destination array, at least num_points elements
 * aVector    - first factor array, at least num_points elements
 * bVector    - second factor array, at least num_points elements
 * num_points - number of elements to multiply
 */
static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four doubles per iteration. */
    for (; idx < vector_end; idx += 4) {
        const __m256d lhs = _mm256_loadu_pd(aVector + idx);
        const __m256d rhs = _mm256_loadu_pd(bVector + idx);

        _mm256_storeu_pd(cVector + idx, _mm256_mul_pd(lhs, rhs));
    }

    /* Scalar tail: idx already equals vector_end here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
162 
163 #endif /* LV_HAVE_AVX */
164 
165 /*
166  * Aligned versions
167  */
168 
169 #ifdef LV_HAVE_SSE2
170 
171 #include <emmintrin.h>
172 
/*
 * SSE2 element-wise product of two arrays of doubles, using aligned
 * loads and stores.
 *
 * For every i in [0, num_points), stores aVector[i] * bVector[i] in
 * cVector[i]. Pairs of elements are multiplied with 128-bit vector
 * operations; an odd trailing element is handled by a scalar multiply.
 *
 * The vector part uses _mm_load_pd / _mm_store_pd, which require
 * 16-byte-aligned addresses, so all three arrays must start on a 16-byte
 * boundary whenever num_points >= 2.
 *
 * cVector    - destination array, at least num_points elements
 * aVector    - first factor array, at least num_points elements
 * bVector    - second factor array, at least num_points elements
 * num_points - number of elements to multiply
 */
static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    /* Largest multiple of 2 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / 2) * 2;
    unsigned int idx = 0;

    /* Vector part: two doubles (16 bytes) per iteration keeps alignment. */
    for (; idx < vector_end; idx += 2) {
        const __m128d lhs = _mm_load_pd(aVector + idx);
        const __m128d rhs = _mm_load_pd(bVector + idx);

        _mm_store_pd(cVector + idx, _mm_mul_pd(lhs, rhs));
    }

    /* Scalar tail: idx already equals vector_end here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
204 
205 #endif /* LV_HAVE_SSE2 */
206 
207 
208 #ifdef LV_HAVE_AVX
209 
210 #include <immintrin.h>
211 
/*
 * AVX element-wise product of two arrays of doubles, using aligned
 * loads and stores.
 *
 * For every i in [0, num_points), stores aVector[i] * bVector[i] in
 * cVector[i]. Groups of four elements are multiplied with 256-bit vector
 * operations; up to three trailing elements are handled by scalar
 * multiplies.
 *
 * The vector part uses _mm256_load_pd / _mm256_store_pd, which require
 * 32-byte-aligned addresses, so all three arrays must start on a 32-byte
 * boundary whenever num_points >= 4.
 *
 * cVector    - destination array, at least num_points elements
 * aVector    - first factor array, at least num_points elements
 * bVector    - second factor array, at least num_points elements
 * num_points - number of elements to multiply
 */
static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    /* Largest multiple of 4 that does not exceed num_points. */
    const unsigned int vector_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four doubles (32 bytes) per iteration keeps alignment. */
    for (; idx < vector_end; idx += 4) {
        const __m256d lhs = _mm256_load_pd(aVector + idx);
        const __m256d rhs = _mm256_load_pd(bVector + idx);

        _mm256_store_pd(cVector + idx, _mm256_mul_pd(lhs, rhs));
    }

    /* Scalar tail: idx already equals vector_end here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
244 
245 #endif /* LV_HAVE_AVX */
246 
247 #endif /* INCLUDED_volk_64f_x2_multiply_64f_H */
FORCE_INLINE __m128d _mm_load_pd(const double *p)
Definition: sse2neon.h:4430
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
Definition: sse2neon.h:4563
float32x4_t __m128d
Definition: sse2neon.h:242
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4905
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:6003
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5897
static void volk_64f_x2_multiply_64f_a_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:173
static void volk_64f_x2_multiply_64f_u_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:91
static void volk_64f_x2_multiply_64f_generic(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:66
static void volk_64f_x2_multiply_64f_u_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:130
static void volk_64f_x2_multiply_64f_a_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_multiply_64f.h:212