Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_64f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_64f_convert_32f_u_H
52 #define INCLUDED_volk_64f_convert_32f_u_H
53 
54 #include <inttypes.h>
55 #include <stdio.h>
56 
57 #ifdef LV_HAVE_AVX512F
58 #include <immintrin.h>
59 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Full groups of 16 elements are processed with two unaligned 512-bit loads
 * (8 doubles each), a packed double->float conversion of each half, and two
 * unaligned 256-bit stores (8 floats each). The remaining num_points % 16
 * elements are converted one at a time with a plain cast.
 */
static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
                                                  const double* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;

    const double* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block) {
        /* Read all 16 inputs before writing any output. */
        const __m512d lowerDoubles = _mm512_loadu_pd(src);
        const __m512d upperDoubles = _mm512_loadu_pd(src + 8);

        const __m256 lowerFloats = _mm512_cvtpd_ps(lowerDoubles);
        const __m256 upperFloats = _mm512_cvtpd_ps(upperDoubles);

        _mm256_storeu_ps(dst, lowerFloats);
        _mm256_storeu_ps(dst + 8, upperFloats);

        src += 16;
        dst += 16;
    }

    /* Scalar tail: elements that did not fill a complete group of 16. */
    for (unsigned int i = blockCount * 16; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
94 #endif /* LV_HAVE_AVX512F */
95 
96 
97 #ifdef LV_HAVE_AVX
98 #include <immintrin.h>
99 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Full groups of 8 elements are processed with two unaligned 256-bit loads
 * (4 doubles each), a packed double->float conversion of each half, and two
 * unaligned 128-bit stores (4 floats each). The remaining num_points % 8
 * elements are converted one at a time with a plain cast.
 */
static inline void volk_64f_convert_32f_u_avx(float* outputVector,
                                              const double* inputVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const unsigned int vectorisedPoints = blockCount * 8;

    const double* src = inputVector;
    float* dst = outputVector;
    unsigned int remainingBlocks = blockCount;

    while (remainingBlocks > 0) {
        /* Read all 8 inputs before writing any output. */
        const __m256d firstFour = _mm256_loadu_pd(src);
        const __m256d secondFour = _mm256_loadu_pd(src + 4);

        const __m128 firstFloats = _mm256_cvtpd_ps(firstFour);
        const __m128 secondFloats = _mm256_cvtpd_ps(secondFour);

        _mm_storeu_ps(dst, firstFloats);
        _mm_storeu_ps(dst + 4, secondFloats);

        src += 8;
        dst += 8;
        --remainingBlocks;
    }

    /* Scalar tail: elements that did not fill a complete group of 8. */
    for (unsigned int i = vectorisedPoints; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
134 #endif /* LV_HAVE_AVX */
135 
136 
137 #ifdef LV_HAVE_SSE2
138 #include <emmintrin.h>
139 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Full groups of 4 elements are processed with two unaligned 128-bit loads
 * (2 doubles each). Each packed conversion yields its two floats in the low
 * half of a 128-bit register; the two halves are merged into one register
 * and written with a single unaligned store. The remaining num_points % 4
 * elements are converted one at a time with a plain cast.
 */
static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
                                               const double* inputVector,
                                               unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;

    const double* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block, src += 4, dst += 4) {
        const __m128d firstPair = _mm_loadu_pd(src);
        const __m128d secondPair = _mm_loadu_pd(src + 2);

        const __m128 firstFloats = _mm_cvtpd_ps(firstPair);
        const __m128 secondFloats = _mm_cvtpd_ps(secondPair);

        /* Low half comes from firstFloats, high half from the low half of
         * secondFloats, giving four consecutive results. */
        const __m128 packed = _mm_movelh_ps(firstFloats, secondFloats);

        _mm_storeu_ps(dst, packed);
    }

    /* Scalar tail: elements that did not fill a complete group of 4. */
    for (unsigned int i = blockCount * 4; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
173 #endif /* LV_HAVE_SSE2 */
174 
175 
176 #ifdef LV_HAVE_GENERIC
177 
/*
 * Portable reference implementation: convert num_points doubles from
 * inputVector into floats in outputVector, one element at a time, using a
 * plain C cast. Nothing is read or written when num_points is 0.
 */
static inline void volk_64f_convert_32f_generic(float* outputVector,
                                                const double* inputVector,
                                                unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
190 #endif /* LV_HAVE_GENERIC */
191 
192 
193 #endif /* INCLUDED_volk_64f_convert_32f_u_H */
194 #ifndef INCLUDED_volk_64f_convert_32f_a_H
195 #define INCLUDED_volk_64f_convert_32f_a_H
196 
197 #include <inttypes.h>
198 #include <stdio.h>
199 
200 #ifdef LV_HAVE_AVX512F
201 #include <immintrin.h>
202 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Same scheme as the unaligned AVX-512 kernel, but built on the aligned
 * intrinsics _mm512_load_pd and _mm256_store_ps. Per Intel's intrinsics
 * documentation those require a 64-byte aligned source address and a 32-byte
 * aligned destination address, so the caller must pass suitably aligned
 * buffers. Each iteration consumes 16 doubles and produces 16 floats; the
 * remaining num_points % 16 elements are converted with a plain cast.
 */
static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
                                                  const double* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;

    const double* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block) {
        /* Read all 16 inputs before writing any output. */
        const __m512d lowerDoubles = _mm512_load_pd(src);
        const __m512d upperDoubles = _mm512_load_pd(src + 8);

        const __m256 lowerFloats = _mm512_cvtpd_ps(lowerDoubles);
        const __m256 upperFloats = _mm512_cvtpd_ps(upperDoubles);

        _mm256_store_ps(dst, lowerFloats);
        _mm256_store_ps(dst + 8, upperFloats);

        src += 16;
        dst += 16;
    }

    /* Scalar tail: elements that did not fill a complete group of 16. */
    for (unsigned int i = blockCount * 16; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
237 #endif /* LV_HAVE_AVX512F */
238 
239 
240 #ifdef LV_HAVE_AVX
241 #include <immintrin.h>
242 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Same scheme as the unaligned AVX kernel, but built on the aligned
 * intrinsics _mm256_load_pd and _mm_store_ps. Per Intel's intrinsics
 * documentation those require a 32-byte aligned source address and a 16-byte
 * aligned destination address, so the caller must pass suitably aligned
 * buffers. Each iteration consumes 8 doubles and produces 8 floats; the
 * remaining num_points % 8 elements are converted with a plain cast.
 */
static inline void volk_64f_convert_32f_a_avx(float* outputVector,
                                              const double* inputVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const unsigned int vectorisedPoints = blockCount * 8;

    const double* src = inputVector;
    float* dst = outputVector;
    unsigned int remainingBlocks = blockCount;

    while (remainingBlocks > 0) {
        /* Read all 8 inputs before writing any output. */
        const __m256d firstFour = _mm256_load_pd(src);
        const __m256d secondFour = _mm256_load_pd(src + 4);

        const __m128 firstFloats = _mm256_cvtpd_ps(firstFour);
        const __m128 secondFloats = _mm256_cvtpd_ps(secondFour);

        _mm_store_ps(dst, firstFloats);
        _mm_store_ps(dst + 4, secondFloats);

        src += 8;
        dst += 8;
        --remainingBlocks;
    }

    /* Scalar tail: elements that did not fill a complete group of 8. */
    for (unsigned int i = vectorisedPoints; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
277 #endif /* LV_HAVE_AVX */
278 
279 
280 #ifdef LV_HAVE_SSE2
281 #include <emmintrin.h>
282 
/*
 * Convert num_points doubles from inputVector into floats in outputVector.
 *
 * Same scheme as the unaligned SSE2 kernel, but built on the aligned
 * intrinsics _mm_load_pd and _mm_store_ps. Per Intel's intrinsics
 * documentation both require 16-byte aligned addresses, so the caller must
 * pass suitably aligned buffers. Each iteration converts two pairs of
 * doubles, merges the two 2-float results into one register and stores four
 * floats; the remaining num_points % 4 elements are converted with a plain
 * cast.
 */
static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
                                               const double* inputVector,
                                               unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;

    const double* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; ++block, src += 4, dst += 4) {
        const __m128d firstPair = _mm_load_pd(src);
        const __m128d secondPair = _mm_load_pd(src + 2);

        const __m128 firstFloats = _mm_cvtpd_ps(firstPair);
        const __m128 secondFloats = _mm_cvtpd_ps(secondPair);

        /* Low half comes from firstFloats, high half from the low half of
         * secondFloats, giving four consecutive results. */
        const __m128 packed = _mm_movelh_ps(firstFloats, secondFloats);

        _mm_store_ps(dst, packed);
    }

    /* Scalar tail: elements that did not fill a complete group of 4. */
    for (unsigned int i = blockCount * 4; i < num_points; ++i) {
        outputVector[i] = (float)inputVector[i];
    }
}
316 #endif /* LV_HAVE_SSE2 */
317 
318 
319 #ifdef LV_HAVE_GENERIC
320 
/*
 * Portable implementation registered under the aligned ("_a") kernel name:
 * convert num_points doubles from inputVector into floats in outputVector,
 * one element at a time, using a plain C cast. The code itself places no
 * alignment requirement on either buffer.
 */
static inline void volk_64f_convert_32f_a_generic(float* outputVector,
                                                  const double* inputVector,
                                                  unsigned int num_points)
{
    const double* src = inputVector;
    float* dst = outputVector;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        *dst = (float)*src;
        ++dst;
        ++src;
        --remaining;
    }
}
333 #endif /* LV_HAVE_GENERIC */
334 
335 
336 #endif /* INCLUDED_volk_64f_convert_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128d _mm_load_pd(const double *p)
Definition: sse2neon.h:4430
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
Definition: sse2neon.h:4563
float32x4_t __m128d
Definition: sse2neon.h:242
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition: sse2neon.h:2145
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition: sse2neon.h:3991
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_64f_convert_32f_u_avx(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:100
static void volk_64f_convert_32f_generic(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:178
static void volk_64f_convert_32f_a_avx(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:243
static void volk_64f_convert_32f_a_generic(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:321
static void volk_64f_convert_32f_u_sse2(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:140
static void volk_64f_convert_32f_a_sse2(float *outputVector, const double *inputVector, unsigned int num_points)
Definition: volk_64f_convert_32f.h:283