Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
52 #define INCLUDED_volk_32i_s32f_convert_32f_u_H
53 
54 #include <inttypes.h>
55 #include <stdio.h>
56 
57 #ifdef LV_HAVE_AVX512F
58 #include <immintrin.h>
59 
60 static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector,
61  const int32_t* inputVector,
62  const float scalar,
63  unsigned int num_points)
64 {
65  unsigned int number = 0;
66  const unsigned int onesixteenthPoints = num_points / 16;
67 
68  float* outputVectorPtr = outputVector;
69  const float iScalar = 1.0 / scalar;
70  __m512 invScalar = _mm512_set1_ps(iScalar);
71  int32_t* inputPtr = (int32_t*)inputVector;
72  __m512i inputVal;
73  __m512 ret;
74 
75  for (; number < onesixteenthPoints; number++) {
76  // Load the values
77  inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
78 
79  ret = _mm512_cvtepi32_ps(inputVal);
80  ret = _mm512_mul_ps(ret, invScalar);
81 
82  _mm512_storeu_ps(outputVectorPtr, ret);
83 
84  outputVectorPtr += 16;
85  inputPtr += 16;
86  }
87 
88  number = onesixteenthPoints * 16;
89  for (; number < num_points; number++) {
90  outputVector[number] = ((float)(inputVector[number])) * iScalar;
91  }
92 }
93 #endif /* LV_HAVE_AVX512F */
94 
95 
96 #ifdef LV_HAVE_AVX2
97 #include <immintrin.h>
98 
99 static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector,
100  const int32_t* inputVector,
101  const float scalar,
102  unsigned int num_points)
103 {
104  unsigned int number = 0;
105  const unsigned int oneEightPoints = num_points / 8;
106 
107  float* outputVectorPtr = outputVector;
108  const float iScalar = 1.0 / scalar;
109  __m256 invScalar = _mm256_set1_ps(iScalar);
110  int32_t* inputPtr = (int32_t*)inputVector;
111  __m256i inputVal;
112  __m256 ret;
113 
114  for (; number < oneEightPoints; number++) {
115  // Load the 4 values
116  inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
117 
118  ret = _mm256_cvtepi32_ps(inputVal);
119  ret = _mm256_mul_ps(ret, invScalar);
120 
121  _mm256_storeu_ps(outputVectorPtr, ret);
122 
123  outputVectorPtr += 8;
124  inputPtr += 8;
125  }
126 
127  number = oneEightPoints * 8;
128  for (; number < num_points; number++) {
129  outputVector[number] = ((float)(inputVector[number])) * iScalar;
130  }
131 }
132 #endif /* LV_HAVE_AVX2 */
133 
134 
135 #ifdef LV_HAVE_SSE2
136 #include <emmintrin.h>
137 
/*
 * Convert num_points 32-bit signed integers to floats, multiplying each
 * converted value by the reciprocal of scalar (SSE2, unaligned variant).
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer holding num_points int32_t values.
 * scalar:       divisor; the kernel multiplies by 1.0 / scalar.
 * num_points:   number of elements to convert.
 *
 * Uses the unaligned load/store intrinsics for the 4-wide part.
 */
static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector,
                                                    const int32_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The reciprocal is computed once and reused by both loops. */
    const float reciprocal = 1.0 / scalar;
    const __m128 reciprocalVec = _mm_set_ps1(reciprocal);

    /* Largest multiple of 4 not exceeding num_points. */
    const unsigned int vectorizedCount = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Main loop: 4 integers per iteration. */
    for (; idx < vectorizedCount; idx += 4) {
        const __m128i ints = _mm_loadu_si128((const __m128i*)(inputVector + idx));
        const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(ints), reciprocalVec);
        _mm_storeu_ps(outputVector + idx, scaled);
    }

    /* Scalar tail for the remaining (num_points % 4) elements. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = ((float)inputVector[idx]) * reciprocal;
    }
}
171 #endif /* LV_HAVE_SSE2 */
172 
173 
174 #ifdef LV_HAVE_GENERIC
175 
/*
 * Convert num_points 32-bit signed integers to floats, multiplying each
 * converted value by the reciprocal of scalar (portable C implementation).
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer holding num_points int32_t values.
 * scalar:       divisor; the kernel multiplies by 1.0 / scalar.
 * num_points:   number of elements to convert.
 */
static inline void volk_32i_s32f_convert_32f_generic(float* outputVector,
                                                     const int32_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* One division up front; every element is then a single multiply. */
    const float reciprocal = 1.0 / scalar;

    for (unsigned int idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = ((float)inputVector[idx]) * reciprocal;
    }
}
190 #endif /* LV_HAVE_GENERIC */
191 
192 #endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
193 
194 
195 #ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
196 #define INCLUDED_volk_32i_s32f_convert_32f_a_H
197 
198 #include <inttypes.h>
199 #include <stdio.h>
200 
201 #ifdef LV_HAVE_AVX512F
202 #include <immintrin.h>
203 
204 static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector,
205  const int32_t* inputVector,
206  const float scalar,
207  unsigned int num_points)
208 {
209  unsigned int number = 0;
210  const unsigned int onesixteenthPoints = num_points / 16;
211 
212  float* outputVectorPtr = outputVector;
213  const float iScalar = 1.0 / scalar;
214  __m512 invScalar = _mm512_set1_ps(iScalar);
215  int32_t* inputPtr = (int32_t*)inputVector;
216  __m512i inputVal;
217  __m512 ret;
218 
219  for (; number < onesixteenthPoints; number++) {
220  // Load the values
221  inputVal = _mm512_load_si512((__m512i*)inputPtr);
222 
223  ret = _mm512_cvtepi32_ps(inputVal);
224  ret = _mm512_mul_ps(ret, invScalar);
225 
226  _mm512_store_ps(outputVectorPtr, ret);
227 
228  outputVectorPtr += 16;
229  inputPtr += 16;
230  }
231 
232  number = onesixteenthPoints * 16;
233  for (; number < num_points; number++) {
234  outputVector[number] = ((float)(inputVector[number])) * iScalar;
235  }
236 }
237 #endif /* LV_HAVE_AVX512F */
238 
239 #ifdef LV_HAVE_AVX2
240 #include <immintrin.h>
241 
242 static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector,
243  const int32_t* inputVector,
244  const float scalar,
245  unsigned int num_points)
246 {
247  unsigned int number = 0;
248  const unsigned int oneEightPoints = num_points / 8;
249 
250  float* outputVectorPtr = outputVector;
251  const float iScalar = 1.0 / scalar;
252  __m256 invScalar = _mm256_set1_ps(iScalar);
253  int32_t* inputPtr = (int32_t*)inputVector;
254  __m256i inputVal;
255  __m256 ret;
256 
257  for (; number < oneEightPoints; number++) {
258  // Load the 4 values
259  inputVal = _mm256_load_si256((__m256i*)inputPtr);
260 
261  ret = _mm256_cvtepi32_ps(inputVal);
262  ret = _mm256_mul_ps(ret, invScalar);
263 
264  _mm256_store_ps(outputVectorPtr, ret);
265 
266  outputVectorPtr += 8;
267  inputPtr += 8;
268  }
269 
270  number = oneEightPoints * 8;
271  for (; number < num_points; number++) {
272  outputVector[number] = ((float)(inputVector[number])) * iScalar;
273  }
274 }
275 #endif /* LV_HAVE_AVX2 */
276 
277 
278 #ifdef LV_HAVE_SSE2
279 #include <emmintrin.h>
280 
/*
 * Convert num_points 32-bit signed integers to floats, multiplying each
 * converted value by the reciprocal of scalar (SSE2, aligned variant).
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer holding num_points int32_t values.
 * scalar:       divisor; the kernel multiplies by 1.0 / scalar.
 * num_points:   number of elements to convert.
 *
 * The 4-wide part uses the aligned load/store intrinsics, so both buffers
 * are expected to satisfy their alignment requirement (16 bytes).
 */
static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
                                                    const int32_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The reciprocal is computed once and reused by both loops. */
    const float reciprocal = 1.0 / scalar;
    const __m128 reciprocalVec = _mm_set_ps1(reciprocal);

    /* Largest multiple of 4 not exceeding num_points. */
    const unsigned int vectorizedCount = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Main loop: 4 integers per iteration. */
    for (; idx < vectorizedCount; idx += 4) {
        const __m128i ints = _mm_load_si128((const __m128i*)(inputVector + idx));
        const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(ints), reciprocalVec);
        _mm_store_ps(outputVector + idx, scaled);
    }

    /* Scalar tail for the remaining (num_points % 4) elements. */
    for (; idx < num_points; ++idx) {
        outputVector[idx] = ((float)inputVector[idx]) * reciprocal;
    }
}
314 #endif /* LV_HAVE_SSE2 */
315 
316 
317 #ifdef LV_HAVE_GENERIC
318 
/*
 * Convert num_points 32-bit signed integers to floats, multiplying each
 * converted value by the reciprocal of scalar (portable C implementation
 * carrying the "_a" name; it performs plain element-wise accesses).
 *
 * outputVector: destination buffer, receives num_points floats.
 * inputVector:  source buffer holding num_points int32_t values.
 * scalar:       divisor; the kernel multiplies by 1.0 / scalar.
 * num_points:   number of elements to convert.
 */
static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector,
                                                       const int32_t* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    /* One division up front; every element is then a single multiply. */
    const float reciprocal = 1.0 / scalar;

    for (unsigned int idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = ((float)inputVector[idx]) * reciprocal;
    }
}
333 #endif /* LV_HAVE_GENERIC */
334 
335 
336 #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_32i_s32f_convert_32f_a_generic(float *outputVector, const int32_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32i_s32f_convert_32f.h:319
static void volk_32i_s32f_convert_32f_u_sse2(float *outputVector, const int32_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32i_s32f_convert_32f.h:138
static void volk_32i_s32f_convert_32f_a_sse2(float *outputVector, const int32_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32i_s32f_convert_32f.h:281
static void volk_32i_s32f_convert_32f_generic(float *outputVector, const int32_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32i_s32f_convert_32f.h:176