Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
58 #define INCLUDED_volk_32f_s32f_convert_32i_u_H
59 
60 #include <inttypes.h>
61 #include <limits.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX
65 #include <immintrin.h>
66 
67 static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
68  const float* inputVector,
69  const float scalar,
70  unsigned int num_points)
71 {
72  unsigned int number = 0;
73 
74  const unsigned int eighthPoints = num_points / 8;
75 
76  const float* inputVectorPtr = (const float*)inputVector;
77  int32_t* outputVectorPtr = outputVector;
78 
79  float min_val = INT_MIN;
80  float max_val = INT_MAX;
81  float r;
82 
83  __m256 vScalar = _mm256_set1_ps(scalar);
84  __m256 inputVal1;
85  __m256i intInputVal1;
86  __m256 vmin_val = _mm256_set1_ps(min_val);
87  __m256 vmax_val = _mm256_set1_ps(max_val);
88 
89  for (; number < eighthPoints; number++) {
90  inputVal1 = _mm256_loadu_ps(inputVectorPtr);
91  inputVectorPtr += 8;
92 
93  inputVal1 = _mm256_max_ps(
94  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
95  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
96 
97  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
98  outputVectorPtr += 8;
99  }
100 
101  number = eighthPoints * 8;
102  for (; number < num_points; number++) {
103  r = inputVector[number] * scalar;
104  if (r > max_val)
105  r = max_val;
106  else if (r < min_val)
107  r = min_val;
108  outputVector[number] = (int32_t)rintf(r);
109  }
110 }
111 
112 #endif /* LV_HAVE_AVX */
113 
114 #ifdef LV_HAVE_SSE2
115 #include <emmintrin.h>
116 
117 static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
118  const float* inputVector,
119  const float scalar,
120  unsigned int num_points)
121 {
122  unsigned int number = 0;
123 
124  const unsigned int quarterPoints = num_points / 4;
125 
126  const float* inputVectorPtr = (const float*)inputVector;
127  int32_t* outputVectorPtr = outputVector;
128 
129  float min_val = INT_MIN;
130  float max_val = INT_MAX;
131  float r;
132 
133  __m128 vScalar = _mm_set_ps1(scalar);
134  __m128 inputVal1;
135  __m128i intInputVal1;
136  __m128 vmin_val = _mm_set_ps1(min_val);
137  __m128 vmax_val = _mm_set_ps1(max_val);
138 
139  for (; number < quarterPoints; number++) {
140  inputVal1 = _mm_loadu_ps(inputVectorPtr);
141  inputVectorPtr += 4;
142 
143  inputVal1 =
144  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
145  intInputVal1 = _mm_cvtps_epi32(inputVal1);
146 
147  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
148  outputVectorPtr += 4;
149  }
150 
151  number = quarterPoints * 4;
152  for (; number < num_points; number++) {
153  r = inputVector[number] * scalar;
154  if (r > max_val)
155  r = max_val;
156  else if (r < min_val)
157  r = min_val;
158  outputVector[number] = (int32_t)rintf(r);
159  }
160 }
161 
162 #endif /* LV_HAVE_SSE2 */
163 
164 
165 #ifdef LV_HAVE_SSE
166 #include <xmmintrin.h>
167 
168 static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
169  const float* inputVector,
170  const float scalar,
171  unsigned int num_points)
172 {
173  unsigned int number = 0;
174 
175  const unsigned int quarterPoints = num_points / 4;
176 
177  const float* inputVectorPtr = (const float*)inputVector;
178  int32_t* outputVectorPtr = outputVector;
179 
180  float min_val = INT_MIN;
181  float max_val = INT_MAX;
182  float r;
183 
184  __m128 vScalar = _mm_set_ps1(scalar);
185  __m128 ret;
186  __m128 vmin_val = _mm_set_ps1(min_val);
187  __m128 vmax_val = _mm_set_ps1(max_val);
188 
189  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
190 
191  for (; number < quarterPoints; number++) {
192  ret = _mm_loadu_ps(inputVectorPtr);
193  inputVectorPtr += 4;
194 
195  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
196 
197  _mm_store_ps(outputFloatBuffer, ret);
198  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
199  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
200  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
201  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
202  }
203 
204  number = quarterPoints * 4;
205  for (; number < num_points; number++) {
206  r = inputVector[number] * scalar;
207  if (r > max_val)
208  r = max_val;
209  else if (r < min_val)
210  r = min_val;
211  outputVector[number] = (int32_t)rintf(r);
212  }
213 }
214 
215 #endif /* LV_HAVE_SSE */
216 
217 
218 #ifdef LV_HAVE_GENERIC
219 
220 static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
221  const float* inputVector,
222  const float scalar,
223  unsigned int num_points)
224 {
225  int32_t* outputVectorPtr = outputVector;
226  const float* inputVectorPtr = inputVector;
227  const float min_val = (float)INT_MIN;
228  const float max_val = (float)INT_MAX;
229 
230  for (unsigned int number = 0; number < num_points; number++) {
231  const float r = *inputVectorPtr++ * scalar;
232  int s;
233  if (r >= max_val)
234  s = INT_MAX;
235  else if (r < min_val)
236  s = INT_MIN;
237  else
238  s = (int32_t)rintf(r);
239  *outputVectorPtr++ = s;
240  }
241 }
242 
243 #endif /* LV_HAVE_GENERIC */
244 
245 
246 #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
247 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
248 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
249 
250 #include <inttypes.h>
251 #include <stdio.h>
252 #include <volk/volk_common.h>
253 
254 #ifdef LV_HAVE_AVX
255 #include <immintrin.h>
256 
257 static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
258  const float* inputVector,
259  const float scalar,
260  unsigned int num_points)
261 {
262  unsigned int number = 0;
263 
264  const unsigned int eighthPoints = num_points / 8;
265 
266  const float* inputVectorPtr = (const float*)inputVector;
267  int32_t* outputVectorPtr = outputVector;
268 
269  float min_val = INT_MIN;
270  float max_val = INT_MAX;
271  float r;
272 
273  __m256 vScalar = _mm256_set1_ps(scalar);
274  __m256 inputVal1;
275  __m256i intInputVal1;
276  __m256 vmin_val = _mm256_set1_ps(min_val);
277  __m256 vmax_val = _mm256_set1_ps(max_val);
278 
279  for (; number < eighthPoints; number++) {
280  inputVal1 = _mm256_load_ps(inputVectorPtr);
281  inputVectorPtr += 8;
282 
283  inputVal1 = _mm256_max_ps(
284  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
285  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
286 
287  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
288  outputVectorPtr += 8;
289  }
290 
291  number = eighthPoints * 8;
292  for (; number < num_points; number++) {
293  r = inputVector[number] * scalar;
294  if (r > max_val)
295  r = max_val;
296  else if (r < min_val)
297  r = min_val;
298  outputVector[number] = (int32_t)rintf(r);
299  }
300 }
301 
302 #endif /* LV_HAVE_AVX */
303 
304 
305 #ifdef LV_HAVE_SSE2
306 #include <emmintrin.h>
307 
308 static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
309  const float* inputVector,
310  const float scalar,
311  unsigned int num_points)
312 {
313  unsigned int number = 0;
314 
315  const unsigned int quarterPoints = num_points / 4;
316 
317  const float* inputVectorPtr = (const float*)inputVector;
318  int32_t* outputVectorPtr = outputVector;
319 
320  float min_val = INT_MIN;
321  float max_val = INT_MAX;
322  float r;
323 
324  __m128 vScalar = _mm_set_ps1(scalar);
325  __m128 inputVal1;
326  __m128i intInputVal1;
327  __m128 vmin_val = _mm_set_ps1(min_val);
328  __m128 vmax_val = _mm_set_ps1(max_val);
329 
330  for (; number < quarterPoints; number++) {
331  inputVal1 = _mm_load_ps(inputVectorPtr);
332  inputVectorPtr += 4;
333 
334  inputVal1 =
335  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
336  intInputVal1 = _mm_cvtps_epi32(inputVal1);
337 
338  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
339  outputVectorPtr += 4;
340  }
341 
342  number = quarterPoints * 4;
343  for (; number < num_points; number++) {
344  r = inputVector[number] * scalar;
345  if (r > max_val)
346  r = max_val;
347  else if (r < min_val)
348  r = min_val;
349  outputVector[number] = (int32_t)rintf(r);
350  }
351 }
352 
353 #endif /* LV_HAVE_SSE2 */
354 
355 
356 #ifdef LV_HAVE_SSE
357 #include <xmmintrin.h>
358 
359 static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
360  const float* inputVector,
361  const float scalar,
362  unsigned int num_points)
363 {
364  unsigned int number = 0;
365 
366  const unsigned int quarterPoints = num_points / 4;
367 
368  const float* inputVectorPtr = (const float*)inputVector;
369  int32_t* outputVectorPtr = outputVector;
370 
371  float min_val = INT_MIN;
372  float max_val = INT_MAX;
373  float r;
374 
375  __m128 vScalar = _mm_set_ps1(scalar);
376  __m128 ret;
377  __m128 vmin_val = _mm_set_ps1(min_val);
378  __m128 vmax_val = _mm_set_ps1(max_val);
379 
380  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
381 
382  for (; number < quarterPoints; number++) {
383  ret = _mm_load_ps(inputVectorPtr);
384  inputVectorPtr += 4;
385 
386  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
387 
388  _mm_store_ps(outputFloatBuffer, ret);
389  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
390  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
391  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
392  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
393  }
394 
395  number = quarterPoints * 4;
396  for (; number < num_points; number++) {
397  r = inputVector[number] * scalar;
398  if (r > max_val)
399  r = max_val;
400  else if (r < min_val)
401  r = min_val;
402  outputVector[number] = (int32_t)rintf(r);
403  }
404 }
405 
406 #endif /* LV_HAVE_SSE */
407 
408 
409 #ifdef LV_HAVE_GENERIC
410 
/*!
 * \brief Aligned-entry-point variant of the portable kernel.
 *
 * Performs no work of its own: all four arguments are forwarded unchanged to
 * volk_32f_s32f_convert_32i_generic().
 *
 * \param outputVector Destination for num_points converted integers.
 * \param inputVector  Source of num_points floats.
 * \param scalar       Factor applied to every input before conversion.
 * \param num_points   Number of elements to convert.
 */
static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
                                                       const float* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    /* Straight delegation to the generic kernel. */
    volk_32f_s32f_convert_32i_generic(outputVector, inputVector, scalar, num_points);
}
418 
419 #endif /* LV_HAVE_GENERIC */
420 
421 #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:359
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:257
static void volk_32f_s32f_convert_32i_a_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:411
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:308
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:168
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:220
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:67
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:117
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65