Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
56 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
57 
58 #include <inttypes.h>
59 #include <limits.h>
60 #include <stdio.h>
61 
62 #ifdef LV_HAVE_AVX2
63 #include <immintrin.h>
64 
65 static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
66  const float* inputVector,
67  const float scalar,
68  unsigned int num_points)
69 {
70  unsigned int number = 0;
71 
72  const unsigned int sixteenthPoints = num_points / 16;
73 
74  const float* inputVectorPtr = (const float*)inputVector;
75  int16_t* outputVectorPtr = outputVector;
76 
77  float min_val = SHRT_MIN;
78  float max_val = SHRT_MAX;
79  float r;
80 
81  __m256 vScalar = _mm256_set1_ps(scalar);
82  __m256 inputVal1, inputVal2;
83  __m256i intInputVal1, intInputVal2;
84  __m256 ret1, ret2;
85  __m256 vmin_val = _mm256_set1_ps(min_val);
86  __m256 vmax_val = _mm256_set1_ps(max_val);
87 
88  for (; number < sixteenthPoints; number++) {
89  inputVal1 = _mm256_loadu_ps(inputVectorPtr);
90  inputVectorPtr += 8;
91  inputVal2 = _mm256_loadu_ps(inputVectorPtr);
92  inputVectorPtr += 8;
93 
94  // Scale and clip
95  ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
96  vmin_val);
97  ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
98  vmin_val);
99 
100  intInputVal1 = _mm256_cvtps_epi32(ret1);
101  intInputVal2 = _mm256_cvtps_epi32(ret2);
102 
103  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
104  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
105 
106  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
107  outputVectorPtr += 16;
108  }
109 
110  number = sixteenthPoints * 16;
111  for (; number < num_points; number++) {
112  r = inputVector[number] * scalar;
113  if (r > max_val)
114  r = max_val;
115  else if (r < min_val)
116  r = min_val;
117  outputVector[number] = (int16_t)rintf(r);
118  }
119 }
120 #endif /* LV_HAVE_AVX2 */
121 
122 
123 #ifdef LV_HAVE_AVX
124 #include <immintrin.h>
125 
126 static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
127  const float* inputVector,
128  const float scalar,
129  unsigned int num_points)
130 {
131  unsigned int number = 0;
132 
133  const unsigned int eighthPoints = num_points / 8;
134 
135  const float* inputVectorPtr = (const float*)inputVector;
136  int16_t* outputVectorPtr = outputVector;
137 
138  float min_val = SHRT_MIN;
139  float max_val = SHRT_MAX;
140  float r;
141 
142  __m256 vScalar = _mm256_set1_ps(scalar);
143  __m256 inputVal, ret;
144  __m256i intInputVal;
145  __m128i intInputVal1, intInputVal2;
146  __m256 vmin_val = _mm256_set1_ps(min_val);
147  __m256 vmax_val = _mm256_set1_ps(max_val);
148 
149  for (; number < eighthPoints; number++) {
150  inputVal = _mm256_loadu_ps(inputVectorPtr);
151  inputVectorPtr += 8;
152 
153  // Scale and clip
154  ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
155  vmin_val);
156 
157  intInputVal = _mm256_cvtps_epi32(ret);
158 
159  intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
160  intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
161 
162  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
163 
164  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
165  outputVectorPtr += 8;
166  }
167 
168  number = eighthPoints * 8;
169  for (; number < num_points; number++) {
170  r = inputVector[number] * scalar;
171  if (r > max_val)
172  r = max_val;
173  else if (r < min_val)
174  r = min_val;
175  outputVector[number] = (int16_t)rintf(r);
176  }
177 }
178 #endif /* LV_HAVE_AVX */
179 
180 
181 #ifdef LV_HAVE_SSE2
182 #include <emmintrin.h>
183 
184 static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
185  const float* inputVector,
186  const float scalar,
187  unsigned int num_points)
188 {
189  unsigned int number = 0;
190 
191  const unsigned int eighthPoints = num_points / 8;
192 
193  const float* inputVectorPtr = (const float*)inputVector;
194  int16_t* outputVectorPtr = outputVector;
195 
196  float min_val = SHRT_MIN;
197  float max_val = SHRT_MAX;
198  float r;
199 
200  __m128 vScalar = _mm_set_ps1(scalar);
201  __m128 inputVal1, inputVal2;
202  __m128i intInputVal1, intInputVal2;
203  __m128 ret1, ret2;
204  __m128 vmin_val = _mm_set_ps1(min_val);
205  __m128 vmax_val = _mm_set_ps1(max_val);
206 
207  for (; number < eighthPoints; number++) {
208  inputVal1 = _mm_loadu_ps(inputVectorPtr);
209  inputVectorPtr += 4;
210  inputVal2 = _mm_loadu_ps(inputVectorPtr);
211  inputVectorPtr += 4;
212 
213  // Scale and clip
214  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
215  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
216 
217  intInputVal1 = _mm_cvtps_epi32(ret1);
218  intInputVal2 = _mm_cvtps_epi32(ret2);
219 
220  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
221 
222  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
223  outputVectorPtr += 8;
224  }
225 
226  number = eighthPoints * 8;
227  for (; number < num_points; number++) {
228  r = inputVector[number] * scalar;
229  if (r > max_val)
230  r = max_val;
231  else if (r < min_val)
232  r = min_val;
233  outputVector[number] = (int16_t)rintf(r);
234  }
235 }
236 #endif /* LV_HAVE_SSE2 */
237 
238 
239 #ifdef LV_HAVE_SSE
240 #include <xmmintrin.h>
241 
242 static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
243  const float* inputVector,
244  const float scalar,
245  unsigned int num_points)
246 {
247  unsigned int number = 0;
248 
249  const unsigned int quarterPoints = num_points / 4;
250 
251  const float* inputVectorPtr = (const float*)inputVector;
252  int16_t* outputVectorPtr = outputVector;
253 
254  float min_val = SHRT_MIN;
255  float max_val = SHRT_MAX;
256  float r;
257 
258  __m128 vScalar = _mm_set_ps1(scalar);
259  __m128 ret;
260  __m128 vmin_val = _mm_set_ps1(min_val);
261  __m128 vmax_val = _mm_set_ps1(max_val);
262 
263  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
264 
265  for (; number < quarterPoints; number++) {
266  ret = _mm_loadu_ps(inputVectorPtr);
267  inputVectorPtr += 4;
268 
269  // Scale and clip
270  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
271 
272  _mm_store_ps(outputFloatBuffer, ret);
273  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
274  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
275  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
276  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
277  }
278 
279  number = quarterPoints * 4;
280  for (; number < num_points; number++) {
281  r = inputVector[number] * scalar;
282  if (r > max_val)
283  r = max_val;
284  else if (r < min_val)
285  r = min_val;
286  outputVector[number] = (int16_t)rintf(r);
287  }
288 }
289 #endif /* LV_HAVE_SSE */
290 
291 
292 #ifdef LV_HAVE_GENERIC
293 
294 static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
295  const float* inputVector,
296  const float scalar,
297  unsigned int num_points)
298 {
299  int16_t* outputVectorPtr = outputVector;
300  const float* inputVectorPtr = inputVector;
301  unsigned int number = 0;
302  float min_val = SHRT_MIN;
303  float max_val = SHRT_MAX;
304  float r;
305 
306  for (number = 0; number < num_points; number++) {
307  r = *inputVectorPtr++ * scalar;
308  if (r > max_val)
309  r = max_val;
310  else if (r < min_val)
311  r = min_val;
312  *outputVectorPtr++ = (int16_t)rintf(r);
313  }
314 }
315 #endif /* LV_HAVE_GENERIC */
316 
317 
318 #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
319 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
320 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
321 
322 #include <inttypes.h>
323 #include <math.h>
324 #include <stdio.h>
325 #include <volk/volk_common.h>
326 
327 #ifdef LV_HAVE_AVX2
328 #include <immintrin.h>
329 
330 static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
331  const float* inputVector,
332  const float scalar,
333  unsigned int num_points)
334 {
335  unsigned int number = 0;
336 
337  const unsigned int sixteenthPoints = num_points / 16;
338 
339  const float* inputVectorPtr = (const float*)inputVector;
340  int16_t* outputVectorPtr = outputVector;
341 
342  float min_val = SHRT_MIN;
343  float max_val = SHRT_MAX;
344  float r;
345 
346  __m256 vScalar = _mm256_set1_ps(scalar);
347  __m256 inputVal1, inputVal2;
348  __m256i intInputVal1, intInputVal2;
349  __m256 ret1, ret2;
350  __m256 vmin_val = _mm256_set1_ps(min_val);
351  __m256 vmax_val = _mm256_set1_ps(max_val);
352 
353  for (; number < sixteenthPoints; number++) {
354  inputVal1 = _mm256_load_ps(inputVectorPtr);
355  inputVectorPtr += 8;
356  inputVal2 = _mm256_load_ps(inputVectorPtr);
357  inputVectorPtr += 8;
358 
359  // Scale and clip
360  ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
361  vmin_val);
362  ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
363  vmin_val);
364 
365  intInputVal1 = _mm256_cvtps_epi32(ret1);
366  intInputVal2 = _mm256_cvtps_epi32(ret2);
367 
368  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
369  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
370 
371  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
372  outputVectorPtr += 16;
373  }
374 
375  number = sixteenthPoints * 16;
376  for (; number < num_points; number++) {
377  r = inputVector[number] * scalar;
378  if (r > max_val)
379  r = max_val;
380  else if (r < min_val)
381  r = min_val;
382  outputVector[number] = (int16_t)rintf(r);
383  }
384 }
385 #endif /* LV_HAVE_AVX2 */
386 
387 
388 #ifdef LV_HAVE_AVX
389 #include <immintrin.h>
390 
391 static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
392  const float* inputVector,
393  const float scalar,
394  unsigned int num_points)
395 {
396  unsigned int number = 0;
397 
398  const unsigned int eighthPoints = num_points / 8;
399 
400  const float* inputVectorPtr = (const float*)inputVector;
401  int16_t* outputVectorPtr = outputVector;
402 
403  float min_val = SHRT_MIN;
404  float max_val = SHRT_MAX;
405  float r;
406 
407  __m256 vScalar = _mm256_set1_ps(scalar);
408  __m256 inputVal, ret;
409  __m256i intInputVal;
410  __m128i intInputVal1, intInputVal2;
411  __m256 vmin_val = _mm256_set1_ps(min_val);
412  __m256 vmax_val = _mm256_set1_ps(max_val);
413 
414  for (; number < eighthPoints; number++) {
415  inputVal = _mm256_load_ps(inputVectorPtr);
416  inputVectorPtr += 8;
417 
418  // Scale and clip
419  ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
420  vmin_val);
421 
422  intInputVal = _mm256_cvtps_epi32(ret);
423 
424  intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
425  intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
426 
427  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
428 
429  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
430  outputVectorPtr += 8;
431  }
432 
433  number = eighthPoints * 8;
434  for (; number < num_points; number++) {
435  r = inputVector[number] * scalar;
436  if (r > max_val)
437  r = max_val;
438  else if (r < min_val)
439  r = min_val;
440  outputVector[number] = (int16_t)rintf(r);
441  }
442 }
443 #endif /* LV_HAVE_AVX */
444 
445 #ifdef LV_HAVE_SSE2
446 #include <emmintrin.h>
447 
448 static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
449  const float* inputVector,
450  const float scalar,
451  unsigned int num_points)
452 {
453  unsigned int number = 0;
454 
455  const unsigned int eighthPoints = num_points / 8;
456 
457  const float* inputVectorPtr = (const float*)inputVector;
458  int16_t* outputVectorPtr = outputVector;
459 
460  float min_val = SHRT_MIN;
461  float max_val = SHRT_MAX;
462  float r;
463 
464  __m128 vScalar = _mm_set_ps1(scalar);
465  __m128 inputVal1, inputVal2;
466  __m128i intInputVal1, intInputVal2;
467  __m128 ret1, ret2;
468  __m128 vmin_val = _mm_set_ps1(min_val);
469  __m128 vmax_val = _mm_set_ps1(max_val);
470 
471  for (; number < eighthPoints; number++) {
472  inputVal1 = _mm_load_ps(inputVectorPtr);
473  inputVectorPtr += 4;
474  inputVal2 = _mm_load_ps(inputVectorPtr);
475  inputVectorPtr += 4;
476 
477  // Scale and clip
478  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
479  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
480 
481  intInputVal1 = _mm_cvtps_epi32(ret1);
482  intInputVal2 = _mm_cvtps_epi32(ret2);
483 
484  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
485 
486  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
487  outputVectorPtr += 8;
488  }
489 
490  number = eighthPoints * 8;
491  for (; number < num_points; number++) {
492  r = inputVector[number] * scalar;
493  if (r > max_val)
494  r = max_val;
495  else if (r < min_val)
496  r = min_val;
497  outputVector[number] = (int16_t)rintf(r);
498  }
499 }
500 #endif /* LV_HAVE_SSE2 */
501 
502 
503 #ifdef LV_HAVE_SSE
504 #include <xmmintrin.h>
505 
506 static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
507  const float* inputVector,
508  const float scalar,
509  unsigned int num_points)
510 {
511  unsigned int number = 0;
512 
513  const unsigned int quarterPoints = num_points / 4;
514 
515  const float* inputVectorPtr = (const float*)inputVector;
516  int16_t* outputVectorPtr = outputVector;
517 
518  float min_val = SHRT_MIN;
519  float max_val = SHRT_MAX;
520  float r;
521 
522  __m128 vScalar = _mm_set_ps1(scalar);
523  __m128 ret;
524  __m128 vmin_val = _mm_set_ps1(min_val);
525  __m128 vmax_val = _mm_set_ps1(max_val);
526 
527  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
528 
529  for (; number < quarterPoints; number++) {
530  ret = _mm_load_ps(inputVectorPtr);
531  inputVectorPtr += 4;
532 
533  // Scale and clip
534  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
535 
536  _mm_store_ps(outputFloatBuffer, ret);
537  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
538  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
539  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
540  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
541  }
542 
543  number = quarterPoints * 4;
544  for (; number < num_points; number++) {
545  r = inputVector[number] * scalar;
546  if (r > max_val)
547  r = max_val;
548  else if (r < min_val)
549  r = min_val;
550  outputVector[number] = (int16_t)rintf(r);
551  }
552 }
553 #endif /* LV_HAVE_SSE */
554 
555 
556 #ifdef LV_HAVE_GENERIC
557 
558 static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
559  const float* inputVector,
560  const float scalar,
561  unsigned int num_points)
562 {
563  int16_t* outputVectorPtr = outputVector;
564  const float* inputVectorPtr = inputVector;
565  unsigned int number = 0;
566  float min_val = SHRT_MIN;
567  float max_val = SHRT_MAX;
568  float r;
569 
570  for (number = 0; number < num_points; number++) {
571  r = *inputVectorPtr++ * scalar;
572  if (r < min_val)
573  r = min_val;
574  else if (r > max_val)
575  r = max_val;
576  *outputVectorPtr++ = (int16_t)rintf(r);
577  }
578 }
579 #endif /* LV_HAVE_GENERIC */
580 
581 #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:448
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:242
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:391
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:184
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:558
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:126
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:294
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:506
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65