Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
67 {
68  float min_val = INT8_MIN;
69  float max_val = INT8_MAX;
70  if (in > max_val) {
71  *out = (int8_t)(max_val);
72  } else if (in < min_val) {
73  *out = (int8_t)(min_val);
74  } else {
75  *out = (int8_t)(rintf(in));
76  }
77 }
78 
79 #ifdef LV_HAVE_AVX2
80 #include <immintrin.h>
81 
/*
 * Scale, clamp and convert floats to int8_t with AVX2 (unaligned buffers).
 *
 * Every input is multiplied by `scalar`, limited to [INT8_MIN, INT8_MAX] and
 * converted with _mm256_cvtps_epi32. The main loop handles 32 points per
 * iteration using unaligned loads/stores; the remaining num_points % 32
 * points are converted by volk_32f_s32f_convert_8i_single().
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;

    const float min_val = INT8_MIN;
    const float max_val = INT8_MAX;
    float r;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);
    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m256i intInputVal;

    for (; number < thirtysecondPoints; number++) {
        inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        inputVal2 = _mm256_loadu_ps(inputVectorPtr + 8);
        inputVal3 = _mm256_loadu_ps(inputVectorPtr + 16);
        inputVal4 = _mm256_loadu_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* scale, then clamp: max(min(x * scalar, max_val), min_val) */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions work on each 128-bit lane separately,
         * which leaves the 64-bit quarters in the order 0,2,1,3. The permute
         * control 0xD8 (binary 11 01 10 00) picks quarters 0,2,1,3 and thereby
         * restores sequential order. It is spelled in hex because binary
         * literals are not standard C before C23.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0xD8);

        intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
        intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* scalar tail for the points that do not fill a whole 32-sample block */
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        r = inputVector[number] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    }
}
147 
148 #endif /* LV_HAVE_AVX2 */
149 
150 
151 #ifdef LV_HAVE_SSE2
152 #include <emmintrin.h>
153 
/*
 * Scale, clamp and convert floats to int8_t with SSE2 (unaligned buffers).
 *
 * Sixteen points are processed per iteration: four unaligned 4-float loads
 * are multiplied by `scalar`, limited to [INT8_MIN, INT8_MAX], converted with
 * _mm_cvtps_epi32 and narrowed 32 -> 16 -> 8 bits with the saturating pack
 * instructions. The remaining num_points % 16 points are converted by
 * volk_32f_s32f_convert_8i_single().
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const float* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int i;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);

    for (i = 0; i < blocks; i++) {
        __m128 f0 = _mm_mul_ps(_mm_loadu_ps(src), scale);
        __m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), scale);
        __m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), scale);
        __m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), scale);
        __m128i words01, words23;

        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        /* 4 x int32 pairs -> 8 x int16 each */
        words01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        words23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

        /* 2 x 8 x int16 -> 16 x int8 */
        _mm_storeu_si128((__m128i*)dst, _mm_packs_epi16(words01, words23));

        src += 16;
        dst += 16;
    }

    /* scalar tail for the points that do not fill a whole 16-sample block */
    for (i = blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
215 
216 #endif /* LV_HAVE_SSE2 */
217 
218 
219 #ifdef LV_HAVE_SSE
220 #include <xmmintrin.h>
221 
222 static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
223  const float* inputVector,
224  const float scalar,
225  unsigned int num_points)
226 {
227  unsigned int number = 0;
228  size_t inner_loop;
229 
230  const unsigned int quarterPoints = num_points / 4;
231 
232  const float* inputVectorPtr = (const float*)inputVector;
233  int8_t* outputVectorPtr = outputVector;
234 
235  float min_val = INT8_MIN;
236  float max_val = INT8_MAX;
237  float r;
238 
239  __m128 vScalar = _mm_set_ps1(scalar);
240  __m128 ret;
241  __m128 vmin_val = _mm_set_ps1(min_val);
242  __m128 vmax_val = _mm_set_ps1(max_val);
243 
244  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
245 
246  for (; number < quarterPoints; number++) {
247  ret = _mm_loadu_ps(inputVectorPtr);
248  inputVectorPtr += 4;
249 
250  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
251 
252  _mm_store_ps(outputFloatBuffer, ret);
253  for (inner_loop = 0; inner_loop < 4; inner_loop++) {
254  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
255  }
256  }
257 
258  number = quarterPoints * 4;
259  for (; number < num_points; number++) {
260  r = inputVector[number] * scalar;
261  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
262  }
263 }
264 
265 #endif /* LV_HAVE_SSE */
266 
267 
268 #ifdef LV_HAVE_GENERIC
269 
/*
 * Portable reference kernel: scale each float and convert it to int8_t.
 *
 * Every input is multiplied by `scalar` and handed to
 * volk_32f_s32f_convert_8i_single(), which performs the clamping and
 * rounding.
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + i, scaled);
    }
}
284 
285 #endif /* LV_HAVE_GENERIC */
286 
287 
288 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
289 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
290 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
291 
292 #include <inttypes.h>
293 #include <stdio.h>
294 #include <volk/volk_common.h>
295 
296 #ifdef LV_HAVE_AVX2
297 #include <immintrin.h>
298 
/*
 * Scale, clamp and convert floats to int8_t with AVX2 (aligned buffers).
 *
 * Every input is multiplied by `scalar`, limited to [INT8_MIN, INT8_MAX] and
 * converted with _mm256_cvtps_epi32. The main loop handles 32 points per
 * iteration using the aligned load/store intrinsics (_mm256_load_ps,
 * _mm256_store_si256), so both buffers must satisfy their 32-byte alignment
 * requirement. The remaining num_points % 32 points are converted by
 * volk_32f_s32f_convert_8i_single().
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int thirtysecondPoints = num_points / 32;

    const float* inputVectorPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;

    const float min_val = INT8_MIN;
    const float max_val = INT8_MAX;
    float r;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);
    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m256i intInputVal;

    for (; number < thirtysecondPoints; number++) {
        inputVal1 = _mm256_load_ps(inputVectorPtr);
        inputVal2 = _mm256_load_ps(inputVectorPtr + 8);
        inputVal3 = _mm256_load_ps(inputVectorPtr + 16);
        inputVal4 = _mm256_load_ps(inputVectorPtr + 24);
        inputVectorPtr += 32;

        /* scale, then clamp: max(min(x * scalar, max_val), min_val) */
        inputVal1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        inputVal2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
        inputVal3 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
        inputVal4 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

        intInputVal1 = _mm256_cvtps_epi32(inputVal1);
        intInputVal2 = _mm256_cvtps_epi32(inputVal2);
        intInputVal3 = _mm256_cvtps_epi32(inputVal3);
        intInputVal4 = _mm256_cvtps_epi32(inputVal4);

        /*
         * The 256-bit pack instructions work on each 128-bit lane separately,
         * which leaves the 64-bit quarters in the order 0,2,1,3. The permute
         * control 0xD8 (binary 11 01 10 00) picks quarters 0,2,1,3 and thereby
         * restores sequential order. It is spelled in hex because binary
         * literals are not standard C before C23.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);
        intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
        intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0xD8);

        intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
        intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 32;
    }

    /* scalar tail for the points that do not fill a whole 32-sample block */
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        r = inputVector[number] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[number], r);
    }
}
364 
365 #endif /* LV_HAVE_AVX2 */
366 
367 
368 #ifdef LV_HAVE_SSE2
369 #include <emmintrin.h>
370 
/*
 * Scale, clamp and convert floats to int8_t with SSE2 (aligned buffers).
 *
 * Sixteen points are processed per iteration: four aligned 4-float loads are
 * multiplied by `scalar`, limited to [INT8_MIN, INT8_MAX], converted with
 * _mm_cvtps_epi32 and narrowed 32 -> 16 -> 8 bits with the saturating pack
 * instructions. The aligned intrinsics (_mm_load_ps, _mm_store_si128) are
 * used, so both buffers must satisfy their 16-byte alignment requirement.
 * The remaining num_points % 16 points are converted by
 * volk_32f_s32f_convert_8i_single().
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const float* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int i;

    const __m128 scale = _mm_set_ps1(scalar);
    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);

    for (i = 0; i < blocks; i++) {
        __m128 f0 = _mm_mul_ps(_mm_load_ps(src), scale);
        __m128 f1 = _mm_mul_ps(_mm_load_ps(src + 4), scale);
        __m128 f2 = _mm_mul_ps(_mm_load_ps(src + 8), scale);
        __m128 f3 = _mm_mul_ps(_mm_load_ps(src + 12), scale);
        __m128i words01, words23;

        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        /* 4 x int32 pairs -> 8 x int16 each */
        words01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        words23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

        /* 2 x 8 x int16 -> 16 x int8 */
        _mm_store_si128((__m128i*)dst, _mm_packs_epi16(words01, words23));

        src += 16;
        dst += 16;
    }

    /* scalar tail for the points that do not fill a whole 16-sample block */
    for (i = blocks * 16; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[i], scaled);
    }
}
432 #endif /* LV_HAVE_SSE2 */
433 
434 
435 #ifdef LV_HAVE_SSE
436 #include <xmmintrin.h>
437 
438 static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
439  const float* inputVector,
440  const float scalar,
441  unsigned int num_points)
442 {
443  unsigned int number = 0;
444  size_t inner_loop;
445 
446  const unsigned int quarterPoints = num_points / 4;
447 
448  const float* inputVectorPtr = (const float*)inputVector;
449 
450  float min_val = INT8_MIN;
451  float max_val = INT8_MAX;
452  float r;
453 
454  int8_t* outputVectorPtr = outputVector;
455  __m128 vScalar = _mm_set_ps1(scalar);
456  __m128 ret;
457  __m128 vmin_val = _mm_set_ps1(min_val);
458  __m128 vmax_val = _mm_set_ps1(max_val);
459 
460  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
461 
462  for (; number < quarterPoints; number++) {
463  ret = _mm_load_ps(inputVectorPtr);
464  inputVectorPtr += 4;
465 
466  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
467 
468  _mm_store_ps(outputFloatBuffer, ret);
469  for (inner_loop = 0; inner_loop < 4; inner_loop++) {
470  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
471  }
472  }
473 
474  number = quarterPoints * 4;
475  for (; number < num_points; number++) {
476  r = inputVector[number] * scalar;
477  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
478  }
479 }
480 
481 #endif /* LV_HAVE_SSE */
482 
483 
484 #ifdef LV_HAVE_GENERIC
485 
/*
 * Portable kernel registered under the aligned ("_a") name: scale each float
 * and convert it to int8_t.
 *
 * Every input is multiplied by `scalar` and handed to
 * volk_32f_s32f_convert_8i_single(), which performs the clamping and
 * rounding. No aligned-access intrinsics are used here.
 *
 * outputVector: receives num_points converted samples
 * inputVector:  num_points input floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of samples to process
 */
static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const float scaled = inputVector[i] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + i, scaled);
    }
}
500 
501 #endif /* LV_HAVE_GENERIC */
502 
503 
504 #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:371
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:66
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:222
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:486
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:438
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:270
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:154
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65