55 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
56 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
63 #include <immintrin.h>
/*
 * AVX2 kernel built on the unaligned load/store intrinsics
 * (_mm256_loadu_ps / _mm256_storeu_si256). Each input float is multiplied by
 * the scale factor and written out as int16_t: 16 points per vector
 * iteration, then a scalar loop for the remaining num_points % 16 points.
 *
 * outputVector - destination int16_t buffer
 * inputVector  - source float buffer
 * num_points   - number of points to convert
 *
 * NOTE(review): this is a listing with gaps (the embedded line numbers skip).
 * The `scalar` parameter, the ret1/ret2/r declarations, the input-pointer
 * advances and the second operand of _mm256_max_ps are not shown; confirm
 * against the complete header before relying on the notes below.
 */
65 static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
66 const float* inputVector,
68 unsigned int num_points)
70 unsigned int number = 0;
/* Number of complete 16-point groups handled by the vector loop. */
72 const unsigned int sixteenthPoints = num_points / 16;
74 const float* inputVectorPtr = (
const float*)inputVector;
75 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats so they can be compared with the scaled value. */
77 float min_val = SHRT_MIN;
78 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
81 __m256 vScalar = _mm256_set1_ps(scalar);
82 __m256 inputVal1, inputVal2;
83 __m256i intInputVal1, intInputVal2;
85 __m256 vmin_val = _mm256_set1_ps(min_val);
86 __m256 vmax_val = _mm256_set1_ps(max_val);
88 for (; number < sixteenthPoints; number++) {
/*
 * Two 8-float loads per iteration. NOTE(review): as listed, both loads read
 * the same address; embedded lines 90 and 92 are absent and presumably
 * advance inputVectorPtr - confirm.
 */
89 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
91 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
/*
 * Scale, then cap at vmax_val. The outer _mm256_max_ps takes its second
 * operand on a line missing from this listing (presumably vmin_val).
 */
95 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
97 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
/* Convert the packed floats to packed 32-bit integers. */
100 intInputVal1 = _mm256_cvtps_epi32(ret1);
101 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * Narrow 2 x 8 int32 to 16 int16 with signed saturation. The pack operates
 * within each 128-bit lane, so the 64-bit quarters come out ordered
 * [val1.lo, val2.lo, val1.hi, val2.hi]; the permute with selectors 0,2,1,3
 * (0b11011000) puts them back in input order.
 */
103 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
104 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Unaligned 256-bit store of the 16 results. */
106 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
107 outputVectorPtr += 16;
/* Scalar tail: resume at the first point the vector loop did not cover. */
110 number = sixteenthPoints * 16;
111 for (; number < num_points; number++) {
112 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
115 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
117 outputVector[number] = (int16_t)
rintf(r);
124 #include <immintrin.h>
/*
 * Fragment of an AVX kernel that uses the unaligned load _mm256_loadu_ps.
 * The first signature line is not part of this listing; the cross-reference
 * index at the end of the file places volk_32f_s32f_convert_16i_u_avx at
 * header line 126, directly before this fragment.
 *
 * Visible behaviour: multiply each input float by the scale factor, 8 points
 * per vector iteration, then a scalar loop for the remaining
 * num_points % 8 points, rounding with rintf and storing int16_t.
 *
 * NOTE(review): the `scalar` parameter, the intInputVal and r declarations,
 * the second operand of _mm256_max_ps, the pointer advance for the input and
 * the pack/store of the result are on lines missing from this listing.
 */
127 const float* inputVector,
129 unsigned int num_points)
131 unsigned int number = 0;
/* Number of complete 8-point groups handled by the vector loop. */
133 const unsigned int eighthPoints = num_points / 8;
135 const float* inputVectorPtr = (
const float*)inputVector;
136 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
138 float min_val = SHRT_MIN;
139 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
142 __m256 vScalar = _mm256_set1_ps(scalar);
143 __m256 inputVal, ret;
145 __m128i intInputVal1, intInputVal2;
146 __m256 vmin_val = _mm256_set1_ps(min_val);
147 __m256 vmax_val = _mm256_set1_ps(max_val);
149 for (; number < eighthPoints; number++) {
150 inputVal = _mm256_loadu_ps(inputVectorPtr);
/* Scale, then cap at vmax_val; the lower-bound operand is presumably vmin_val (line absent). */
154 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
/* Convert the eight floats to eight 32-bit integers. */
157 intInputVal = _mm256_cvtps_epi32(ret);
/*
 * Split into the low (index 0) and high (index 1) 128-bit halves. The
 * narrowing to 16 bits and the store happen on lines absent from this
 * listing (embedded lines 161-164).
 */
159 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
160 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
165 outputVectorPtr += 8;
/* Scalar tail: resume at the first point the vector loop did not cover. */
168 number = eighthPoints * 8;
169 for (; number < num_points; number++) {
170 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
173 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
175 outputVector[number] = (int16_t)
rintf(r);
182 #include <emmintrin.h>
/*
 * Fragment of an SSE2 kernel (it follows #include <emmintrin.h>). The first
 * signature line is not part of this listing; the cross-reference index at
 * the end of the file places volk_32f_s32f_convert_16i_u_sse2 at header
 * line 184, directly before this fragment.
 *
 * Visible behaviour: the vector loop runs num_points / 8 times and advances
 * the output pointer by 8 int16_t per iteration; a scalar loop then handles
 * the remaining num_points % 8 points.
 *
 * NOTE(review): the whole body of the vector loop (embedded lines 208-222)
 * and the `scalar` parameter are missing from this listing, so the
 * load/clamp/pack/store sequence cannot be described from here.
 */
185 const float* inputVector,
187 unsigned int num_points)
189 unsigned int number = 0;
/* Number of complete 8-point groups handled by the vector loop. */
191 const unsigned int eighthPoints = num_points / 8;
193 const float* inputVectorPtr = (
const float*)inputVector;
194 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
196 float min_val = SHRT_MIN;
197 float max_val = SHRT_MAX;
/* Two 4-float registers and two integer registers: presumably 2 x 4 points per iteration. */
201 __m128 inputVal1, inputVal2;
202 __m128i intInputVal1, intInputVal2;
207 for (; number < eighthPoints; number++) {
223 outputVectorPtr += 8;
/* Scalar tail: resume at the first point the vector loop did not cover. */
226 number = eighthPoints * 8;
227 for (; number < num_points; number++) {
228 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
231 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
233 outputVector[number] = (int16_t)
rintf(r);
240 #include <xmmintrin.h>
/*
 * Fragment of an SSE kernel (it follows #include <xmmintrin.h>). The first
 * signature line is not part of this listing; the cross-reference index at
 * the end of the file places volk_32f_s32f_convert_16i_u_sse at header
 * line 242, directly before this fragment.
 *
 * Visible behaviour: 4 points per vector iteration; the four values found in
 * outputFloatBuffer are rounded with rintf and stored one by one as int16_t.
 * A scalar loop handles the remaining num_points % 4 points.
 *
 * NOTE(review): the `scalar` parameter, the declaration of outputFloatBuffer
 * and the vector load/scale/clamp/store into it (embedded lines 256-272) are
 * missing from this listing.
 */
243 const float* inputVector,
245 unsigned int num_points)
247 unsigned int number = 0;
/* Number of complete 4-point groups handled by the vector loop. */
249 const unsigned int quarterPoints = num_points / 4;
251 const float* inputVectorPtr = (
const float*)inputVector;
252 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
254 float min_val = SHRT_MIN;
255 float max_val = SHRT_MAX;
265 for (; number < quarterPoints; number++) {
/* Round each of the four buffered floats and append it to the output. */
273 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
274 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
275 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
276 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: resume at the first point the vector loop did not cover. */
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
284 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
286 outputVector[number] = (int16_t)
rintf(r);
292 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable (non-SIMD) kernel, compiled under
 * LV_HAVE_GENERIC. The first signature line is not part of this listing; the
 * cross-reference index at the end of the file places
 * volk_32f_s32f_convert_16i_generic at header line 294, directly before this
 * fragment.
 *
 * Visible behaviour: walk both buffers with post-incremented pointers; for
 * each of the num_points inputs, multiply by the scale factor, round with
 * rintf and store as int16_t.
 *
 * NOTE(review): the `scalar` parameter, the declaration of r and the first
 * branch of the clamp are missing from this listing.
 */
295 const float* inputVector,
297 unsigned int num_points)
299 int16_t* outputVectorPtr = outputVector;
300 const float* inputVectorPtr = inputVector;
301 unsigned int number = 0;
/* Limits of short, held as floats for the clamp. */
302 float min_val = SHRT_MIN;
303 float max_val = SHRT_MAX;
306 for (number = 0; number < num_points; number++) {
307 r = *inputVectorPtr++ * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
310 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
312 *outputVectorPtr++ = (int16_t)
rintf(r);
319 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
320 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
322 #include <inttypes.h>
328 #include <immintrin.h>
/*
 * AVX2 kernel built on the aligned load/store intrinsics (_mm256_load_ps /
 * _mm256_store_si256), which require 32-byte-aligned addresses. Each input
 * float is multiplied by the scale factor and written out as int16_t:
 * 16 points per vector iteration, then a scalar loop for the remaining
 * num_points % 16 points.
 *
 * outputVector - destination int16_t buffer (accessed with an aligned store)
 * inputVector  - source float buffer (accessed with aligned loads)
 * num_points   - number of points to convert
 *
 * NOTE(review): this is a listing with gaps (the embedded line numbers skip).
 * The `scalar` parameter, the ret1/ret2/r declarations, the input-pointer
 * advances and the second operand of _mm256_max_ps are not shown; confirm
 * against the complete header before relying on the notes below.
 */
330 static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
331 const float* inputVector,
333 unsigned int num_points)
335 unsigned int number = 0;
/* Number of complete 16-point groups handled by the vector loop. */
337 const unsigned int sixteenthPoints = num_points / 16;
339 const float* inputVectorPtr = (
const float*)inputVector;
340 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats so they can be compared with the scaled value. */
342 float min_val = SHRT_MIN;
343 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
346 __m256 vScalar = _mm256_set1_ps(scalar);
347 __m256 inputVal1, inputVal2;
348 __m256i intInputVal1, intInputVal2;
350 __m256 vmin_val = _mm256_set1_ps(min_val);
351 __m256 vmax_val = _mm256_set1_ps(max_val);
353 for (; number < sixteenthPoints; number++) {
/*
 * Two aligned 8-float loads per iteration. NOTE(review): as listed, both
 * loads read the same address; embedded lines 355 and 357 are absent and
 * presumably advance inputVectorPtr - confirm.
 */
354 inputVal1 = _mm256_load_ps(inputVectorPtr);
356 inputVal2 = _mm256_load_ps(inputVectorPtr);
/*
 * Scale, then cap at vmax_val. The outer _mm256_max_ps takes its second
 * operand on a line missing from this listing (presumably vmin_val).
 */
360 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
362 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
/* Convert the packed floats to packed 32-bit integers. */
365 intInputVal1 = _mm256_cvtps_epi32(ret1);
366 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * Narrow 2 x 8 int32 to 16 int16 with signed saturation. The pack operates
 * within each 128-bit lane, so the 64-bit quarters come out ordered
 * [val1.lo, val2.lo, val1.hi, val2.hi]; the permute with selectors 0,2,1,3
 * (0b11011000) puts them back in input order.
 */
368 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
369 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Aligned 256-bit store of the 16 results. */
371 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
372 outputVectorPtr += 16;
/* Scalar tail: resume at the first point the vector loop did not cover. */
375 number = sixteenthPoints * 16;
376 for (; number < num_points; number++) {
377 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
380 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
382 outputVector[number] = (int16_t)
rintf(r);
389 #include <immintrin.h>
/*
 * Fragment of an AVX kernel that uses the aligned load _mm256_load_ps, which
 * requires a 32-byte-aligned input address. The first signature line is not
 * part of this listing; the cross-reference index at the end of the file
 * places volk_32f_s32f_convert_16i_a_avx at header line 391, directly before
 * this fragment.
 *
 * Visible behaviour: multiply each input float by the scale factor, 8 points
 * per vector iteration, then a scalar loop for the remaining
 * num_points % 8 points, rounding with rintf and storing int16_t.
 *
 * NOTE(review): the `scalar` parameter, the intInputVal and r declarations,
 * the second operand of _mm256_max_ps, the pointer advance for the input and
 * the pack/store of the result are on lines missing from this listing.
 */
392 const float* inputVector,
394 unsigned int num_points)
396 unsigned int number = 0;
/* Number of complete 8-point groups handled by the vector loop. */
398 const unsigned int eighthPoints = num_points / 8;
400 const float* inputVectorPtr = (
const float*)inputVector;
401 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
403 float min_val = SHRT_MIN;
404 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
407 __m256 vScalar = _mm256_set1_ps(scalar);
408 __m256 inputVal, ret;
410 __m128i intInputVal1, intInputVal2;
411 __m256 vmin_val = _mm256_set1_ps(min_val);
412 __m256 vmax_val = _mm256_set1_ps(max_val);
414 for (; number < eighthPoints; number++) {
415 inputVal = _mm256_load_ps(inputVectorPtr);
/* Scale, then cap at vmax_val; the lower-bound operand is presumably vmin_val (line absent). */
419 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
/* Convert the eight floats to eight 32-bit integers. */
422 intInputVal = _mm256_cvtps_epi32(ret);
/*
 * Split into the low (index 0) and high (index 1) 128-bit halves. The
 * narrowing to 16 bits and the store happen on lines absent from this
 * listing (embedded lines 426-429).
 */
424 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
425 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
430 outputVectorPtr += 8;
/* Scalar tail: resume at the first point the vector loop did not cover. */
433 number = eighthPoints * 8;
434 for (; number < num_points; number++) {
435 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
438 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
440 outputVector[number] = (int16_t)
rintf(r);
446 #include <emmintrin.h>
/*
 * Fragment of an SSE2 kernel (it follows #include <emmintrin.h>). The first
 * signature line is not part of this listing; the cross-reference index at
 * the end of the file places volk_32f_s32f_convert_16i_a_sse2 at header
 * line 448, directly before this fragment.
 *
 * Visible behaviour: the vector loop runs num_points / 8 times and advances
 * the output pointer by 8 int16_t per iteration; a scalar loop then handles
 * the remaining num_points % 8 points.
 *
 * NOTE(review): the whole body of the vector loop (embedded lines 472-486)
 * and the `scalar` parameter are missing from this listing, so the
 * load/clamp/pack/store sequence - including whether aligned intrinsics are
 * used, as the `_a_` name suggests - cannot be confirmed from here.
 */
449 const float* inputVector,
451 unsigned int num_points)
453 unsigned int number = 0;
/* Number of complete 8-point groups handled by the vector loop. */
455 const unsigned int eighthPoints = num_points / 8;
457 const float* inputVectorPtr = (
const float*)inputVector;
458 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
460 float min_val = SHRT_MIN;
461 float max_val = SHRT_MAX;
/* Two 4-float registers and two integer registers: presumably 2 x 4 points per iteration. */
465 __m128 inputVal1, inputVal2;
466 __m128i intInputVal1, intInputVal2;
471 for (; number < eighthPoints; number++) {
487 outputVectorPtr += 8;
/* Scalar tail: resume at the first point the vector loop did not cover. */
490 number = eighthPoints * 8;
491 for (; number < num_points; number++) {
492 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
495 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
497 outputVector[number] = (int16_t)
rintf(r);
504 #include <xmmintrin.h>
/*
 * Fragment of an SSE kernel (it follows #include <xmmintrin.h>). The first
 * signature line is not part of this listing; the cross-reference index at
 * the end of the file places volk_32f_s32f_convert_16i_a_sse at header
 * line 506, directly before this fragment.
 *
 * Visible behaviour: 4 points per vector iteration; the four values found in
 * outputFloatBuffer are rounded with rintf and stored one by one as int16_t.
 * A scalar loop handles the remaining num_points % 4 points.
 *
 * NOTE(review): the `scalar` parameter, the declaration of outputFloatBuffer
 * and the vector load/scale/clamp/store into it (embedded lines 520-536) are
 * missing from this listing.
 */
507 const float* inputVector,
509 unsigned int num_points)
511 unsigned int number = 0;
/* Number of complete 4-point groups handled by the vector loop. */
513 const unsigned int quarterPoints = num_points / 4;
515 const float* inputVectorPtr = (
const float*)inputVector;
516 int16_t* outputVectorPtr = outputVector;
/* Limits of short, held as floats for the clamp. */
518 float min_val = SHRT_MIN;
519 float max_val = SHRT_MAX;
529 for (; number < quarterPoints; number++) {
/* Round each of the four buffered floats and append it to the output. */
537 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
538 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
539 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
540 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: resume at the first point the vector loop did not cover. */
543 number = quarterPoints * 4;
544 for (; number < num_points; number++) {
545 r = inputVector[number] * scalar;
/* Only the min_val branch of the clamp is visible; the leading `if` is absent here. */
548 else if (r < min_val)
/* Round with rintf, then narrow to int16_t. */
550 outputVector[number] = (int16_t)
rintf(r);
556 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable (non-SIMD) kernel in the aligned section of the
 * header, compiled under LV_HAVE_GENERIC. The first signature line is not
 * part of this listing; the cross-reference index at the end of the file
 * places volk_32f_s32f_convert_16i_a_generic at header line 558, directly
 * before this fragment.
 *
 * Visible behaviour: walk both buffers with post-incremented pointers; for
 * each of the num_points inputs, multiply by the scale factor, round with
 * rintf and store as int16_t.
 *
 * NOTE(review): the `scalar` parameter, the declaration of r and the first
 * branch of the clamp are missing from this listing.
 */
559 const float* inputVector,
561 unsigned int num_points)
563 int16_t* outputVectorPtr = outputVector;
564 const float* inputVectorPtr = inputVector;
565 unsigned int number = 0;
/* Limits of short, held as floats for the clamp. */
566 float min_val = SHRT_MIN;
567 float max_val = SHRT_MAX;
570 for (number = 0; number < num_points; number++) {
571 r = *inputVectorPtr++ * scalar;
/*
 * Only the max_val branch of the clamp is visible here, so this function
 * presumably tests the lower bound first (on the absent leading `if`) -
 * confirm against the complete header.
 */
574 else if (r > max_val)
/* Round with rintf, then narrow to int16_t. */
576 *outputVectorPtr++ = (int16_t)
rintf(r);
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:448
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:242
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:391
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:184
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:558
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:126
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:294
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:506
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65