60 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
68 float min_val = INT8_MIN;
69 float max_val = INT8_MAX;
71 *out = (int8_t)(max_val);
72 }
else if (in < min_val) {
73 *out = (int8_t)(min_val);
75 *out = (int8_t)(
rintf(in));
80 #include <immintrin.h>
82 static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
83 const float* inputVector,
85 unsigned int num_points)
87 unsigned int number = 0;
89 const unsigned int thirtysecondPoints = num_points / 32;
91 const float* inputVectorPtr = (
const float*)inputVector;
92 int8_t* outputVectorPtr = outputVector;
94 float min_val = INT8_MIN;
95 float max_val = INT8_MAX;
98 __m256 vScalar = _mm256_set1_ps(scalar);
99 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
100 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
101 __m256 vmin_val = _mm256_set1_ps(min_val);
102 __m256 vmax_val = _mm256_set1_ps(max_val);
105 for (; number < thirtysecondPoints; number++) {
106 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
108 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
110 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
112 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
115 inputVal1 = _mm256_max_ps(
116 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
117 inputVal2 = _mm256_max_ps(
118 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
119 inputVal3 = _mm256_max_ps(
120 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
121 inputVal4 = _mm256_max_ps(
122 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
124 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
125 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
126 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
127 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
129 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
130 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
131 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
132 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
134 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
135 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
137 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
138 outputVectorPtr += 32;
141 number = thirtysecondPoints * 32;
142 for (; number < num_points; number++) {
143 r = inputVector[number] * scalar;
152 #include <emmintrin.h>
155 const float* inputVector,
157 unsigned int num_points)
159 unsigned int number = 0;
161 const unsigned int sixteenthPoints = num_points / 16;
163 const float* inputVectorPtr = (
const float*)inputVector;
164 int8_t* outputVectorPtr = outputVector;
166 float min_val = INT8_MIN;
167 float max_val = INT8_MAX;
171 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
172 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
176 for (; number < sixteenthPoints; number++) {
206 outputVectorPtr += 16;
209 number = sixteenthPoints * 16;
210 for (; number < num_points; number++) {
211 r = inputVector[number] * scalar;
220 #include <xmmintrin.h>
223 const float* inputVector,
225 unsigned int num_points)
227 unsigned int number = 0;
230 const unsigned int quarterPoints = num_points / 4;
232 const float* inputVectorPtr = (
const float*)inputVector;
233 int8_t* outputVectorPtr = outputVector;
235 float min_val = INT8_MIN;
236 float max_val = INT8_MAX;
246 for (; number < quarterPoints; number++) {
253 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
254 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
258 number = quarterPoints * 4;
259 for (; number < num_points; number++) {
260 r = inputVector[number] * scalar;
268 #ifdef LV_HAVE_GENERIC
271 const float* inputVector,
273 unsigned int num_points)
275 const float* inputVectorPtr = inputVector;
276 unsigned int number = 0;
279 for (number = 0; number < num_points; number++) {
280 r = *inputVectorPtr++ * scalar;
289 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
290 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
292 #include <inttypes.h>
297 #include <immintrin.h>
299 static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
300 const float* inputVector,
302 unsigned int num_points)
304 unsigned int number = 0;
306 const unsigned int thirtysecondPoints = num_points / 32;
308 const float* inputVectorPtr = (
const float*)inputVector;
309 int8_t* outputVectorPtr = outputVector;
311 float min_val = INT8_MIN;
312 float max_val = INT8_MAX;
315 __m256 vScalar = _mm256_set1_ps(scalar);
316 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
317 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
318 __m256 vmin_val = _mm256_set1_ps(min_val);
319 __m256 vmax_val = _mm256_set1_ps(max_val);
322 for (; number < thirtysecondPoints; number++) {
323 inputVal1 = _mm256_load_ps(inputVectorPtr);
325 inputVal2 = _mm256_load_ps(inputVectorPtr);
327 inputVal3 = _mm256_load_ps(inputVectorPtr);
329 inputVal4 = _mm256_load_ps(inputVectorPtr);
332 inputVal1 = _mm256_max_ps(
333 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
334 inputVal2 = _mm256_max_ps(
335 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
336 inputVal3 = _mm256_max_ps(
337 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
338 inputVal4 = _mm256_max_ps(
339 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
341 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
342 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
343 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
344 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
346 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
347 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
348 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
349 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
351 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
352 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
354 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
355 outputVectorPtr += 32;
358 number = thirtysecondPoints * 32;
359 for (; number < num_points; number++) {
360 r = inputVector[number] * scalar;
369 #include <emmintrin.h>
372 const float* inputVector,
374 unsigned int num_points)
376 unsigned int number = 0;
378 const unsigned int sixteenthPoints = num_points / 16;
380 const float* inputVectorPtr = (
const float*)inputVector;
381 int8_t* outputVectorPtr = outputVector;
383 float min_val = INT8_MIN;
384 float max_val = INT8_MAX;
388 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
389 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
393 for (; number < sixteenthPoints; number++) {
423 outputVectorPtr += 16;
426 number = sixteenthPoints * 16;
427 for (; number < num_points; number++) {
428 r = inputVector[number] * scalar;
436 #include <xmmintrin.h>
439 const float* inputVector,
441 unsigned int num_points)
443 unsigned int number = 0;
446 const unsigned int quarterPoints = num_points / 4;
448 const float* inputVectorPtr = (
const float*)inputVector;
450 float min_val = INT8_MIN;
451 float max_val = INT8_MAX;
454 int8_t* outputVectorPtr = outputVector;
462 for (; number < quarterPoints; number++) {
469 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
470 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
474 number = quarterPoints * 4;
475 for (; number < num_points; number++) {
476 r = inputVector[number] * scalar;
484 #ifdef LV_HAVE_GENERIC
487 const float* inputVector,
489 unsigned int num_points)
491 const float* inputVectorPtr = inputVector;
492 unsigned int number = 0;
495 for (number = 0; number < num_points; number++) {
496 r = *inputVectorPtr++ * scalar;
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:371
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:66
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:222
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:486
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:438
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:270
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:154
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65