33 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34 #define INCLUDED_volk_32fc_convert_16ic_a_H
41 #include <immintrin.h>
/*
 * volk_32fc_convert_16ic_a_avx2
 *
 * AVX2 kernel: clamps 32-bit float samples to the int16 range, converts them
 * to 16-bit integers and writes them to outputVector. It uses
 * _mm256_load_ps / _mm256_store_si256 for its memory accesses.
 *
 * num_points is the number of points; the scalar tail below runs to
 * num_points * 2, i.e. two floats are consumed per point.
 *
 * NOTE(review): this text is a numbered source listing with lines missing
 * (listing numbers jump, e.g. 43 -> 45, 64 -> 66 -> 71, 85 -> 88 -> 90). The
 * input parameter, the braces and the declarations of i, aux, ret1 and ret2
 * are not visible here. The comments below describe only the visible lines.
 */
43 static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
45 unsigned int num_points)
/* One vector iteration per 8 points (16 floats in, 16 int16 values out). */
47 const unsigned int avx_iters = num_points / 8;
/* Walk the input as plain floats and the output as plain int16 values. */
49 float* inputVectorPtr = (
float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
/* Broadcast the bounds to all lanes for the vector clamp. */
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
63 for (
i = 0;
i < avx_iters;
i++) {
/*
 * NOTE(review): as shown, both loads read from the same inputVectorPtr.
 * Listing lines 65 and 67 are missing; presumably they advance the pointer
 * between and after the loads - confirm against the full source.
 */
64 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr);
66 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr);
/* Clamp every lane to [vmin_val, vmax_val]. */
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * Narrow the two int32 vectors into one int16 vector. _mm256_packs_epi32
 * packs within each 128-bit half, so the 64-bit quarters come out in the
 * order 0,2,1,3; the 0xd8 permute selects quarters 0,2,1,3 again, which
 * restores the input order.
 */
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
/* Store 16 int16 values and step the output pointer past them. */
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * avx_iters * 16 up to (but not including) num_points * 2.
 */
84 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
85 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments (listing lines 86, 87, 89) are missing.
 */
88 else if (aux < min_val)
/* Round with rintf() and store the result as int16_t. */
90 *outputVectorPtr++ = (int16_t)
rintf(aux);
96 #include <emmintrin.h>
/*
 * Fragment of an SSE2 conversion kernel (float samples to int16).
 *
 * NOTE(review): the start of the signature is not visible. The listing's
 * cross-reference places volk_32fc_convert_16ic_a_sse2 at listing line 98, so
 * this is presumably the tail of that function - confirm against the full
 * source. The vector loop body (listing lines 119-134) and the declarations
 * of i and aux are also missing.
 */
100 unsigned int num_points)
/* One vector iteration per 4 points. */
102 const unsigned int sse_iters = num_points / 4;
/* Walk the input as plain floats and the output as plain int16 values. */
104 float* inputVectorPtr = (
float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
111 __m128 inputVal1, inputVal2;
112 __m128i intInputVal1, intInputVal2;
118 for (
i = 0;
i < sse_iters;
i++) {
/* Each vector iteration advances the output by 8 int16 values. */
135 outputVectorPtr += 8;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * sse_iters * 8 up to (but not including) num_points * 2.
 */
138 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
139 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments are missing from this listing.
 */
142 else if (aux < min_val)
/* Round with rintf() and store the result as int16_t. */
144 *outputVectorPtr++ = (int16_t)
rintf(aux);
151 #include <arm_neon.h>
/*
 * VCVTRQ_S32_F32(result, value)
 *
 * Converts lanes 0..3 of `value` into lanes 0..3 of `result`, issuing one
 * "VCVTR.S32.F32" instruction per lane through __VOLK_ASM.
 *
 * NOTE(review): the expansion is four separate statements and already ends
 * with a semicolon; it is not wrapped in do { ... } while (0). Used as the
 * body of an unbraced if/else or loop, only the first statement would be
 * controlled. The arguments are also subscripted without parentheses, so
 * pass plain identifiers only.
 */
153 #define VCVTRQ_S32_F32(result, value) \
154 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
155 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
156 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
157 __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
/*
 * volk_32fc_convert_16ic_neon
 *
 * NEON kernel: clamps 32-bit float samples to the int16 range, converts them
 * to 16-bit integers and writes them to outputVector. The float-to-int32 step
 * goes through the VCVTRQ_S32_F32 macro.
 *
 * NOTE(review): this text is a numbered source listing with lines missing
 * (e.g. 160, 185, 187, 207-208, 210). The input parameter, the braces and the
 * declarations of i, aux and res are not visible here. The comments below
 * describe only the visible lines.
 */
159 static inline void volk_32fc_convert_16ic_neon(
lv_16sc_t* outputVector,
161 unsigned int num_points)
/* One vector iteration per 4 points (8 floats in, 8 int16 values out). */
164 const unsigned int neon_iters = num_points / 4;
/* Walk the input as plain floats and the output as plain int16 values. */
166 float32_t* inputVectorPtr = (float32_t*)inputVector;
167 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
169 const float min_val_f = (float)SHRT_MIN;
170 const float max_val_f = (float)SHRT_MAX;
/* The same bounds broadcast to all four lanes for the vector clamp. */
174 const float32x4_t min_val = vmovq_n_f32(min_val_f);
175 const float32x4_t max_val = vmovq_n_f32(max_val_f);
176 float32x4_t ret1, ret2, a, b;
/* Conversion targets; written lane by lane by VCVTRQ_S32_F32 below. */
178 int32x4_t toint_a = { 0, 0, 0, 0 };
179 int32x4_t toint_b = { 0, 0, 0, 0 };
180 int16x4_t intInputVal1, intInputVal2;
183 for (
i = 0;
i < neon_iters;
i++) {
/*
 * NOTE(review): as shown, both loads read from the same inputVectorPtr.
 * Listing lines 185 and 187 are missing; presumably they advance the pointer
 * between and after the loads - confirm against the full source.
 */
184 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
186 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* Clamp every lane to [min_val, max_val]. */
190 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
191 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* Convert the clamped floats to int32, one VCVTR instruction per lane. */
194 VCVTRQ_S32_F32(toint_a, ret1);
195 VCVTRQ_S32_F32(toint_b, ret2);
/* Narrow each int32x4 to int16x4 with saturation. */
197 intInputVal1 = vqmovn_s32(toint_a);
198 intInputVal2 = vqmovn_s32(toint_b);
/* Join the two halves and store 8 int16 values. */
200 res = vcombine_s16(intInputVal1, intInputVal2);
201 vst1q_s16((int16_t*)outputVectorPtr, res);
202 outputVectorPtr += 8;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * neon_iters * 8 up to (but not including) num_points * 2.
 */
205 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
206 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments are missing from this listing.
 */
209 else if (aux < min_val_f)
/* Round with rintf() and store the result as int16_t. */
211 *outputVectorPtr++ = (int16_t)
rintf(aux);
215 #undef VCVTRQ_S32_F32
219 #include <arm_neon.h>
/*
 * volk_32fc_convert_16ic_neonv8
 *
 * NEON kernel using intrinsics only: clamps 32-bit float samples to the int16
 * range, rounds them with vrndiq_f32, converts with vcvtq_s32_f32 and writes
 * 16-bit integers to outputVector.
 *
 * NOTE(review): this text is a numbered source listing with lines missing
 * (e.g. 222, 245, 247, 267-268, 270). The input parameter, the braces and the
 * declarations of i, aux and res are not visible here. The comments below
 * describe only the visible lines.
 */
221 static inline void volk_32fc_convert_16ic_neonv8(
lv_16sc_t* outputVector,
223 unsigned int num_points)
/* One vector iteration per 4 points (8 floats in, 8 int16 values out). */
225 const unsigned int neon_iters = num_points / 4;
/* Walk the input as plain floats and the output as plain int16 values. */
227 float32_t* inputVectorPtr = (float32_t*)inputVector;
228 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
230 const float min_val_f = (float)SHRT_MIN;
231 const float max_val_f = (float)SHRT_MAX;
/* The same bounds broadcast to all four lanes for the vector clamp. */
235 const float32x4_t min_val = vmovq_n_f32(min_val_f);
236 const float32x4_t max_val = vmovq_n_f32(max_val_f);
237 float32x4_t ret1, ret2, a, b;
239 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
240 int16x4_t intInputVal1, intInputVal2;
243 for (
i = 0;
i < neon_iters;
i++) {
/*
 * NOTE(review): as shown, both loads read from the same inputVectorPtr.
 * Listing lines 245 and 247 are missing; presumably they advance the pointer
 * between and after the loads - confirm against the full source.
 */
244 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
246 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
/* Clamp every lane to [min_val, max_val]. */
250 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
251 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/*
 * Round to an integral float first, then convert to int32. Presumably the
 * explicit rounding step keeps the result in line with the rintf() used in
 * the scalar tail - confirm.
 */
254 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
255 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
/* Narrow each int32x4 to int16x4 with saturation. */
257 intInputVal1 = vqmovn_s32(toint_a);
258 intInputVal2 = vqmovn_s32(toint_b);
/* Join the two halves and store 8 int16 values. */
260 res = vcombine_s16(intInputVal1, intInputVal2);
261 vst1q_s16((int16_t*)outputVectorPtr, res);
262 outputVectorPtr += 8;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * neon_iters * 8 up to (but not including) num_points * 2.
 */
265 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
266 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments are missing from this listing.
 */
269 else if (aux < min_val_f)
/* Round with rintf() and store the result as int16_t. */
271 *outputVectorPtr++ = (int16_t)
rintf(aux);
277 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable scalar conversion kernel (float samples to int16).
 *
 * NOTE(review): the start of the signature is not visible. The listing's
 * cross-reference places volk_32fc_convert_16ic_generic at listing line 279,
 * so this is presumably the tail of that function - confirm against the full
 * source. The braces and the declarations of i and aux are also missing.
 */
281 unsigned int num_points)
/* Walk the input as plain floats and the output as plain int16 values. */
283 float* inputVectorPtr = (
float*)inputVector;
284 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
285 const float min_val = (float)SHRT_MIN;
286 const float max_val = (float)SHRT_MAX;
/* Visit every float: two per point, hence the num_points * 2 bound. */
289 for (
i = 0;
i < num_points * 2;
i++) {
290 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments are missing from this listing.
 */
293 else if (aux < min_val)
/* Round with rintf() and store the result as int16_t. */
295 *outputVectorPtr++ = (int16_t)
rintf(aux);
302 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303 #define INCLUDED_volk_32fc_convert_16ic_u_H
311 #include <immintrin.h>
/*
 * volk_32fc_convert_16ic_u_avx2
 *
 * AVX2 kernel: clamps 32-bit float samples to the int16 range, converts them
 * to 16-bit integers and writes them to outputVector. It uses the unaligned
 * access intrinsics _mm256_loadu_ps / _mm256_storeu_si256.
 *
 * num_points is the number of points; the scalar tail below runs to
 * num_points * 2, i.e. two floats are consumed per point.
 *
 * NOTE(review): this text is a numbered source listing with lines missing
 * (listing numbers jump, e.g. 313 -> 315, 334 -> 336 -> 341, 355 -> 358 ->
 * 360). The input parameter, the braces and the declarations of i, aux, ret1
 * and ret2 are not visible here. The comments below describe only the
 * visible lines.
 */
313 static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
315 unsigned int num_points)
/* One vector iteration per 8 points (16 floats in, 16 int16 values out). */
317 const unsigned int avx_iters = num_points / 8;
/* Walk the input as plain floats and the output as plain int16 values. */
319 float* inputVectorPtr = (
float*)inputVector;
320 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
323 const float min_val = (float)SHRT_MIN;
324 const float max_val = (float)SHRT_MAX;
326 __m256 inputVal1, inputVal2;
327 __m256i intInputVal1, intInputVal2;
/* Broadcast the bounds to all lanes for the vector clamp. */
329 const __m256 vmin_val = _mm256_set1_ps(min_val);
330 const __m256 vmax_val = _mm256_set1_ps(max_val);
333 for (
i = 0;
i < avx_iters;
i++) {
/*
 * NOTE(review): as shown, both loads read from the same inputVectorPtr.
 * Listing lines 335 and 337 are missing; presumably they advance the pointer
 * between and after the loads - confirm against the full source.
 */
334 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr);
336 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr);
/* Clamp every lane to [vmin_val, vmax_val]. */
341 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
344 intInputVal1 = _mm256_cvtps_epi32(ret1);
345 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * Narrow the two int32 vectors into one int16 vector. _mm256_packs_epi32
 * packs within each 128-bit half, so the 64-bit quarters come out in the
 * order 0,2,1,3; the 0xd8 permute selects quarters 0,2,1,3 again, which
 * restores the input order.
 */
347 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
/* Store 16 int16 values and step the output pointer past them. */
350 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351 outputVectorPtr += 16;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * avx_iters * 16 up to (but not including) num_points * 2.
 */
354 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
355 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments (listing lines 356, 357, 359) are missing.
 */
358 else if (aux < min_val)
/* Round with rintf() and store the result as int16_t. */
360 *outputVectorPtr++ = (int16_t)
rintf(aux);
367 #include <emmintrin.h>
/*
 * Fragment of an SSE2 conversion kernel (float samples to int16).
 *
 * NOTE(review): the start of the signature is not visible. The listing's
 * cross-reference places volk_32fc_convert_16ic_u_sse2 at listing line 369,
 * so this is presumably the tail of that function - confirm against the full
 * source. The vector loop body (listing lines 390-405) and the declarations
 * of i and aux are also missing.
 */
371 unsigned int num_points)
/* One vector iteration per 4 points. */
373 const unsigned int sse_iters = num_points / 4;
/* Walk the input as plain floats and the output as plain int16 values. */
375 float* inputVectorPtr = (
float*)inputVector;
376 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation bounds: the short range expressed as floats. */
379 const float min_val = (float)SHRT_MIN;
380 const float max_val = (float)SHRT_MAX;
382 __m128 inputVal1, inputVal2;
383 __m128i intInputVal1, intInputVal2;
389 for (
i = 0;
i < sse_iters;
i++) {
/* Each vector iteration advances the output by 8 int16 values. */
406 outputVectorPtr += 8;
/*
 * Scalar tail: handles the floats not covered by the vector loop, from index
 * sse_iters * 8 up to (but not including) num_points * 2.
 */
409 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
410 aux = *inputVectorPtr++;
/*
 * NOTE(review): only the lower-bound branch of the clamp is visible; the
 * leading `if` and both assignments are missing from this listing.
 */
413 else if (aux < min_val)
/* Round with rintf() and store the result as int16_t. */
415 *outputVectorPtr++ = (int16_t)
rintf(aux);
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:98
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:369
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:279
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74
short complex lv_16sc_t
Definition: volk_complex.h:71
for i
Definition: volk_config_fixed.tmpl.h:13