Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
33 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34 #define INCLUDED_volk_32fc_convert_16ic_a_H
35 
36 #include "volk/volk_complex.h"
37 #include <limits.h>
38 #include <math.h>
39 
40 #ifdef LV_HAVE_AVX2
41 #include <immintrin.h>
42 
43 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44  const lv_32fc_t* inputVector,
45  unsigned int num_points)
46 {
47  const unsigned int avx_iters = num_points / 8;
48 
49  float* inputVectorPtr = (float*)inputVector;
50  int16_t* outputVectorPtr = (int16_t*)outputVector;
51  float aux;
52 
53  const float min_val = (float)SHRT_MIN;
54  const float max_val = (float)SHRT_MAX;
55 
56  __m256 inputVal1, inputVal2;
57  __m256i intInputVal1, intInputVal2;
58  __m256 ret1, ret2;
59  const __m256 vmin_val = _mm256_set1_ps(min_val);
60  const __m256 vmax_val = _mm256_set1_ps(max_val);
61  unsigned int i;
62 
63  for (i = 0; i < avx_iters; i++) {
64  inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65  inputVectorPtr += 8;
66  inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67  inputVectorPtr += 8;
68  __VOLK_PREFETCH(inputVectorPtr + 16);
69 
70  // Clip
71  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73 
74  intInputVal1 = _mm256_cvtps_epi32(ret1);
75  intInputVal2 = _mm256_cvtps_epi32(ret2);
76 
77  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79 
80  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81  outputVectorPtr += 16;
82  }
83 
84  for (i = avx_iters * 16; i < num_points * 2; i++) {
85  aux = *inputVectorPtr++;
86  if (aux > max_val)
87  aux = max_val;
88  else if (aux < min_val)
89  aux = min_val;
90  *outputVectorPtr++ = (int16_t)rintf(aux);
91  }
92 }
93 #endif /* LV_HAVE_AVX2 */
94 
95 #ifdef LV_HAVE_SSE2
96 #include <emmintrin.h>
97 
98 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
99  const lv_32fc_t* inputVector,
100  unsigned int num_points)
101 {
102  const unsigned int sse_iters = num_points / 4;
103 
104  float* inputVectorPtr = (float*)inputVector;
105  int16_t* outputVectorPtr = (int16_t*)outputVector;
106  float aux;
107 
108  const float min_val = (float)SHRT_MIN;
109  const float max_val = (float)SHRT_MAX;
110 
111  __m128 inputVal1, inputVal2;
112  __m128i intInputVal1, intInputVal2;
113  __m128 ret1, ret2;
114  const __m128 vmin_val = _mm_set_ps1(min_val);
115  const __m128 vmax_val = _mm_set_ps1(max_val);
116  unsigned int i;
117 
118  for (i = 0; i < sse_iters; i++) {
119  inputVal1 = _mm_load_ps((float*)inputVectorPtr);
120  inputVectorPtr += 4;
121  inputVal2 = _mm_load_ps((float*)inputVectorPtr);
122  inputVectorPtr += 4;
123  __VOLK_PREFETCH(inputVectorPtr + 8);
124 
125  // Clip
126  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
127  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
128 
129  intInputVal1 = _mm_cvtps_epi32(ret1);
130  intInputVal2 = _mm_cvtps_epi32(ret2);
131 
132  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
133 
134  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
135  outputVectorPtr += 8;
136  }
137 
138  for (i = sse_iters * 8; i < num_points * 2; i++) {
139  aux = *inputVectorPtr++;
140  if (aux > max_val)
141  aux = max_val;
142  else if (aux < min_val)
143  aux = min_val;
144  *outputVectorPtr++ = (int16_t)rintf(aux);
145  }
146 }
147 #endif /* LV_HAVE_SSE2 */
148 
149 
150 #if LV_HAVE_NEONV7
151 #include <arm_neon.h>
152 
/*
 * Converts the four float lanes of `value` into the four int32 lanes of
 * `result`, issuing one VCVTR.S32.F32 instruction per lane. Wrapped in
 * do { } while (0) so the expansion behaves as a single statement.
 */
#define VCVTRQ_S32_F32(result, value)                                               \
    do {                                                                            \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[0]) : "t"((value)[0]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[1]) : "t"((value)[1]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[2]) : "t"((value)[2]) :); \
        __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"((result)[3]) : "t"((value)[3]) :); \
    } while (0)

/*!
 * \brief Converts complex 32-bit floats to complex 16-bit integers with
 *        saturation (ARMv7 NEON).
 *
 * Both buffers are walked as 2 * num_points interleaved scalar components.
 * Every component is clipped to [SHRT_MIN, SHRT_MAX] and then converted to an
 * integer.
 *
 * \param outputVector Destination for num_points complex int16 samples.
 * \param inputVector  Source of num_points complex float samples.
 * \param num_points   Number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                               const lv_32fc_t* inputVector,
                                               unsigned int num_points)
{
    /* One vector iteration consumes 8 floats, i.e. 4 complex points. */
    const unsigned int neon_iters = num_points / 4;
    /* Components left for the scalar tail; derived from the remainder so the
     * bound never needs num_points * 2, which can wrap for huge inputs. */
    const unsigned int tail_count = (num_points % 4) * 2;

    /* The input is only read, so keep its const qualifier. */
    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);

    float32x4_t ret1, ret2, a, b;
    /* Initialised because the conversion macro writes one lane at a time. */
    int32x4_t toint_a = { 0, 0, 0, 0 };
    int32x4_t toint_b = { 0, 0, 0, 0 };
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;
    float32_t aux;
    unsigned int i;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        __VOLK_PREFETCH(inputVectorPtr + 8);

        // Clip
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        // vcvtr takes into account the current rounding mode (as does rintf)
        VCVTRQ_S32_F32(toint_a, ret1);
        VCVTRQ_S32_F32(toint_b, ret2);

        /* Narrow each int32 quad to int16 with saturation, then join them. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    for (i = 0; i < tail_count; i++) {
        aux = *inputVectorPtr++;
        /* NOTE(review): a NaN passes both comparisons unchanged, and casting
         * NaN to int16_t is undefined in C - confirm callers never pass NaN. */
        if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}

#undef VCVTRQ_S32_F32
216 #endif /* LV_HAVE_NEONV7 */
217 
218 #if LV_HAVE_NEONV8
219 #include <arm_neon.h>
220 
/*!
 * \brief Converts complex 32-bit floats to complex 16-bit integers with
 *        saturation (ARMv8 NEON).
 *
 * Both buffers are walked as 2 * num_points interleaved scalar components.
 * Every component is clipped to [SHRT_MIN, SHRT_MAX] and then converted to an
 * integer.
 *
 * \param outputVector Destination for num_points complex int16 samples.
 * \param inputVector  Source of num_points complex float samples.
 * \param num_points   Number of complex samples to convert.
 */
static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    /* One vector iteration consumes 8 floats, i.e. 4 complex points. */
    const unsigned int neon_iters = num_points / 4;
    /* Components left for the scalar tail; derived from the remainder so the
     * bound never needs num_points * 2, which can wrap for huge inputs. */
    const unsigned int tail_count = (num_points % 4) * 2;

    /* The input is only read, so keep its const qualifier. */
    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);

    float32x4_t ret1, ret2, a, b;
    int32x4_t toint_a, toint_b;
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;
    float32_t aux;
    unsigned int i;

    for (i = 0; i < neon_iters; i++) {
        a = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr);
        inputVectorPtr += 4;
        __VOLK_PREFETCH(inputVectorPtr + 8);

        // Clip
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        // vrndiq takes into account the current rounding mode (as does rintf)
        toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
        toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));

        /* Narrow each int32 quad to int16 with saturation, then join them. */
        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    for (i = 0; i < tail_count; i++) {
        aux = *inputVectorPtr++;
        /* NOTE(review): a NaN passes both comparisons unchanged, and casting
         * NaN to int16_t is undefined in C - confirm callers never pass NaN. */
        if (aux > max_val_f)
            aux = max_val_f;
        else if (aux < min_val_f)
            aux = min_val_f;
        *outputVectorPtr++ = (int16_t)rintf(aux);
    }
}
274 #endif /* LV_HAVE_NEONV8 */
275 
276 
277 #ifdef LV_HAVE_GENERIC
278 
279 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
280  const lv_32fc_t* inputVector,
281  unsigned int num_points)
282 {
283  float* inputVectorPtr = (float*)inputVector;
284  int16_t* outputVectorPtr = (int16_t*)outputVector;
285  const float min_val = (float)SHRT_MIN;
286  const float max_val = (float)SHRT_MAX;
287  float aux;
288  unsigned int i;
289  for (i = 0; i < num_points * 2; i++) {
290  aux = *inputVectorPtr++;
291  if (aux > max_val)
292  aux = max_val;
293  else if (aux < min_val)
294  aux = min_val;
295  *outputVectorPtr++ = (int16_t)rintf(aux);
296  }
297 }
298 #endif /* LV_HAVE_GENERIC */
299 
300 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
301 
302 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303 #define INCLUDED_volk_32fc_convert_16ic_u_H
304 
305 #include "volk/volk_complex.h"
306 #include <limits.h>
307 #include <math.h>
308 
309 
310 #ifdef LV_HAVE_AVX2
311 #include <immintrin.h>
312 
313 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
314  const lv_32fc_t* inputVector,
315  unsigned int num_points)
316 {
317  const unsigned int avx_iters = num_points / 8;
318 
319  float* inputVectorPtr = (float*)inputVector;
320  int16_t* outputVectorPtr = (int16_t*)outputVector;
321  float aux;
322 
323  const float min_val = (float)SHRT_MIN;
324  const float max_val = (float)SHRT_MAX;
325 
326  __m256 inputVal1, inputVal2;
327  __m256i intInputVal1, intInputVal2;
328  __m256 ret1, ret2;
329  const __m256 vmin_val = _mm256_set1_ps(min_val);
330  const __m256 vmax_val = _mm256_set1_ps(max_val);
331  unsigned int i;
332 
333  for (i = 0; i < avx_iters; i++) {
334  inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
335  inputVectorPtr += 8;
336  inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
337  inputVectorPtr += 8;
338  __VOLK_PREFETCH(inputVectorPtr + 16);
339 
340  // Clip
341  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
343 
344  intInputVal1 = _mm256_cvtps_epi32(ret1);
345  intInputVal2 = _mm256_cvtps_epi32(ret2);
346 
347  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
349 
350  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351  outputVectorPtr += 16;
352  }
353 
354  for (i = avx_iters * 16; i < num_points * 2; i++) {
355  aux = *inputVectorPtr++;
356  if (aux > max_val)
357  aux = max_val;
358  else if (aux < min_val)
359  aux = min_val;
360  *outputVectorPtr++ = (int16_t)rintf(aux);
361  }
362 }
363 #endif /* LV_HAVE_AVX2 */
364 
365 
366 #ifdef LV_HAVE_SSE2
367 #include <emmintrin.h>
368 
369 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
370  const lv_32fc_t* inputVector,
371  unsigned int num_points)
372 {
373  const unsigned int sse_iters = num_points / 4;
374 
375  float* inputVectorPtr = (float*)inputVector;
376  int16_t* outputVectorPtr = (int16_t*)outputVector;
377  float aux;
378 
379  const float min_val = (float)SHRT_MIN;
380  const float max_val = (float)SHRT_MAX;
381 
382  __m128 inputVal1, inputVal2;
383  __m128i intInputVal1, intInputVal2;
384  __m128 ret1, ret2;
385  const __m128 vmin_val = _mm_set_ps1(min_val);
386  const __m128 vmax_val = _mm_set_ps1(max_val);
387 
388  unsigned int i;
389  for (i = 0; i < sse_iters; i++) {
390  inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
391  inputVectorPtr += 4;
392  inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
393  inputVectorPtr += 4;
394  __VOLK_PREFETCH(inputVectorPtr + 8);
395 
396  // Clip
397  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
398  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
399 
400  intInputVal1 = _mm_cvtps_epi32(ret1);
401  intInputVal2 = _mm_cvtps_epi32(ret2);
402 
403  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
404 
405  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
406  outputVectorPtr += 8;
407  }
408 
409  for (i = sse_iters * 8; i < num_points * 2; i++) {
410  aux = *inputVectorPtr++;
411  if (aux > max_val)
412  aux = max_val;
413  else if (aux < min_val)
414  aux = min_val;
415  *outputVectorPtr++ = (int16_t)rintf(aux);
416  }
417 }
418 #endif /* LV_HAVE_SSE2 */
419 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:98
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:369
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:279
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74
short complex lv_16sc_t
Definition: volk_complex.h:71
for i
Definition: volk_config_fixed.tmpl.h:13