Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
50 static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
51  const int16_t* inputVector,
52  const float scalar,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const unsigned int eighthPoints = num_points / 8;
57 
58  float* outputVectorPtr = outputVector;
59  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
60  int16_t* inputPtr = (int16_t*)inputVector;
61  __m128i inputVal;
62  __m256i inputVal2;
63  __m256 ret;
64 
65  for (; number < eighthPoints; number++) {
66 
67  // Load the 8 values
68  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
69 
70  // Convert
71  inputVal2 = _mm256_cvtepi16_epi32(inputVal);
72 
73  ret = _mm256_cvtepi32_ps(inputVal2);
74  ret = _mm256_mul_ps(ret, invScalar);
75 
76  _mm256_storeu_ps(outputVectorPtr, ret);
77 
78  outputVectorPtr += 8;
79 
80  inputPtr += 8;
81  }
82 
83  number = eighthPoints * 8;
84  for (; number < num_points; number++) {
85  outputVector[number] = ((float)(inputVector[number])) / scalar;
86  }
87 }
88 #endif /* LV_HAVE_AVX2 */
89 
90 #ifdef LV_HAVE_AVX
91 #include <immintrin.h>
92 
93 static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
94  const int16_t* inputVector,
95  const float scalar,
96  unsigned int num_points)
97 {
98  unsigned int number = 0;
99  const unsigned int eighthPoints = num_points / 8;
100 
101  float* outputVectorPtr = outputVector;
102  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
103  int16_t* inputPtr = (int16_t*)inputVector;
104  __m128i inputVal, inputVal2;
105  __m128 ret;
106  __m256 output;
107  __m256 dummy = _mm256_setzero_ps();
108 
109  for (; number < eighthPoints; number++) {
110 
111  // Load the 8 values
112  // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
113  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
114 
115  // Shift the input data to the right by 64 bits ( 8 bytes )
116  inputVal2 = _mm_srli_si128(inputVal, 8);
117 
118  // Convert the lower 4 values into 32 bit words
119  inputVal = _mm_cvtepi16_epi32(inputVal);
120  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
121 
122  ret = _mm_cvtepi32_ps(inputVal);
123  ret = _mm_mul_ps(ret, invScalar);
124  output = _mm256_insertf128_ps(dummy, ret, 0);
125 
126  ret = _mm_cvtepi32_ps(inputVal2);
127  ret = _mm_mul_ps(ret, invScalar);
128  output = _mm256_insertf128_ps(output, ret, 1);
129 
130  _mm256_storeu_ps(outputVectorPtr, output);
131 
132  outputVectorPtr += 8;
133 
134  inputPtr += 8;
135  }
136 
137  number = eighthPoints * 8;
138  for (; number < num_points; number++) {
139  outputVector[number] = ((float)(inputVector[number])) / scalar;
140  }
141 }
142 #endif /* LV_HAVE_AVX */
143 
144 #ifdef LV_HAVE_SSE4_1
145 #include <smmintrin.h>
146 
147 static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
148  const int16_t* inputVector,
149  const float scalar,
150  unsigned int num_points)
151 {
152  unsigned int number = 0;
153  const unsigned int eighthPoints = num_points / 8;
154 
155  float* outputVectorPtr = outputVector;
156  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157  int16_t* inputPtr = (int16_t*)inputVector;
158  __m128i inputVal;
159  __m128i inputVal2;
160  __m128 ret;
161 
162  for (; number < eighthPoints; number++) {
163 
164  // Load the 8 values
165  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
166 
167  // Shift the input data to the right by 64 bits ( 8 bytes )
168  inputVal2 = _mm_srli_si128(inputVal, 8);
169 
170  // Convert the lower 4 values into 32 bit words
171  inputVal = _mm_cvtepi16_epi32(inputVal);
172  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
173 
174  ret = _mm_cvtepi32_ps(inputVal);
175  ret = _mm_mul_ps(ret, invScalar);
176  _mm_storeu_ps(outputVectorPtr, ret);
177  outputVectorPtr += 4;
178 
179  ret = _mm_cvtepi32_ps(inputVal2);
180  ret = _mm_mul_ps(ret, invScalar);
181  _mm_storeu_ps(outputVectorPtr, ret);
182 
183  outputVectorPtr += 4;
184 
185  inputPtr += 8;
186  }
187 
188  number = eighthPoints * 8;
189  for (; number < num_points; number++) {
190  outputVector[number] = ((float)(inputVector[number])) / scalar;
191  }
192 }
193 #endif /* LV_HAVE_SSE4_1 */
194 
195 #ifdef LV_HAVE_SSE
196 #include <xmmintrin.h>
197 
/*!
 * \brief Convert int16 samples to float and scale them by 1/scalar (SSE, unaligned).
 *
 * The integer-to-float conversion is done per element in scalar code; only
 * the multiplication by the reciprocal and the store handle four values at
 * once. The remaining (num_points % 4) samples are divided by scalar one at
 * a time.
 *
 * \param outputVector destination for num_points floats (written with unaligned stores)
 * \param inputVector  num_points int16_t samples
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 4;
    /* Reciprocal is formed in double precision and narrowed to float. */
    const __m128 reciprocal = _mm_set1_ps(1.0 / scalar);
    const int16_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* _mm_setr_ps takes its arguments in memory order (lane 0 first). */
        const __m128 converted = _mm_setr_ps(
            (float)(src[0]), (float)(src[1]), (float)(src[2]), (float)(src[3]));

        _mm_storeu_ps(dst, _mm_mul_ps(converted, reciprocal));

        src += 4;
        dst += 4;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    for (unsigned int idx = num_blocks * 4; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
229 #endif /* LV_HAVE_SSE */
230 
231 #ifdef LV_HAVE_GENERIC
232 
/*!
 * \brief Convert int16 samples to float and divide each by scalar (portable C).
 *
 * \param outputVector destination for num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
246 #endif /* LV_HAVE_GENERIC */
247 
248 #ifdef LV_HAVE_NEON
249 #include <arm_neon.h>
250 
/*!
 * \brief Convert int16 samples to float and scale them by 1/scalar (ARM NEON).
 *
 * Complete groups of eight samples are processed with NEON and multiplied by
 * a reciprocal computed once up front; the remaining (num_points % 8) samples
 * are converted one at a time and divided by scalar.
 *
 * \param outputVector destination for num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 8;
    /* Reciprocal is formed in double precision and narrowed to float. */
    const float32x4_t reciprocal = vdupq_n_f32(1.0 / scalar);
    const int16_t* src = inputVector;
    float* dst = outputVector;

    /* The two-way structure load splits the eight samples across two 64-bit
     * lanes and the matching two-way store at the end puts them back in
     * their original order. The original author's note reports this lane
     * form as faster than a plain vld1_s16, though still slower than the
     * code the compiler generates for the generic kernel. */
    for (unsigned int block = 0; block < num_blocks; block++) {
        const int16x4x2_t lanes16 = vld2_s16(src);
        float32x4x2_t scaled;

        /* Widen 16-bit -> 32-bit, convert to float, then scale. */
        scaled.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(lanes16.val[0])), reciprocal);
        scaled.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(lanes16.val[1])), reciprocal);

        vst2q_f32(dst, scaled);

        src += 8;
        dst += 8;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    for (unsigned int idx = num_blocks * 8; idx < num_points; idx++) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
292 #endif /* LV_HAVE_NEON */
293 
294 
295 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
296 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
297 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
298 
299 #include <inttypes.h>
300 #include <stdio.h>
301 
302 #ifdef LV_HAVE_AVX2
303 #include <immintrin.h>
304 
305 static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
306  const int16_t* inputVector,
307  const float scalar,
308  unsigned int num_points)
309 {
310  unsigned int number = 0;
311  const unsigned int eighthPoints = num_points / 8;
312 
313  float* outputVectorPtr = outputVector;
314  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
315  int16_t* inputPtr = (int16_t*)inputVector;
316  __m128i inputVal;
317  __m256i inputVal2;
318  __m256 ret;
319 
320  for (; number < eighthPoints; number++) {
321 
322  // Load the 8 values
323  inputVal = _mm_load_si128((__m128i*)inputPtr);
324 
325  // Convert
326  inputVal2 = _mm256_cvtepi16_epi32(inputVal);
327 
328  ret = _mm256_cvtepi32_ps(inputVal2);
329  ret = _mm256_mul_ps(ret, invScalar);
330 
331  _mm256_store_ps(outputVectorPtr, ret);
332 
333  outputVectorPtr += 8;
334 
335  inputPtr += 8;
336  }
337 
338  number = eighthPoints * 8;
339  for (; number < num_points; number++) {
340  outputVector[number] = ((float)(inputVector[number])) / scalar;
341  }
342 }
343 #endif /* LV_HAVE_AVX2 */
344 
345 #ifdef LV_HAVE_AVX
346 #include <immintrin.h>
347 
348 static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
349  const int16_t* inputVector,
350  const float scalar,
351  unsigned int num_points)
352 {
353  unsigned int number = 0;
354  const unsigned int eighthPoints = num_points / 8;
355 
356  float* outputVectorPtr = outputVector;
357  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
358  int16_t* inputPtr = (int16_t*)inputVector;
359  __m128i inputVal, inputVal2;
360  __m128 ret;
361  __m256 output;
362  __m256 dummy = _mm256_setzero_ps();
363 
364  for (; number < eighthPoints; number++) {
365 
366  // Load the 8 values
367  // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
368  inputVal = _mm_load_si128((__m128i*)inputPtr);
369 
370  // Shift the input data to the right by 64 bits ( 8 bytes )
371  inputVal2 = _mm_srli_si128(inputVal, 8);
372 
373  // Convert the lower 4 values into 32 bit words
374  inputVal = _mm_cvtepi16_epi32(inputVal);
375  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
376 
377  ret = _mm_cvtepi32_ps(inputVal);
378  ret = _mm_mul_ps(ret, invScalar);
379  output = _mm256_insertf128_ps(dummy, ret, 0);
380 
381  ret = _mm_cvtepi32_ps(inputVal2);
382  ret = _mm_mul_ps(ret, invScalar);
383  output = _mm256_insertf128_ps(output, ret, 1);
384 
385  _mm256_store_ps(outputVectorPtr, output);
386 
387  outputVectorPtr += 8;
388 
389  inputPtr += 8;
390  }
391 
392  number = eighthPoints * 8;
393  for (; number < num_points; number++) {
394  outputVector[number] = ((float)(inputVector[number])) / scalar;
395  }
396 }
397 #endif /* LV_HAVE_AVX */
398 
399 #ifdef LV_HAVE_SSE4_1
400 #include <smmintrin.h>
401 
402 static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
403  const int16_t* inputVector,
404  const float scalar,
405  unsigned int num_points)
406 {
407  unsigned int number = 0;
408  const unsigned int eighthPoints = num_points / 8;
409 
410  float* outputVectorPtr = outputVector;
411  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
412  int16_t* inputPtr = (int16_t*)inputVector;
413  __m128i inputVal;
414  __m128i inputVal2;
415  __m128 ret;
416 
417  for (; number < eighthPoints; number++) {
418 
419  // Load the 8 values
420  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
421 
422  // Shift the input data to the right by 64 bits ( 8 bytes )
423  inputVal2 = _mm_srli_si128(inputVal, 8);
424 
425  // Convert the lower 4 values into 32 bit words
426  inputVal = _mm_cvtepi16_epi32(inputVal);
427  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
428 
429  ret = _mm_cvtepi32_ps(inputVal);
430  ret = _mm_mul_ps(ret, invScalar);
431  _mm_storeu_ps(outputVectorPtr, ret);
432  outputVectorPtr += 4;
433 
434  ret = _mm_cvtepi32_ps(inputVal2);
435  ret = _mm_mul_ps(ret, invScalar);
436  _mm_storeu_ps(outputVectorPtr, ret);
437 
438  outputVectorPtr += 4;
439 
440  inputPtr += 8;
441  }
442 
443  number = eighthPoints * 8;
444  for (; number < num_points; number++) {
445  outputVector[number] = ((float)(inputVector[number])) / scalar;
446  }
447 }
448 #endif /* LV_HAVE_SSE4_1 */
449 
450 #ifdef LV_HAVE_SSE
451 #include <xmmintrin.h>
452 
/*!
 * \brief Convert int16 samples to float and scale them by 1/scalar (SSE, aligned).
 *
 * The integer-to-float conversion is done per element in scalar code; only
 * the multiplication by the reciprocal and the store handle four values at
 * once. The remaining (num_points % 4) samples are divided by scalar one at
 * a time.
 *
 * This aligned variant uses an aligned store, matching the sibling aligned
 * AVX and AVX2 kernels; the output buffer must therefore be 16-byte aligned.
 *
 * \param outputVector destination for num_points floats; written with
 *                     _mm_store_ps, so it must be 16-byte aligned
 * \param inputVector  num_points int16_t samples (read element by element)
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 4;
    /* Reciprocal is formed in double precision and narrowed to float. */
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);
    const int16_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < num_blocks; block++) {
        /* _mm_set_ps takes its arguments highest lane first. */
        const __m128 converted = _mm_set_ps(
            (float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));

        /* dst advances by 16 bytes per block, so it stays aligned. */
        _mm_store_ps(dst, _mm_mul_ps(converted, reciprocal));

        src += 4;
        dst += 4;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    for (unsigned int idx = num_blocks * 4; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
484 #endif /* LV_HAVE_SSE */
485 
486 #ifdef LV_HAVE_GENERIC
487 
/*!
 * \brief Convert int16 samples to float and divide each by scalar (portable C).
 *
 * The body uses only plain element accesses, so nothing in it depends on the
 * alignment of either buffer.
 *
 * \param outputVector destination for num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
                                                       const int16_t* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    const int16_t* src = inputVector;
    float* dst = outputVector;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        *dst = ((float)(*src)) / scalar;
        dst++;
        src++;
        remaining--;
    }
}
501 #endif /* LV_HAVE_GENERIC */
502 
503 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition: sse2neon.h:7539
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:233
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:488
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:198
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:348
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:93
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:251
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:453