Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
50 static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
51  const int8_t* inputVector,
52  const float scalar,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const unsigned int sixteenthPoints = num_points / 16;
57 
58  float* outputVectorPtr = outputVector;
59  const float iScalar = 1.0 / scalar;
60  __m256 invScalar = _mm256_set1_ps(iScalar);
61  const int8_t* inputVectorPtr = inputVector;
62  __m256 ret;
63  __m128i inputVal128;
64  __m256i interimVal;
65 
66  for (; number < sixteenthPoints; number++) {
67  inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
68 
69  interimVal = _mm256_cvtepi8_epi32(inputVal128);
70  ret = _mm256_cvtepi32_ps(interimVal);
71  ret = _mm256_mul_ps(ret, invScalar);
72  _mm256_storeu_ps(outputVectorPtr, ret);
73  outputVectorPtr += 8;
74 
75  inputVal128 = _mm_srli_si128(inputVal128, 8);
76  interimVal = _mm256_cvtepi8_epi32(inputVal128);
77  ret = _mm256_cvtepi32_ps(interimVal);
78  ret = _mm256_mul_ps(ret, invScalar);
79  _mm256_storeu_ps(outputVectorPtr, ret);
80  outputVectorPtr += 8;
81 
82  inputVectorPtr += 16;
83  }
84 
85  number = sixteenthPoints * 16;
86  for (; number < num_points; number++) {
87  outputVector[number] = (float)(inputVector[number]) * iScalar;
88  }
89 }
90 #endif /* LV_HAVE_AVX2 */
91 
92 
93 #ifdef LV_HAVE_SSE4_1
94 #include <smmintrin.h>
95 
96 static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
97  const int8_t* inputVector,
98  const float scalar,
99  unsigned int num_points)
100 {
101  unsigned int number = 0;
102  const unsigned int sixteenthPoints = num_points / 16;
103 
104  float* outputVectorPtr = outputVector;
105  const float iScalar = 1.0 / scalar;
106  __m128 invScalar = _mm_set_ps1(iScalar);
107  const int8_t* inputVectorPtr = inputVector;
108  __m128 ret;
109  __m128i inputVal;
110  __m128i interimVal;
111 
112  for (; number < sixteenthPoints; number++) {
113  inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
114 
115  interimVal = _mm_cvtepi8_epi32(inputVal);
116  ret = _mm_cvtepi32_ps(interimVal);
117  ret = _mm_mul_ps(ret, invScalar);
118  _mm_storeu_ps(outputVectorPtr, ret);
119  outputVectorPtr += 4;
120 
121  inputVal = _mm_srli_si128(inputVal, 4);
122  interimVal = _mm_cvtepi8_epi32(inputVal);
123  ret = _mm_cvtepi32_ps(interimVal);
124  ret = _mm_mul_ps(ret, invScalar);
125  _mm_storeu_ps(outputVectorPtr, ret);
126  outputVectorPtr += 4;
127 
128  inputVal = _mm_srli_si128(inputVal, 4);
129  interimVal = _mm_cvtepi8_epi32(inputVal);
130  ret = _mm_cvtepi32_ps(interimVal);
131  ret = _mm_mul_ps(ret, invScalar);
132  _mm_storeu_ps(outputVectorPtr, ret);
133  outputVectorPtr += 4;
134 
135  inputVal = _mm_srli_si128(inputVal, 4);
136  interimVal = _mm_cvtepi8_epi32(inputVal);
137  ret = _mm_cvtepi32_ps(interimVal);
138  ret = _mm_mul_ps(ret, invScalar);
139  _mm_storeu_ps(outputVectorPtr, ret);
140  outputVectorPtr += 4;
141 
142  inputVectorPtr += 16;
143  }
144 
145  number = sixteenthPoints * 16;
146  for (; number < num_points; number++) {
147  outputVector[number] = (float)(inputVector[number]) * iScalar;
148  }
149 }
150 #endif /* LV_HAVE_SSE4_1 */
151 
152 #ifdef LV_HAVE_GENERIC
153 
/*
 * Portable reference kernel.
 *
 * Writes outputVector[i] = (float)inputVector[i] * (1.0 / scalar) for every
 * i in [0, num_points).
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points signed 8-bit samples
 * scalar        divisor; its reciprocal is computed once and used as a
 *               multiplier
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Reciprocal is evaluated in double (1.0 is a double literal) and then
     * narrowed to float. */
    const float iScalar = 1.0 / scalar;

    for (unsigned int idx = 0; idx < num_points; idx++) {
        outputVector[idx] = ((float)inputVector[idx]) * iScalar;
    }
}
168 #endif /* LV_HAVE_GENERIC */
169 
170 
171 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
172 
173 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
175 
176 #include <inttypes.h>
177 #include <stdio.h>
178 
179 #ifdef LV_HAVE_AVX2
180 #include <immintrin.h>
181 
182 static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
183  const int8_t* inputVector,
184  const float scalar,
185  unsigned int num_points)
186 {
187  unsigned int number = 0;
188  const unsigned int sixteenthPoints = num_points / 16;
189 
190  float* outputVectorPtr = outputVector;
191  const float iScalar = 1.0 / scalar;
192  __m256 invScalar = _mm256_set1_ps(iScalar);
193  const int8_t* inputVectorPtr = inputVector;
194  __m256 ret;
195  __m128i inputVal128;
196  __m256i interimVal;
197 
198  for (; number < sixteenthPoints; number++) {
199  inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
200 
201  interimVal = _mm256_cvtepi8_epi32(inputVal128);
202  ret = _mm256_cvtepi32_ps(interimVal);
203  ret = _mm256_mul_ps(ret, invScalar);
204  _mm256_store_ps(outputVectorPtr, ret);
205  outputVectorPtr += 8;
206 
207  inputVal128 = _mm_srli_si128(inputVal128, 8);
208  interimVal = _mm256_cvtepi8_epi32(inputVal128);
209  ret = _mm256_cvtepi32_ps(interimVal);
210  ret = _mm256_mul_ps(ret, invScalar);
211  _mm256_store_ps(outputVectorPtr, ret);
212  outputVectorPtr += 8;
213 
214  inputVectorPtr += 16;
215  }
216 
217  number = sixteenthPoints * 16;
218  for (; number < num_points; number++) {
219  outputVector[number] = (float)(inputVector[number]) * iScalar;
220  }
221 }
222 #endif /* LV_HAVE_AVX2 */
223 
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
226 
227 static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
228  const int8_t* inputVector,
229  const float scalar,
230  unsigned int num_points)
231 {
232  unsigned int number = 0;
233  const unsigned int sixteenthPoints = num_points / 16;
234 
235  float* outputVectorPtr = outputVector;
236  const float iScalar = 1.0 / scalar;
237  __m128 invScalar = _mm_set_ps1(iScalar);
238  const int8_t* inputVectorPtr = inputVector;
239  __m128 ret;
240  __m128i inputVal;
241  __m128i interimVal;
242 
243  for (; number < sixteenthPoints; number++) {
244  inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
245 
246  interimVal = _mm_cvtepi8_epi32(inputVal);
247  ret = _mm_cvtepi32_ps(interimVal);
248  ret = _mm_mul_ps(ret, invScalar);
249  _mm_store_ps(outputVectorPtr, ret);
250  outputVectorPtr += 4;
251 
252  inputVal = _mm_srli_si128(inputVal, 4);
253  interimVal = _mm_cvtepi8_epi32(inputVal);
254  ret = _mm_cvtepi32_ps(interimVal);
255  ret = _mm_mul_ps(ret, invScalar);
256  _mm_store_ps(outputVectorPtr, ret);
257  outputVectorPtr += 4;
258 
259  inputVal = _mm_srli_si128(inputVal, 4);
260  interimVal = _mm_cvtepi8_epi32(inputVal);
261  ret = _mm_cvtepi32_ps(interimVal);
262  ret = _mm_mul_ps(ret, invScalar);
263  _mm_store_ps(outputVectorPtr, ret);
264  outputVectorPtr += 4;
265 
266  inputVal = _mm_srli_si128(inputVal, 4);
267  interimVal = _mm_cvtepi8_epi32(inputVal);
268  ret = _mm_cvtepi32_ps(interimVal);
269  ret = _mm_mul_ps(ret, invScalar);
270  _mm_store_ps(outputVectorPtr, ret);
271  outputVectorPtr += 4;
272 
273  inputVectorPtr += 16;
274  }
275 
276  number = sixteenthPoints * 16;
277  for (; number < num_points; number++) {
278  outputVector[number] = (float)(inputVector[number]) * iScalar;
279  }
280 }
281 #endif /* LV_HAVE_SSE4_1 */
282 
283 #ifdef LV_HAVE_NEON
284 #include <arm_neon.h>
285 
/*
 * NEON kernel.
 *
 * Writes outputVector[i] = (float)inputVector[i] * (1.0 / scalar) for every
 * i in [0, num_points).
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points signed 8-bit samples
 * scalar        divisor; its reciprocal is computed once and used as a
 *               multiplier
 * num_points    number of samples to convert
 *
 * Samples are processed 16 at a time: the bytes are widened to 16-bit, then
 * to 32-bit, converted to float and scaled in four groups of four. The
 * remaining num_points % 16 samples are converted by a scalar loop.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    /* Reciprocal is evaluated in double (1.0 is a double literal) and then
     * narrowed to float. */
    const float iScalar = 1.0 / scalar;
    const float32x4_t scale = vdupq_n_f32(iScalar);
    const unsigned int blockCount = num_points / 16;

    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const int8x16_t bytes = vld1q_s8(src);

        /* First widening step: 8-bit -> 16-bit for each half of the load. */
        const int16x8_t lowHalf = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t highHalf = vmovl_s8(vget_high_s8(bytes));

        /* Second widening step: 16-bit -> 32-bit, four samples per group. */
        const int32x4_t group0 = vmovl_s16(vget_low_s16(lowHalf));
        const int32x4_t group1 = vmovl_s16(vget_high_s16(lowHalf));
        const int32x4_t group2 = vmovl_s16(vget_low_s16(highHalf));
        const int32x4_t group3 = vmovl_s16(vget_high_s16(highHalf));

        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(group0), scale));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(group1), scale));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(group2), scale));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(group3), scale));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a 16-wide block. */
    for (unsigned int idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = ((float)inputVector[idx]) * iScalar;
    }
}
334 
335 #endif /* LV_HAVE_NEON */
336 
337 #ifdef LV_HAVE_GENERIC
338 
/*
 * Portable reference kernel, declared in the aligned section of the header.
 *
 * Writes outputVector[i] = (float)inputVector[i] * (1.0 / scalar) for every
 * i in [0, num_points). The body uses plain scalar accesses only.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points signed 8-bit samples
 * scalar        divisor; its reciprocal is computed once and used as a
 *               multiplier
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    /* Reciprocal is evaluated in double (1.0 is a double literal) and then
     * narrowed to float. */
    const float iScalar = 1.0 / scalar;

    for (unsigned int idx = 0; idx < num_points; idx++) {
        outputVector[idx] = ((float)inputVector[idx]) * iScalar;
    }
}
353 #endif /* LV_HAVE_GENERIC */
354 
355 
356 #ifdef LV_HAVE_ORC
/* ORC implementation, defined outside this header. */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*
 * ORC wrapper.
 *
 * Forwards to volk_8i_s32f_convert_32f_a_orc_impl, passing the reciprocal of
 * scalar in place of scalar itself.
 *
 * outputVector  destination buffer, forwarded unchanged
 * inputVector   source buffer, forwarded unchanged
 * scalar        divisor; the implementation receives 1.0 / scalar
 * num_points    number of samples, forwarded unchanged
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    /* Reciprocal is evaluated in double (1.0 is a double literal) and then
     * narrowed to float. */
    const float reciprocal = 1.0 / scalar;

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
370 #endif /* LV_HAVE_ORC */
371 
372 
373 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition: sse2neon.h:7574
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:339
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:154
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:286