41 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
48 #include <immintrin.h>
50 static inline void volk_8i_s32f_convert_32f_u_avx2(
float* outputVector,
51 const int8_t* inputVector,
53 unsigned int num_points)
55 unsigned int number = 0;
56 const unsigned int sixteenthPoints = num_points / 16;
58 float* outputVectorPtr = outputVector;
59 const float iScalar = 1.0 / scalar;
60 __m256 invScalar = _mm256_set1_ps(iScalar);
61 const int8_t* inputVectorPtr = inputVector;
66 for (; number < sixteenthPoints; number++) {
69 interimVal = _mm256_cvtepi8_epi32(inputVal128);
70 ret = _mm256_cvtepi32_ps(interimVal);
71 ret = _mm256_mul_ps(ret, invScalar);
72 _mm256_storeu_ps(outputVectorPtr, ret);
76 interimVal = _mm256_cvtepi8_epi32(inputVal128);
77 ret = _mm256_cvtepi32_ps(interimVal);
78 ret = _mm256_mul_ps(ret, invScalar);
79 _mm256_storeu_ps(outputVectorPtr, ret);
85 number = sixteenthPoints * 16;
86 for (; number < num_points; number++) {
87 outputVector[number] = (float)(inputVector[number]) * iScalar;
94 #include <smmintrin.h>
96 static inline void volk_8i_s32f_convert_32f_u_sse4_1(
float* outputVector,
97 const int8_t* inputVector,
99 unsigned int num_points)
101 unsigned int number = 0;
102 const unsigned int sixteenthPoints = num_points / 16;
104 float* outputVectorPtr = outputVector;
105 const float iScalar = 1.0 / scalar;
107 const int8_t* inputVectorPtr = inputVector;
112 for (; number < sixteenthPoints; number++) {
119 outputVectorPtr += 4;
126 outputVectorPtr += 4;
133 outputVectorPtr += 4;
140 outputVectorPtr += 4;
142 inputVectorPtr += 16;
145 number = sixteenthPoints * 16;
146 for (; number < num_points; number++) {
147 outputVector[number] = (float)(inputVector[number]) * iScalar;
152 #ifdef LV_HAVE_GENERIC
/**
 * Converts signed 8-bit samples to 32-bit floats and divides each by
 * \p scalar (portable C implementation).
 *
 * \param outputVector  destination buffer; num_points floats are written
 * \param inputVector   source buffer; num_points int8_t samples are read
 * \param scalar        divisor; applied as a multiplication by 1/scalar
 * \param num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number = 0;
    /* The reciprocal is computed once so the loop only multiplies. */
    const float iScalar = 1.0 / scalar;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
173 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
176 #include <inttypes.h>
180 #include <immintrin.h>
182 static inline void volk_8i_s32f_convert_32f_a_avx2(
float* outputVector,
183 const int8_t* inputVector,
185 unsigned int num_points)
187 unsigned int number = 0;
188 const unsigned int sixteenthPoints = num_points / 16;
190 float* outputVectorPtr = outputVector;
191 const float iScalar = 1.0 / scalar;
192 __m256 invScalar = _mm256_set1_ps(iScalar);
193 const int8_t* inputVectorPtr = inputVector;
198 for (; number < sixteenthPoints; number++) {
201 interimVal = _mm256_cvtepi8_epi32(inputVal128);
202 ret = _mm256_cvtepi32_ps(interimVal);
203 ret = _mm256_mul_ps(ret, invScalar);
204 _mm256_store_ps(outputVectorPtr, ret);
205 outputVectorPtr += 8;
208 interimVal = _mm256_cvtepi8_epi32(inputVal128);
209 ret = _mm256_cvtepi32_ps(interimVal);
210 ret = _mm256_mul_ps(ret, invScalar);
211 _mm256_store_ps(outputVectorPtr, ret);
212 outputVectorPtr += 8;
214 inputVectorPtr += 16;
217 number = sixteenthPoints * 16;
218 for (; number < num_points; number++) {
219 outputVector[number] = (float)(inputVector[number]) * iScalar;
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
227 static inline void volk_8i_s32f_convert_32f_a_sse4_1(
float* outputVector,
228 const int8_t* inputVector,
230 unsigned int num_points)
232 unsigned int number = 0;
233 const unsigned int sixteenthPoints = num_points / 16;
235 float* outputVectorPtr = outputVector;
236 const float iScalar = 1.0 / scalar;
238 const int8_t* inputVectorPtr = inputVector;
243 for (; number < sixteenthPoints; number++) {
250 outputVectorPtr += 4;
257 outputVectorPtr += 4;
264 outputVectorPtr += 4;
271 outputVectorPtr += 4;
273 inputVectorPtr += 16;
276 number = sixteenthPoints * 16;
277 for (; number < num_points; number++) {
278 outputVector[number] = (float)(inputVector[number]) * iScalar;
284 #include <arm_neon.h>
/**
 * Converts signed 8-bit samples to 32-bit floats and divides each by
 * \p scalar (ARM NEON path).
 *
 * \param outputVector  destination buffer; num_points floats are written
 * \param inputVector   source buffer; num_points int8_t samples are read
 * \param scalar        divisor; applied as a multiplication by 1/scalar
 * \param num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    int8x16_t inputVal;
    int16x8_t lower;
    int16x8_t higher;
    float32x4_t outputFloat;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    for (; number < sixteenthPoints; number++) {
        inputVal = vld1q_s8(inputVectorPtr);
        inputVectorPtr += 16;

        /* Widen 16 x int8 into two 8 x int16 halves. */
        lower = vmovl_s8(vget_low_s8(inputVal));
        higher = vmovl_s8(vget_high_s8(inputVal));

        /* Each quarter: widen to int32, convert to float, scale, store. */
        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        /* The result must be assigned, otherwise the third quarter is stored twice. */
        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;
    }

    /* Scalar tail; both pointers already sit just past the vectorised part. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
337 #ifdef LV_HAVE_GENERIC
/**
 * Converts signed 8-bit samples to 32-bit floats and divides each by
 * \p scalar (portable C implementation, "a" variant).
 *
 * The body is plain scalar C and uses no aligned-access intrinsics.
 *
 * \param outputVector  destination buffer; num_points floats are written
 * \param inputVector   source buffer; num_points int8_t samples are read
 * \param scalar        divisor; applied as a multiplication by 1/scalar
 * \param num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number = 0;
    /* The reciprocal is computed once so the loop only multiplies. */
    const float iScalar = 1.0 / scalar;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
/*
 * ORC-generated kernel, defined outside this header. The wrapper below
 * passes it the reciprocal of the user's scalar as the third argument.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/**
 * Converts signed 8-bit samples to 32-bit floats by delegating to the
 * ORC implementation.
 *
 * \param outputVector  destination buffer, forwarded unchanged
 * \param inputVector   source buffer, forwarded unchanged
 * \param scalar        divisor; its reciprocal is what the ORC kernel receives
 * \param num_points    number of samples, forwarded unchanged
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float invscalar = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
/*
 * Cross-references carried over from the generated documentation:
 *
 *   float32x4_t __m128
 *       Definition: sse2neon.h:235
 *   FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 *       Definition: sse2neon.h:4570
 *   FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
 *       Definition: sse2neon.h:2787
 *   FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
 *       Definition: sse2neon.h:2205
 *   FORCE_INLINE __m128 _mm_set_ps1(float)
 *       Definition: sse2neon.h:2437
 *   FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
 *       Definition: sse2neon.h:7574
 *   FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 *       Definition: sse2neon.h:4471
 *   FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
 *       Definition: sse2neon.h:5885
 *   int64x2_t __m128i
 *       Definition: sse2neon.h:244
 *   FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
 *       Definition: sse2neon.h:2704
 *   FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 *       Definition: sse2neon.h:3937
 *   static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:339
 *   static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:154
 *   static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:286
 */