41 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
48 #include <immintrin.h>
/*
 * volk_16i_s32f_convert_32f_u_avx2
 *
 * Converts each int16_t sample of inputVector to float, scales it by
 * 1/scalar and writes the result to outputVector. Eight samples are
 * processed per vector iteration; results are written with an unaligned
 * store.
 *
 * outputVector: destination for num_points floats.
 * inputVector:  source of num_points int16_t samples.
 * num_points:   number of samples to convert.
 *
 * NOTE(review): this listing omits several source lines (the declaration of
 * the `scalar` parameter, the declarations/load of inputVal, any pointer
 * advances and the closing braces are not visible). The comments below
 * describe only the statements that are shown.
 */
50 static inline void volk_16i_s32f_convert_32f_u_avx2(
float* outputVector,
51 const int16_t* inputVector,
53 unsigned int num_points)
55 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
56 const unsigned int eighthPoints = num_points / 8;
58 float* outputVectorPtr = outputVector;
/* 1/scalar broadcast to all eight lanes so the loop can multiply instead of divide. */
59 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
/* The cast discards the const qualifier of inputVector. */
60 int16_t* inputPtr = (int16_t*)inputVector;
65 for (; number < eighthPoints; number++) {
/* Widen eight 16-bit integers to 32-bit integers (sign-extending). */
71 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
/* Convert the 32-bit integers to single-precision floats, then scale. */
73 ret = _mm256_cvtepi32_ps(inputVal2);
74 ret = _mm256_mul_ps(ret, invScalar);
/* Unaligned store: outputVectorPtr need not be 32-byte aligned. */
76 _mm256_storeu_ps(outputVectorPtr, ret);
/*
 * Scalar tail for the num_points % 8 samples left over. Note that it divides
 * by scalar directly, whereas the vector loop multiplies by the reciprocal.
 */
83 number = eighthPoints * 8;
84 for (; number < num_points; number++) {
85 outputVector[number] = ((float)(inputVector[number])) / scalar;
91 #include <immintrin.h>
/*
 * Parameter list and body fragments of an AVX int16 -> float conversion
 * kernel that writes its results with an unaligned store.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_u_avx at this position. Most of the loop body
 * (the loads, the widening/conversion and the computation of `ret`) is also
 * missing from the listing, so only the visible statements are described.
 */
94 const int16_t* inputVector,
96 unsigned int num_points)
98 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
99 const unsigned int eighthPoints = num_points / 8;
101 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
103 int16_t* inputPtr = (int16_t*)inputVector;
/* All-zero 256-bit register used as the base for the first 128-bit insert. */
107 __m256 dummy = _mm256_setzero_ps();
109 for (; number < eighthPoints; number++) {
/* Place one 128-bit result (`ret`) in the low half of the 256-bit output... */
124 output = _mm256_insertf128_ps(dummy, ret, 0);
/* ...and another 128-bit result in the high half. */
128 output = _mm256_insertf128_ps(output, ret, 1);
/* Unaligned store of eight floats, then advance to the next output group. */
130 _mm256_storeu_ps(outputVectorPtr, output);
132 outputVectorPtr += 8;
/* Scalar tail for the num_points % 8 samples left over. */
137 number = eighthPoints * 8;
138 for (; number < num_points; number++) {
139 outputVector[number] = ((float)(inputVector[number])) / scalar;
144 #ifdef LV_HAVE_SSE4_1
145 #include <smmintrin.h>
/*
 * volk_16i_s32f_convert_32f_u_sse4_1
 *
 * SSE4.1 kernel that converts int16_t samples of inputVector to floats in
 * outputVector; the scalar tail shows the per-sample operation to be
 * (float)input / scalar.
 *
 * outputVector: destination for num_points floats.
 * inputVector:  source of num_points int16_t samples.
 * num_points:   number of samples to convert.
 *
 * NOTE(review): almost the entire vector loop body is missing from this
 * listing (only the two output-pointer advances are visible), as are the
 * `scalar` parameter declaration and the closing braces.
 */
147 static inline void volk_16i_s32f_convert_32f_u_sse4_1(
float* outputVector,
148 const int16_t* inputVector,
150 unsigned int num_points)
152 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
153 const unsigned int eighthPoints = num_points / 8;
155 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
157 int16_t* inputPtr = (int16_t*)inputVector;
162 for (; number < eighthPoints; number++) {
/*
 * The output pointer advances by four floats twice per iteration, i.e. each
 * iteration emits its eight results as two groups of four.
 */
177 outputVectorPtr += 4;
183 outputVectorPtr += 4;
/* Scalar tail for the num_points % 8 samples left over. */
188 number = eighthPoints * 8;
189 for (; number < num_points; number++) {
190 outputVector[number] = ((float)(inputVector[number])) / scalar;
196 #include <xmmintrin.h>
/*
 * Parameter list and body fragments of an SSE int16 -> float conversion
 * kernel that handles four samples per vector iteration.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_u_sse at this position. The start of the
 * vector-construction call, the scaling, the store and the input-pointer
 * advance are also missing from the listing.
 */
199 const int16_t* inputVector,
201 unsigned int num_points)
203 unsigned int number = 0;
/* Number of complete 4-sample groups handled by the vector loop. */
204 const unsigned int quarterPoints = num_points / 4;
206 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
208 int16_t* inputPtr = (int16_t*)inputVector;
211 for (; number < quarterPoints; number++) {
/*
 * Trailing arguments of a call whose opening is not shown: samples 2, 1 and 0
 * are each converted to float individually and passed in descending index
 * order. NOTE(review): presumably an _mm_set_ps(...) call whose first
 * argument is inputPtr[3] -- confirm against the full source.
 */
213 (
float)(inputPtr[2]),
214 (
float)(inputPtr[1]),
215 (
float)(inputPtr[0]));
221 outputVectorPtr += 4;
/* Scalar tail for the num_points % 4 samples left over. */
224 number = quarterPoints * 4;
225 for (; number < num_points; number++) {
226 outputVector[number] = (float)(inputVector[number]) / scalar;
231 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar implementation: for each of the num_points samples, casts
 * the int16_t input to float, divides it by scalar and stores the result.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_generic at this position. Closing braces are
 * likewise missing from the listing.
 */
234 const int16_t* inputVector,
236 unsigned int num_points)
238 float* outputVectorPtr = outputVector;
239 const int16_t* inputVectorPtr = inputVector;
240 unsigned int number = 0;
242 for (number = 0; number < num_points; number++) {
/* Read one sample, convert and scale it, write it, and advance both pointers. */
243 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
249 #include <arm_neon.h>
/*
 * Parameter list and body of an ARM NEON int16 -> float conversion kernel
 * that handles eight samples per vector iteration and scales by 1/scalar.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_neon at this position. The declaration of
 * input16, any pointer advances inside the vector loop and the closing
 * braces are also missing from the listing.
 */
252 const int16_t* inputVector,
254 unsigned int num_points)
256 float* outputPtr = outputVector;
257 const int16_t* inputPtr = inputVector;
258 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
259 unsigned int eighth_points = num_points / 8;
262 int32x4_t input32_0, input32_1;
263 float32x4_t input_float_0, input_float_1;
264 float32x4x2_t output_float;
265 float32x4_t inv_scale;
/* 1/scalar broadcast to all four lanes so the loop can multiply instead of divide. */
267 inv_scale = vdupq_n_f32(1.0 / scalar);
273 for (number = 0; number < eighth_points; number++) {
/*
 * De-interleaving load of eight int16 values: even-indexed samples land in
 * val[0], odd-indexed samples in val[1].
 */
274 input16 = vld2_s16(inputPtr);
/* Widen each half from 16-bit to 32-bit integers. */
276 input32_0 = vmovl_s16(input16.val[0]);
277 input32_1 = vmovl_s16(input16.val[1]);
/* Convert to float and scale by the reciprocal. */
279 input_float_0 = vcvtq_f32_s32(input32_0);
280 input_float_1 = vcvtq_f32_s32(input32_1);
281 output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
282 output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
/* Interleaving store, which restores the original sample order in memory. */
283 vst2q_f32(outputPtr, output_float);
/*
 * Scalar tail for the num_points % 8 samples left over. It continues from
 * inputPtr/outputPtr rather than indexing the vectors, so it relies on both
 * pointers having been advanced past the vectorised samples.
 * NOTE(review): those advances are not visible in this listing -- confirm.
 */
288 for (number = eighth_points * 8; number < num_points; number++) {
289 *outputPtr++ = ((float)(*inputPtr++)) / scalar;
296 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
297 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
299 #include <inttypes.h>
303 #include <immintrin.h>
/*
 * volk_16i_s32f_convert_32f_a_avx2
 *
 * Converts each int16_t sample of inputVector to float, scales it by
 * 1/scalar and writes the result to outputVector. Eight samples are
 * processed per vector iteration; results are written with an aligned
 * store, so outputVector must be 32-byte aligned.
 *
 * outputVector: destination for num_points floats.
 * inputVector:  source of num_points int16_t samples.
 * num_points:   number of samples to convert.
 *
 * NOTE(review): this listing omits several source lines (the declaration of
 * the `scalar` parameter, the declarations/load of inputVal, the input
 * pointer advance and the closing braces are not visible). The comments
 * below describe only the statements that are shown.
 */
305 static inline void volk_16i_s32f_convert_32f_a_avx2(
float* outputVector,
306 const int16_t* inputVector,
308 unsigned int num_points)
310 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
311 const unsigned int eighthPoints = num_points / 8;
313 float* outputVectorPtr = outputVector;
/* 1/scalar broadcast to all eight lanes so the loop can multiply instead of divide. */
314 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
/* The cast discards the const qualifier of inputVector. */
315 int16_t* inputPtr = (int16_t*)inputVector;
320 for (; number < eighthPoints; number++) {
/* Widen eight 16-bit integers to 32-bit integers (sign-extending). */
326 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
/* Convert the 32-bit integers to single-precision floats, then scale. */
328 ret = _mm256_cvtepi32_ps(inputVal2);
329 ret = _mm256_mul_ps(ret, invScalar);
/* Aligned store: outputVectorPtr must be 32-byte aligned. */
331 _mm256_store_ps(outputVectorPtr, ret);
333 outputVectorPtr += 8;
/*
 * Scalar tail for the num_points % 8 samples left over. Note that it divides
 * by scalar directly, whereas the vector loop multiplies by the reciprocal.
 */
338 number = eighthPoints * 8;
339 for (; number < num_points; number++) {
340 outputVector[number] = ((float)(inputVector[number])) / scalar;
346 #include <immintrin.h>
/*
 * Parameter list and body fragments of an AVX int16 -> float conversion
 * kernel that writes its results with an aligned store.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_a_avx at this position. Most of the loop body
 * (the loads, the widening/conversion and the computation of `ret`) is also
 * missing from the listing, so only the visible statements are described.
 */
349 const int16_t* inputVector,
351 unsigned int num_points)
353 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
354 const unsigned int eighthPoints = num_points / 8;
356 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
358 int16_t* inputPtr = (int16_t*)inputVector;
/* All-zero 256-bit register used as the base for the first 128-bit insert. */
362 __m256 dummy = _mm256_setzero_ps();
364 for (; number < eighthPoints; number++) {
/* Place one 128-bit result (`ret`) in the low half of the 256-bit output... */
379 output = _mm256_insertf128_ps(dummy, ret, 0);
/* ...and another 128-bit result in the high half. */
383 output = _mm256_insertf128_ps(output, ret, 1);
/* Aligned store of eight floats: outputVectorPtr must be 32-byte aligned. */
385 _mm256_store_ps(outputVectorPtr, output);
387 outputVectorPtr += 8;
/* Scalar tail for the num_points % 8 samples left over. */
392 number = eighthPoints * 8;
393 for (; number < num_points; number++) {
394 outputVector[number] = ((float)(inputVector[number])) / scalar;
399 #ifdef LV_HAVE_SSE4_1
400 #include <smmintrin.h>
/*
 * volk_16i_s32f_convert_32f_a_sse4_1
 *
 * SSE4.1 kernel that converts int16_t samples of inputVector to floats in
 * outputVector; the scalar tail shows the per-sample operation to be
 * (float)input / scalar.
 *
 * outputVector: destination for num_points floats.
 * inputVector:  source of num_points int16_t samples.
 * num_points:   number of samples to convert.
 *
 * NOTE(review): almost the entire vector loop body is missing from this
 * listing (only the two output-pointer advances are visible), as are the
 * `scalar` parameter declaration and the closing braces. The "_a" in the
 * name presumably denotes the aligned-memory variant -- the load/store
 * intrinsics that would confirm this are not shown.
 */
402 static inline void volk_16i_s32f_convert_32f_a_sse4_1(
float* outputVector,
403 const int16_t* inputVector,
405 unsigned int num_points)
407 unsigned int number = 0;
/* Number of complete 8-sample groups handled by the vector loop. */
408 const unsigned int eighthPoints = num_points / 8;
410 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
412 int16_t* inputPtr = (int16_t*)inputVector;
417 for (; number < eighthPoints; number++) {
/*
 * The output pointer advances by four floats twice per iteration, i.e. each
 * iteration emits its eight results as two groups of four.
 */
432 outputVectorPtr += 4;
438 outputVectorPtr += 4;
/* Scalar tail for the num_points % 8 samples left over. */
443 number = eighthPoints * 8;
444 for (; number < num_points; number++) {
445 outputVector[number] = ((float)(inputVector[number])) / scalar;
451 #include <xmmintrin.h>
/*
 * Parameter list and body fragments of an SSE int16 -> float conversion
 * kernel that handles four samples per vector iteration.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_a_sse at this position. The start of the
 * vector-construction call, the scaling, the store and the input-pointer
 * advance are also missing from the listing.
 */
454 const int16_t* inputVector,
456 unsigned int num_points)
458 unsigned int number = 0;
/* Number of complete 4-sample groups handled by the vector loop. */
459 const unsigned int quarterPoints = num_points / 4;
461 float* outputVectorPtr = outputVector;
/* The cast discards the const qualifier of inputVector. */
463 int16_t* inputPtr = (int16_t*)inputVector;
466 for (; number < quarterPoints; number++) {
/*
 * Trailing arguments of a call whose opening is not shown: samples 2, 1 and 0
 * are each converted to float individually and passed in descending index
 * order. NOTE(review): presumably an _mm_set_ps(...) call whose first
 * argument is inputPtr[3] -- confirm against the full source.
 */
468 (
float)(inputPtr[2]),
469 (
float)(inputPtr[1]),
470 (
float)(inputPtr[0]));
476 outputVectorPtr += 4;
/* Scalar tail for the num_points % 4 samples left over. */
479 number = quarterPoints * 4;
480 for (; number < num_points; number++) {
481 outputVector[number] = (float)(inputVector[number]) / scalar;
486 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar implementation: for each of the num_points samples, casts
 * the int16_t input to float, divides it by scalar and stores the result.
 * The visible statements are the same as in the other generic kernel in this
 * listing.
 *
 * NOTE(review): the line carrying the function name is not visible here; the
 * cross-reference section at the end of this listing places
 * volk_16i_s32f_convert_32f_a_generic at this position. Closing braces are
 * likewise missing from the listing.
 */
489 const int16_t* inputVector,
491 unsigned int num_points)
493 float* outputVectorPtr = outputVector;
494 const int16_t* inputVectorPtr = inputVector;
495 unsigned int number = 0;
497 for (number = 0; number < num_points; number++) {
/* Read one sample, convert and scale it, write it, and advance both pointers. */
498 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition: sse2neon.h:7539
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:233
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:488
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:198
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:348
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:93
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:251
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:453