41 #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
42 #define INCLUDED_volk_16ic_magnitude_16i_a_H
51 #include <immintrin.h>
/*
 * Aligned AVX2 kernel: for each complex 16-bit input point, writes a 16-bit
 * magnitude to magnitudeVector.
 *
 * magnitudeVector - output, one int16_t per complex point.
 * complexVector   - input, read as interleaved int16_t pairs through
 *                   complexVectorPtr (its parameter line is not part of this
 *                   excerpt).
 * num_points      - number of complex points to process.
 *
 * The input is read with _mm256_load_si256, which requires a 32-byte aligned
 * address. NOTE(review): several statements (declarations of int1/int2/
 * short1/short2, the permute arguments, the vector store and the closing
 * braces) are missing from this excerpt.
 */
53 static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
55 unsigned int num_points)
57 unsigned int number = 0;
/* Each vector iteration consumes 8 complex points (16 int16_t values). */
58 const unsigned int eighthPoints = num_points / 8;
60 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
61 int16_t* magnitudeVectorPtr = magnitudeVector;
/* Inputs are scaled by 1/SHRT_MAX before squaring and the root is scaled
 * back by SHRT_MAX afterwards. */
63 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
64 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
67 __m256 cplxValue1, cplxValue2, result;
/* Dword selectors, lowest element first: 0, 4, 1, 5, then four zeros.
 * Applied to the packed result, these pick the two dwords from each 128-bit
 * lane alternately so that the low 128 bits end up in input order. */
68 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
70 for (; number < eighthPoints; number++) {
/* Load 16 int16_t values (8 complex points) and advance the input. */
72 int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
73 complexVectorPtr += 16;
/* Split into low/high 128-bit halves (4 complex points each). */
74 short1 = _mm256_extracti128_si256(int1, 0);
75 short2 = _mm256_extracti128_si256(int1, 1);
/* Sign-extend to 32-bit integers, then convert to float. */
77 int1 = _mm256_cvtepi16_epi32(short1);
78 int2 = _mm256_cvtepi16_epi32(short2);
79 cplxValue1 = _mm256_cvtepi32_ps(int1);
80 cplxValue2 = _mm256_cvtepi32_ps(int2);
/* Normalize by 1/SHRT_MAX. */
82 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
83 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
/* Square every real and imaginary component. */
85 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
86 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/* Horizontal add sums adjacent (re^2, im^2) pairs; within each 128-bit lane
 * the sums from cplxValue1 come first, followed by those from cplxValue2. */
88 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
90 result = _mm256_sqrt_ps(result);
/* Scale back to the int16_t range. */
92 result = _mm256_mul_ps(result, vScalar);
/* Convert to 32-bit integers and narrow to 16 bits with signed saturation. */
94 int1 = _mm256_cvtps_epi32(result);
95 int1 = _mm256_packs_epi32(int1, int1);
/* NOTE(review): the permute arguments (presumably int1 and idx) and the
 * store of short1 to magnitudeVectorPtr are not visible in this excerpt. */
96 int1 = _mm256_permutevar8x32_epi32(
98 short1 = _mm256_extracti128_si256(int1, 0);
100 magnitudeVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 points. */
103 number = eighthPoints * 8;
104 magnitudeVectorPtr = &magnitudeVector[number];
105 complexVectorPtr = (
const int16_t*)&complexVector[number];
106 for (; number < num_points; number++) {
107 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
108 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
109 const float val1Result =
110 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
/* NOTE(review): unlike the saturating pack in the vector loop, this cast does
 * not clamp; a magnitude above SHRT_MAX is converted directly. */
111 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
117 #include <pmmintrin.h>
/*
 * Body of an SSE3 magnitude kernel (presumably volk_16ic_magnitude_16i_a_sse3;
 * the start of the signature is not part of this excerpt).
 *
 * Processes 4 complex int16_t points per vector iteration and finishes the
 * remaining num_points % 4 points with scalar code.
 * NOTE(review): the declarations of invScalar, inputFloatBuffer and
 * outputFloatBuffer, the loads into cplxValue1/cplxValue2, and the steps
 * between squaring and the output stores are missing from this excerpt.
 */
121 unsigned int num_points)
123 unsigned int number = 0;
124 const unsigned int quarterPoints = num_points / 4;
126 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
127 int16_t* magnitudeVectorPtr = magnitudeVector;
132 __m128 cplxValue1, cplxValue2, result;
137 for (; number < quarterPoints; number++) {
/* Convert 8 int16_t values (4 interleaved complex points) to float through a
 * staging buffer. */
139 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
140 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
141 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
142 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
144 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
145 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
146 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
147 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
152 complexVectorPtr += 8;
/* Scale both vectors by invScalar (presumably 1/SHRT_MAX, as in the scalar
 * tail below - confirm against the declaration). */
154 cplxValue1 =
_mm_mul_ps(cplxValue1, invScalar);
155 cplxValue2 =
_mm_mul_ps(cplxValue2, invScalar);
/* Square every component. */
157 cplxValue1 =
_mm_mul_ps(cplxValue1, cplxValue1);
158 cplxValue2 =
_mm_mul_ps(cplxValue2, cplxValue2);
/* Round the four float results held in outputFloatBuffer and write them out
 * as int16_t. */
167 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
168 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
169 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
170 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: normalize by SHRT_MAX, take sqrt(re^2 + im^2), rescale by
 * SHRT_MAX, round with rintf and cast to int16_t (no clamping). */
173 number = quarterPoints * 4;
174 magnitudeVectorPtr = &magnitudeVector[number];
175 complexVectorPtr = (
const int16_t*)&complexVector[number];
176 for (; number < num_points; number++) {
177 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
178 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
179 const float val1Result =
180 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
181 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
187 #include <xmmintrin.h>
/*
 * Body of an SSE magnitude kernel (presumably volk_16ic_magnitude_16i_a_sse;
 * the start of the signature is not part of this excerpt).
 *
 * Processes 4 complex int16_t points per vector iteration and finishes the
 * remaining num_points % 4 points with scalar code.
 * NOTE(review): the declarations of invScalar and the float buffers, the
 * loads into cplxValue1/cplxValue2, and the steps between scaling and the
 * output stores (which presumably use iValue/qValue) are missing from this
 * excerpt.
 */
191 unsigned int num_points)
193 unsigned int number = 0;
194 const unsigned int quarterPoints = num_points / 4;
196 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
197 int16_t* magnitudeVectorPtr = magnitudeVector;
202 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
207 for (; number < quarterPoints; number++) {
/* First two complex points: convert 4 int16_t values to float via the
 * staging buffer, then advance the input. */
209 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
210 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
211 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
212 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
215 complexVectorPtr += 4;
/* Next two complex points: the same staging buffer is refilled. */
217 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
218 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
219 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
220 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
223 complexVectorPtr += 4;
/* Scale both vectors by invScalar (presumably 1/SHRT_MAX, as in the scalar
 * tail below - confirm against the declaration). */
225 cplxValue1 =
_mm_mul_ps(cplxValue1, invScalar);
226 cplxValue2 =
_mm_mul_ps(cplxValue2, invScalar);
/* Round the four float results held in outputFloatBuffer and write them out
 * as int16_t. */
243 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
244 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
245 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
246 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: normalize by SHRT_MAX, take sqrt(re^2 + im^2), rescale by
 * SHRT_MAX, round with rintf and cast to int16_t (no clamping). */
249 number = quarterPoints * 4;
250 magnitudeVectorPtr = &magnitudeVector[number];
251 complexVectorPtr = (
const int16_t*)&complexVector[number];
252 for (; number < num_points; number++) {
253 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
254 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
255 const float val1Result =
256 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
257 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
262 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable scalar kernel (presumably
 * volk_16ic_magnitude_16i_generic; the start of the signature and the braces
 * are not part of this excerpt).
 *
 * For each of the num_points complex points, reads the real and imaginary
 * int16_t values in sequence and writes one int16_t magnitude.
 */
266 unsigned int num_points)
268 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
269 int16_t* magnitudeVectorPtr = magnitudeVector;
270 unsigned int number = 0;
/* Components are divided by SHRT_MAX before squaring and the root is
 * multiplied by SHRT_MAX again before rounding. */
271 const float scalar = SHRT_MAX;
272 for (number = 0; number < num_points; number++) {
273 float real = ((float)(*complexVectorPtr++)) / scalar;
274 float imag = ((float)(*complexVectorPtr++)) / scalar;
/* NOTE(review): the rounded value is cast straight to int16_t without
 * clamping, so a magnitude above SHRT_MAX is not saturated. */
275 *magnitudeVectorPtr++ =
276 (int16_t)
rintf(sqrtf((real * real) + (imag * imag)) * scalar);
281 #ifdef LV_HAVE_ORC_DISABLED
/*
 * External ORC implementation. Its middle parameters are not visible in this
 * excerpt; the call below passes four arguments: output, input, a scalar and
 * the point count.
 */
282 extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
285 unsigned int num_points);
/*
 * Thin wrapper that forwards to the ORC implementation, supplying SHRT_MAX as
 * the scalar argument.
 * NOTE(review): the wrapper is named "_u_orc" but calls the "_a_orc_impl"
 * symbol - confirm that this is intentional.
 */
287 static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
289 unsigned int num_points)
291 volk_16ic_magnitude_16i_a_orc_impl(
292 magnitudeVector, complexVector, SHRT_MAX, num_points);
300 #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
301 #define INCLUDED_volk_16ic_magnitude_16i_u_H
303 #include <inttypes.h>
309 #include <immintrin.h>
/*
 * Unaligned AVX2 kernel: for each complex 16-bit input point, writes a 16-bit
 * magnitude to magnitudeVector.
 *
 * magnitudeVector - output, one int16_t per complex point.
 * complexVector   - input, read as interleaved int16_t pairs through
 *                   complexVectorPtr (its parameter line is not part of this
 *                   excerpt).
 * num_points      - number of complex points to process.
 *
 * The input is read with _mm256_loadu_si256, so no alignment is required for
 * the load. NOTE(review): several statements (declarations of int1/int2/
 * short1/short2, the permute arguments, the vector store and the closing
 * braces) are missing from this excerpt.
 */
311 static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
313 unsigned int num_points)
315 unsigned int number = 0;
/* Each vector iteration consumes 8 complex points (16 int16_t values). */
316 const unsigned int eighthPoints = num_points / 8;
318 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
319 int16_t* magnitudeVectorPtr = magnitudeVector;
/* Inputs are scaled by 1/SHRT_MAX before squaring and the root is scaled
 * back by SHRT_MAX afterwards. */
321 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
322 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
325 __m256 cplxValue1, cplxValue2, result;
/* Dword selectors, lowest element first: 0, 4, 1, 5, then four zeros.
 * Applied to the packed result, these pick the two dwords from each 128-bit
 * lane alternately so that the low 128 bits end up in input order. */
326 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
328 for (; number < eighthPoints; number++) {
/* Unaligned load of 16 int16_t values (8 complex points). */
330 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
331 complexVectorPtr += 16;
/* Split into low/high 128-bit halves (4 complex points each). */
332 short1 = _mm256_extracti128_si256(int1, 0);
333 short2 = _mm256_extracti128_si256(int1, 1);
/* Sign-extend to 32-bit integers, then convert to float. */
335 int1 = _mm256_cvtepi16_epi32(short1);
336 int2 = _mm256_cvtepi16_epi32(short2);
337 cplxValue1 = _mm256_cvtepi32_ps(int1);
338 cplxValue2 = _mm256_cvtepi32_ps(int2);
/* Normalize by 1/SHRT_MAX. */
340 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
341 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
/* Square every real and imaginary component. */
343 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
344 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/* Horizontal add sums adjacent (re^2, im^2) pairs; within each 128-bit lane
 * the sums from cplxValue1 come first, followed by those from cplxValue2. */
346 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
348 result = _mm256_sqrt_ps(result);
/* Scale back to the int16_t range. */
350 result = _mm256_mul_ps(result, vScalar);
/* Convert to 32-bit integers and narrow to 16 bits with signed saturation. */
352 int1 = _mm256_cvtps_epi32(result);
353 int1 = _mm256_packs_epi32(int1, int1);
/* NOTE(review): the permute arguments (presumably int1 and idx) and the
 * store of short1 to magnitudeVectorPtr are not visible in this excerpt. */
354 int1 = _mm256_permutevar8x32_epi32(
356 short1 = _mm256_extracti128_si256(int1, 0);
358 magnitudeVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 points. */
361 number = eighthPoints * 8;
362 magnitudeVectorPtr = &magnitudeVector[number];
363 complexVectorPtr = (
const int16_t*)&complexVector[number];
364 for (; number < num_points; number++) {
365 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
366 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
367 const float val1Result =
368 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
/* NOTE(review): unlike the saturating pack in the vector loop, this cast does
 * not clamp; a magnitude above SHRT_MAX is converted directly. */
369 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
374 #ifdef LV_HAVE_NEONV7
375 #include <arm_neon.h>
/*
 * NEON kernel: for each complex 16-bit input point, writes a 16-bit magnitude
 * to magnitudeVector.
 *
 * magnitudeVector - output, one int16_t per complex point.
 * complexVector   - input complex points (its parameter line is not part of
 *                   this excerpt).
 * num_points      - number of complex points to process.
 *
 * Processes 4 points per vector iteration, then a scalar tail.
 * NOTE(review): the declarations of c_vec, mag_vec and mag_vec_squared, the
 * statement that computes mag_vec_squared, and the closing braces are missing
 * from this excerpt.
 */
378 static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
380 unsigned int num_points)
382 unsigned int number = 0;
383 unsigned int quarter_points = num_points / 4;
/* Inputs are scaled by 1/SHRT_MAX and the result is scaled back by SHRT_MAX. */
385 const float scalar = SHRT_MAX;
386 const float inv_scalar = 1.0f / scalar;
388 int16_t* magnitudeVectorPtr = magnitudeVector;
389 const lv_16sc_t* complexVectorPtr = complexVector;
394 for (number = 0; number < quarter_points; number++) {
/* De-interleaving load: val[0] receives the first element of each pair,
 * val[1] the second, for 4 complex points. */
395 const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
/* Widen to 32-bit integers and convert to float. */
397 c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
398 c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
/* Normalize by 1/SHRT_MAX. */
400 c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
401 c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
/* Magnitude obtained as x * _vinvsqrtq_f32(x), with x = mag_vec_squared.
 * NOTE(review): _vinvsqrtq_f32 is presumably a reciprocal square root; if it
 * yields inf/NaN for x == 0 this product is NaN - confirm how a zero-magnitude
 * input is handled. */
404 mag_vec = vmulq_f32(mag_vec_squared,
_vinvsqrtq_f32(mag_vec_squared));
/* Scale back to the int16_t range. */
406 mag_vec = vmulq_n_f32(mag_vec, scalar);
/* Rounding is done by adding 0.5 before the float-to-int conversion; the
 * scalar tail below uses rintf instead. */
409 mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
/* vmovn_s32 narrows to 16 bits without saturation. */
410 const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
411 vst1_s16(magnitudeVectorPtr, mag16_vec);
/* Advance by 4 outputs and 4 complex inputs. */
413 magnitudeVectorPtr += 4;
414 complexVectorPtr += 4;
/* Scalar tail for the remaining num_points % 4 points.
 * NOTE(review): no advance of complexVectorPtr is visible in this excerpt of
 * the loop body - confirm it is present in the full source. */
418 for (number = quarter_points * 4; number < num_points; number++) {
419 const float real =
lv_creal(*complexVectorPtr) * inv_scalar;
420 const float imag =
lv_cimag(*complexVectorPtr) * inv_scalar;
421 *magnitudeVectorPtr =
422 (int16_t)
rintf(sqrtf((real * real) + (imag * imag)) * scalar);
424 magnitudeVectorPtr++;
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:264
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:189
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:119
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
short complex lv_16sc_t
Definition: volk_complex.h:71
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:83
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73