Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifdef LV_HAVE_GENERIC
61 #include <volk/volk_common.h>
62 
63 static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
64  const lv_32fc_t* complexVector,
65  const float scalar,
66  unsigned int num_points)
67 {
68  const float* complexVectorPtr = (float*)complexVector;
69  int16_t* magnitudeVectorPtr = magnitudeVector;
70  unsigned int number = 0;
71  for (number = 0; number < num_points; number++) {
72  __VOLK_VOLATILE float real = *complexVectorPtr++;
73  __VOLK_VOLATILE float imag = *complexVectorPtr++;
74  real *= real;
75  imag *= imag;
76  *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
77  }
78 }
79 #endif /* LV_HAVE_GENERIC */
80 
81 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
82 #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
83 
84 #include <inttypes.h>
85 #include <math.h>
86 #include <stdio.h>
87 #include <volk/volk_common.h>
88 
89 #ifdef LV_HAVE_AVX2
90 #include <immintrin.h>
91 
92 static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
93  const lv_32fc_t* complexVector,
94  const float scalar,
95  unsigned int num_points)
96 {
97  unsigned int number = 0;
98  const unsigned int eighthPoints = num_points / 8;
99 
100  const float* complexVectorPtr = (const float*)complexVector;
101  int16_t* magnitudeVectorPtr = magnitudeVector;
102 
103  __m256 vScalar = _mm256_set1_ps(scalar);
104  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
105  __m256 cplxValue1, cplxValue2, result;
106  __m256i resultInt;
107  __m128i resultShort;
108 
109  for (; number < eighthPoints; number++) {
110  cplxValue1 = _mm256_load_ps(complexVectorPtr);
111  complexVectorPtr += 8;
112 
113  cplxValue2 = _mm256_load_ps(complexVectorPtr);
114  complexVectorPtr += 8;
115 
116  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
117  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
118 
119  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
120 
121  result = _mm256_sqrt_ps(result);
122 
123  result = _mm256_mul_ps(result, vScalar);
124 
125  resultInt = _mm256_cvtps_epi32(result);
126  resultInt = _mm256_packs_epi32(resultInt, resultInt);
127  resultInt = _mm256_permutevar8x32_epi32(
128  resultInt, idx); // permute to compensate for shuffling in hadd and packs
129  resultShort = _mm256_extracti128_si256(resultInt, 0);
130  _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
131  magnitudeVectorPtr += 8;
132  }
133 
134  number = eighthPoints * 8;
136  magnitudeVector + number, complexVector + number, scalar, num_points - number);
137 }
138 #endif /* LV_HAVE_AVX2 */
139 
140 #ifdef LV_HAVE_SSE3
141 #include <pmmintrin.h>
142 
143 static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
144  const lv_32fc_t* complexVector,
145  const float scalar,
146  unsigned int num_points)
147 {
148  unsigned int number = 0;
149  const unsigned int quarterPoints = num_points / 4;
150 
151  const float* complexVectorPtr = (const float*)complexVector;
152  int16_t* magnitudeVectorPtr = magnitudeVector;
153 
154  __m128 vScalar = _mm_set_ps1(scalar);
155 
156  __m128 cplxValue1, cplxValue2, result;
157 
158  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
159 
160  for (; number < quarterPoints; number++) {
161  cplxValue1 = _mm_load_ps(complexVectorPtr);
162  complexVectorPtr += 4;
163 
164  cplxValue2 = _mm_load_ps(complexVectorPtr);
165  complexVectorPtr += 4;
166 
167  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
168  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
169 
170  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
171 
172  result = _mm_sqrt_ps(result);
173 
174  result = _mm_mul_ps(result, vScalar);
175 
176  _mm_store_ps(floatBuffer, result);
177  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
178  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
179  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
180  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
181  }
182 
183  number = quarterPoints * 4;
185  magnitudeVector + number, complexVector + number, scalar, num_points - number);
186 }
187 #endif /* LV_HAVE_SSE3 */
188 
189 
190 #ifdef LV_HAVE_SSE
191 #include <xmmintrin.h>
192 
193 static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
194  const lv_32fc_t* complexVector,
195  const float scalar,
196  unsigned int num_points)
197 {
198  unsigned int number = 0;
199  const unsigned int quarterPoints = num_points / 4;
200 
201  const float* complexVectorPtr = (const float*)complexVector;
202  int16_t* magnitudeVectorPtr = magnitudeVector;
203 
204  __m128 vScalar = _mm_set_ps1(scalar);
205 
206  __m128 cplxValue1, cplxValue2, result;
207  __m128 iValue, qValue;
208 
209  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
210 
211  for (; number < quarterPoints; number++) {
212  cplxValue1 = _mm_load_ps(complexVectorPtr);
213  complexVectorPtr += 4;
214 
215  cplxValue2 = _mm_load_ps(complexVectorPtr);
216  complexVectorPtr += 4;
217 
218  // Arrange in i1i2i3i4 format
219  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
220  // Arrange in q1q2q3q4 format
221  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
222 
223  __VOLK_VOLATILE __m128 iValue2 =
224  _mm_mul_ps(iValue, iValue); // Square the I values
225  __VOLK_VOLATILE __m128 qValue2 =
226  _mm_mul_ps(qValue, qValue); // Square the Q Values
227 
228  result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
229 
230  result = _mm_sqrt_ps(result);
231 
232  result = _mm_mul_ps(result, vScalar);
233 
234  _mm_store_ps(floatBuffer, result);
235  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
236  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
237  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
238  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
239  }
240 
241  number = quarterPoints * 4;
243  magnitudeVector + number, complexVector + number, scalar, num_points - number);
244 }
245 #endif /* LV_HAVE_SSE */
246 
247 
248 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
249 
250 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
251 #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
252 
253 #include <inttypes.h>
254 #include <math.h>
255 #include <stdio.h>
256 #include <volk/volk_common.h>
257 
258 #ifdef LV_HAVE_AVX2
259 #include <immintrin.h>
260 
261 static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
262  const lv_32fc_t* complexVector,
263  const float scalar,
264  unsigned int num_points)
265 {
266  unsigned int number = 0;
267  const unsigned int eighthPoints = num_points / 8;
268 
269  const float* complexVectorPtr = (const float*)complexVector;
270  int16_t* magnitudeVectorPtr = magnitudeVector;
271 
272  __m256 vScalar = _mm256_set1_ps(scalar);
273  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
274  __m256 cplxValue1, cplxValue2, result;
275  __m256i resultInt;
276  __m128i resultShort;
277 
278  for (; number < eighthPoints; number++) {
279  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
280  complexVectorPtr += 8;
281 
282  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
283  complexVectorPtr += 8;
284 
285  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
286  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
287 
288  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
289 
290  result = _mm256_sqrt_ps(result);
291 
292  result = _mm256_mul_ps(result, vScalar);
293 
294  resultInt = _mm256_cvtps_epi32(result);
295  resultInt = _mm256_packs_epi32(resultInt, resultInt);
296  resultInt = _mm256_permutevar8x32_epi32(
297  resultInt, idx); // permute to compensate for shuffling in hadd and packs
298  resultShort = _mm256_extracti128_si256(resultInt, 0);
299  _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
300  magnitudeVectorPtr += 8;
301  }
302 
303  number = eighthPoints * 8;
305  magnitudeVector + number, complexVector + number, scalar, num_points - number);
306 }
307 #endif /* LV_HAVE_AVX2 */
308 
309 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32fc_s32f_magnitude_16i_generic(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:63
static void volk_32fc_s32f_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:193
static void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:143
#define __VOLK_VOLATILE
Definition: volk_common.h:73
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74