Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_magnitude_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
43 #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
44 
45 #include <inttypes.h>
46 #include <math.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
54  const lv_16sc_t* complexVector,
55  const float scalar,
56  unsigned int num_points)
57 {
58  unsigned int number = 0;
59  const unsigned int eighthPoints = num_points / 8;
60 
61  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
62  float* magnitudeVectorPtr = magnitudeVector;
63 
64  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
65 
66  __m256 cplxValue1, cplxValue2, result;
67  __m256i int1, int2;
68  __m128i short1, short2;
69  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
70 
71  for (; number < eighthPoints; number++) {
72 
73  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
74  complexVectorPtr += 16;
75  short1 = _mm256_extracti128_si256(int1, 0);
76  short2 = _mm256_extracti128_si256(int1, 1);
77 
78  int1 = _mm256_cvtepi16_epi32(short1);
79  int2 = _mm256_cvtepi16_epi32(short2);
80  cplxValue1 = _mm256_cvtepi32_ps(int1);
81  cplxValue2 = _mm256_cvtepi32_ps(int2);
82 
83  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
84  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
85 
86  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
87  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
88 
89  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
90  result = _mm256_permutevar8x32_ps(result, idx);
91 
92  result = _mm256_sqrt_ps(result); // Square root the values
93 
94  _mm256_store_ps(magnitudeVectorPtr, result);
95 
96  magnitudeVectorPtr += 8;
97  }
98 
99  number = eighthPoints * 8;
100  magnitudeVectorPtr = &magnitudeVector[number];
101  complexVectorPtr = (const int16_t*)&complexVector[number];
102  for (; number < num_points; number++) {
103  float val1Real = (float)(*complexVectorPtr++) / scalar;
104  float val1Imag = (float)(*complexVectorPtr++) / scalar;
105  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
106  }
107 }
108 #endif /* LV_HAVE_AVX2 */
109 
110 
111 #ifdef LV_HAVE_SSE3
112 #include <pmmintrin.h>
113 
114 static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
115  const lv_16sc_t* complexVector,
116  const float scalar,
117  unsigned int num_points)
118 {
119  unsigned int number = 0;
120  const unsigned int quarterPoints = num_points / 4;
121 
122  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
123  float* magnitudeVectorPtr = magnitudeVector;
124 
125  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
126 
127  __m128 cplxValue1, cplxValue2, result;
128 
129  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
130 
131  for (; number < quarterPoints; number++) {
132 
133  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
134  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
135  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
136  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
137 
138  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
139  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
140  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
141  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
142 
143  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
144  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
145 
146  complexVectorPtr += 8;
147 
148  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
149  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
150 
151  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
152  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
153 
154  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
155 
156  result = _mm_sqrt_ps(result); // Square root the values
157 
158  _mm_store_ps(magnitudeVectorPtr, result);
159 
160  magnitudeVectorPtr += 4;
161  }
162 
163  number = quarterPoints * 4;
164  magnitudeVectorPtr = &magnitudeVector[number];
165  complexVectorPtr = (const int16_t*)&complexVector[number];
166  for (; number < num_points; number++) {
167  float val1Real = (float)(*complexVectorPtr++) / scalar;
168  float val1Imag = (float)(*complexVectorPtr++) / scalar;
169  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
170  }
171 }
172 #endif /* LV_HAVE_SSE3 */
173 
174 #ifdef LV_HAVE_SSE
175 #include <xmmintrin.h>
176 
177 static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
178  const lv_16sc_t* complexVector,
179  const float scalar,
180  unsigned int num_points)
181 {
182  unsigned int number = 0;
183  const unsigned int quarterPoints = num_points / 4;
184 
185  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
186  float* magnitudeVectorPtr = magnitudeVector;
187 
188  const float iScalar = 1.0 / scalar;
189  __m128 invScalar = _mm_set_ps1(iScalar);
190 
191  __m128 cplxValue1, cplxValue2, result, re, im;
192 
193  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
194 
195  for (; number < quarterPoints; number++) {
196  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
197  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
198  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
199  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
200 
201  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
202  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
203  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
204  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
205 
206  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
207  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
208 
209  re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
210  im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
211 
212  complexVectorPtr += 8;
213 
214  cplxValue1 = _mm_mul_ps(re, invScalar);
215  cplxValue2 = _mm_mul_ps(im, invScalar);
216 
217  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
218  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
219 
220  result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
221 
222  result = _mm_sqrt_ps(result); // Square root the values
223 
224  _mm_store_ps(magnitudeVectorPtr, result);
225 
226  magnitudeVectorPtr += 4;
227  }
228 
229  number = quarterPoints * 4;
230  magnitudeVectorPtr = &magnitudeVector[number];
231  complexVectorPtr = (const int16_t*)&complexVector[number];
232  for (; number < num_points; number++) {
233  float val1Real = (float)(*complexVectorPtr++) * iScalar;
234  float val1Imag = (float)(*complexVectorPtr++) * iScalar;
235  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
236  }
237 }
238 
239 
240 #endif /* LV_HAVE_SSE */
241 
242 #ifdef LV_HAVE_GENERIC
243 
244 static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
245  const lv_16sc_t* complexVector,
246  const float scalar,
247  unsigned int num_points)
248 {
249  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
250  float* magnitudeVectorPtr = magnitudeVector;
251  unsigned int number = 0;
252  const float invScalar = 1.0 / scalar;
253  for (number = 0; number < num_points; number++) {
254  float real = ((float)(*complexVectorPtr++)) * invScalar;
255  float imag = ((float)(*complexVectorPtr++)) * invScalar;
256  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
257  }
258 }
259 #endif /* LV_HAVE_GENERIC */
260 
#ifdef LV_HAVE_ORC_DISABLED

/* Implementation lives outside this header; only its prototype is declared
 * here. */
extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
                                                    const lv_16sc_t* complexVector,
                                                    const float scalar,
                                                    unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all four arguments, unchanged, to
 *        volk_16ic_s32f_magnitude_32f_a_orc_impl.
 *
 * \param magnitudeVector Output buffer, passed through as-is.
 * \param complexVector   Input buffer, passed through as-is.
 * \param scalar          Scale value, passed through as-is.
 * \param num_points      Number of complex samples, passed through as-is.
 */
static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
                                                      const lv_16sc_t* complexVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector,
                                            complexVector,
                                            scalar,
                                            num_points);
}
#endif /* LV_HAVE_ORC_DISABLED */
277 
278 
279 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
280 
281 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
282 #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
283 
284 #include <inttypes.h>
285 #include <math.h>
286 #include <stdio.h>
287 #include <volk/volk_common.h>
288 
289 #ifdef LV_HAVE_AVX2
290 #include <immintrin.h>
291 
292 static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
293  const lv_16sc_t* complexVector,
294  const float scalar,
295  unsigned int num_points)
296 {
297  unsigned int number = 0;
298  const unsigned int eighthPoints = num_points / 8;
299 
300  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
301  float* magnitudeVectorPtr = magnitudeVector;
302 
303  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
304 
305  __m256 cplxValue1, cplxValue2, result;
306  __m256i int1, int2;
307  __m128i short1, short2;
308  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
309 
310  for (; number < eighthPoints; number++) {
311 
312  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
313  complexVectorPtr += 16;
314  short1 = _mm256_extracti128_si256(int1, 0);
315  short2 = _mm256_extracti128_si256(int1, 1);
316 
317  int1 = _mm256_cvtepi16_epi32(short1);
318  int2 = _mm256_cvtepi16_epi32(short2);
319  cplxValue1 = _mm256_cvtepi32_ps(int1);
320  cplxValue2 = _mm256_cvtepi32_ps(int2);
321 
322  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
323  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
324 
325  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
326  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
327 
328  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
329  result = _mm256_permutevar8x32_ps(result, idx);
330 
331  result = _mm256_sqrt_ps(result); // Square root the values
332 
333  _mm256_storeu_ps(magnitudeVectorPtr, result);
334 
335  magnitudeVectorPtr += 8;
336  }
337 
338  number = eighthPoints * 8;
339  magnitudeVectorPtr = &magnitudeVector[number];
340  complexVectorPtr = (const int16_t*)&complexVector[number];
341  for (; number < num_points; number++) {
342  float val1Real = (float)(*complexVectorPtr++) / scalar;
343  float val1Imag = (float)(*complexVectorPtr++) / scalar;
344  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
345  }
346 }
347 #endif /* LV_HAVE_AVX2 */
348 
349 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_16ic_s32f_magnitude_32f_generic(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:244
static void volk_16ic_s32f_magnitude_32f_a_sse(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:177
static void volk_16ic_s32f_magnitude_32f_a_sse3(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:114
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
short complex lv_16sc_t
Definition: volk_complex.h:71