Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_magnitude_squared_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
59 #define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
60 
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64 
65 #ifdef LV_HAVE_AVX
66 #include <immintrin.h>
68 
69 static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
70  const lv_32fc_t* complexVector,
71  unsigned int num_points)
72 {
73  unsigned int number = 0;
74  const unsigned int eighthPoints = num_points / 8;
75 
76  const float* complexVectorPtr = (float*)complexVector;
77  float* magnitudeVectorPtr = magnitudeVector;
78 
79  __m256 cplxValue1, cplxValue2, result;
80 
81  for (; number < eighthPoints; number++) {
82  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
83  cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
84  result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
85  _mm256_storeu_ps(magnitudeVectorPtr, result);
86 
87  complexVectorPtr += 16;
88  magnitudeVectorPtr += 8;
89  }
90 
91  number = eighthPoints * 8;
92  for (; number < num_points; number++) {
93  float val1Real = *complexVectorPtr++;
94  float val1Imag = *complexVectorPtr++;
95  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
96  }
97 }
98 #endif /* LV_HAVE_AVX */
99 
100 
101 #ifdef LV_HAVE_SSE3
102 #include <pmmintrin.h>
104 
105 static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
106  const lv_32fc_t* complexVector,
107  unsigned int num_points)
108 {
109  unsigned int number = 0;
110  const unsigned int quarterPoints = num_points / 4;
111 
112  const float* complexVectorPtr = (float*)complexVector;
113  float* magnitudeVectorPtr = magnitudeVector;
114 
115  __m128 cplxValue1, cplxValue2, result;
116  for (; number < quarterPoints; number++) {
117  cplxValue1 = _mm_loadu_ps(complexVectorPtr);
118  complexVectorPtr += 4;
119 
120  cplxValue2 = _mm_loadu_ps(complexVectorPtr);
121  complexVectorPtr += 4;
122 
123  result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
124  _mm_storeu_ps(magnitudeVectorPtr, result);
125  magnitudeVectorPtr += 4;
126  }
127 
128  number = quarterPoints * 4;
129  for (; number < num_points; number++) {
130  float val1Real = *complexVectorPtr++;
131  float val1Imag = *complexVectorPtr++;
132  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
133  }
134 }
135 #endif /* LV_HAVE_SSE3 */
136 
137 
138 #ifdef LV_HAVE_SSE
140 #include <xmmintrin.h>
141 
142 static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
143  const lv_32fc_t* complexVector,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int quarterPoints = num_points / 4;
148 
149  const float* complexVectorPtr = (float*)complexVector;
150  float* magnitudeVectorPtr = magnitudeVector;
151 
152  __m128 cplxValue1, cplxValue2, result;
153 
154  for (; number < quarterPoints; number++) {
155  cplxValue1 = _mm_loadu_ps(complexVectorPtr);
156  complexVectorPtr += 4;
157 
158  cplxValue2 = _mm_loadu_ps(complexVectorPtr);
159  complexVectorPtr += 4;
160 
161  result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
162  _mm_storeu_ps(magnitudeVectorPtr, result);
163  magnitudeVectorPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  for (; number < num_points; number++) {
168  float val1Real = *complexVectorPtr++;
169  float val1Imag = *complexVectorPtr++;
170  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
171  }
172 }
173 #endif /* LV_HAVE_SSE */
174 
175 
176 #ifdef LV_HAVE_GENERIC
177 
178 static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
179  const lv_32fc_t* complexVector,
180  unsigned int num_points)
181 {
182  const float* complexVectorPtr = (float*)complexVector;
183  float* magnitudeVectorPtr = magnitudeVector;
184  unsigned int number = 0;
185  for (number = 0; number < num_points; number++) {
186  const float real = *complexVectorPtr++;
187  const float imag = *complexVectorPtr++;
188  *magnitudeVectorPtr++ = (real * real) + (imag * imag);
189  }
190 }
191 #endif /* LV_HAVE_GENERIC */
192 
193 
194 #endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
195 #ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
196 #define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
197 
198 #include <inttypes.h>
199 #include <math.h>
200 #include <stdio.h>
201 
202 #ifdef LV_HAVE_AVX
203 #include <immintrin.h>
205 
206 static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
207  const lv_32fc_t* complexVector,
208  unsigned int num_points)
209 {
210  unsigned int number = 0;
211  const unsigned int eighthPoints = num_points / 8;
212 
213  const float* complexVectorPtr = (float*)complexVector;
214  float* magnitudeVectorPtr = magnitudeVector;
215 
216  __m256 cplxValue1, cplxValue2, result;
217  for (; number < eighthPoints; number++) {
218  cplxValue1 = _mm256_load_ps(complexVectorPtr);
219  complexVectorPtr += 8;
220 
221  cplxValue2 = _mm256_load_ps(complexVectorPtr);
222  complexVectorPtr += 8;
223 
224  result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
225  _mm256_store_ps(magnitudeVectorPtr, result);
226  magnitudeVectorPtr += 8;
227  }
228 
229  number = eighthPoints * 8;
230  for (; number < num_points; number++) {
231  float val1Real = *complexVectorPtr++;
232  float val1Imag = *complexVectorPtr++;
233  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
234  }
235 }
236 #endif /* LV_HAVE_AVX */
237 
238 
239 #ifdef LV_HAVE_SSE3
240 #include <pmmintrin.h>
242 
243 static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
244  const lv_32fc_t* complexVector,
245  unsigned int num_points)
246 {
247  unsigned int number = 0;
248  const unsigned int quarterPoints = num_points / 4;
249 
250  const float* complexVectorPtr = (float*)complexVector;
251  float* magnitudeVectorPtr = magnitudeVector;
252 
253  __m128 cplxValue1, cplxValue2, result;
254  for (; number < quarterPoints; number++) {
255  cplxValue1 = _mm_load_ps(complexVectorPtr);
256  complexVectorPtr += 4;
257 
258  cplxValue2 = _mm_load_ps(complexVectorPtr);
259  complexVectorPtr += 4;
260 
261  result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
262  _mm_store_ps(magnitudeVectorPtr, result);
263  magnitudeVectorPtr += 4;
264  }
265 
266  number = quarterPoints * 4;
267  for (; number < num_points; number++) {
268  float val1Real = *complexVectorPtr++;
269  float val1Imag = *complexVectorPtr++;
270  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
271  }
272 }
273 #endif /* LV_HAVE_SSE3 */
274 
275 
276 #ifdef LV_HAVE_SSE
278 #include <xmmintrin.h>
279 
280 static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
281  const lv_32fc_t* complexVector,
282  unsigned int num_points)
283 {
284  unsigned int number = 0;
285  const unsigned int quarterPoints = num_points / 4;
286 
287  const float* complexVectorPtr = (float*)complexVector;
288  float* magnitudeVectorPtr = magnitudeVector;
289 
290  __m128 cplxValue1, cplxValue2, result;
291  for (; number < quarterPoints; number++) {
292  cplxValue1 = _mm_load_ps(complexVectorPtr);
293  complexVectorPtr += 4;
294 
295  cplxValue2 = _mm_load_ps(complexVectorPtr);
296  complexVectorPtr += 4;
297 
298  result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
299  _mm_store_ps(magnitudeVectorPtr, result);
300  magnitudeVectorPtr += 4;
301  }
302 
303  number = quarterPoints * 4;
304  for (; number < num_points; number++) {
305  float val1Real = *complexVectorPtr++;
306  float val1Imag = *complexVectorPtr++;
307  *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
308  }
309 }
310 #endif /* LV_HAVE_SSE */
311 
312 
313 #ifdef LV_HAVE_NEON
314 #include <arm_neon.h>
315 
/*
 * Squared magnitude of each complex sample, ARM NEON path.
 *
 * magnitudeVector: output buffer, one float written per input sample.
 * complexVector:   input buffer, read as consecutive (real, imag) float pairs.
 * num_points:      number of complex samples to process.
 */
static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    /* Preserve the const qualifier when viewing the complex input as floats. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;

    /* Each pass consumes 8 floats (4 complex samples) and emits 4 floats. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        /* Two-way de-interleaving load: val[0] and val[1] hold alternating floats. */
        const float32x4x2_t cmplx_val = vld2q_f32(complexVectorPtr);
        complexVectorPtr += 8;

        /* Square each lane separately, then add (kept as mul + add, not fused). */
        const float32x4_t firstSquared = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]);
        const float32x4_t secondSquared = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]);
        const float32x4_t result = vaddq_f32(firstSquared, secondSquared);

        vst1q_f32(magnitudeVectorPtr, result);
        magnitudeVectorPtr += 4;
    }

    /* Scalar tail: the num_points % 4 samples the vector loop did not cover. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        const float val1Real = *complexVectorPtr++;
        const float val1Imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
    }
}
351 #endif /* LV_HAVE_NEON */
352 
353 
354 #ifdef LV_HAVE_GENERIC
355 
357  float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
358 {
359  const float* complexVectorPtr = (float*)complexVector;
360  float* magnitudeVectorPtr = magnitudeVector;
361  unsigned int number = 0;
362  for (number = 0; number < num_points; number++) {
363  const float real = *complexVectorPtr++;
364  const float imag = *complexVectorPtr++;
365  *magnitudeVectorPtr++ = (real * real) + (imag * imag);
366  }
367 }
368 #endif /* LV_HAVE_GENERIC */
369 
370 #endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_magnitude_squared_32f_a_generic(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:356
static void volk_32fc_magnitude_squared_32f_neon(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:316
static void volk_32fc_magnitude_squared_32f_u_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:69
static void volk_32fc_magnitude_squared_32f_a_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:243
static void volk_32fc_magnitude_squared_32f_a_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:280
static void volk_32fc_magnitude_squared_32f_a_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:206
static void volk_32fc_magnitude_squared_32f_u_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:142
static void volk_32fc_magnitude_squared_32f_generic(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:178
static void volk_32fc_magnitude_squared_32f_u_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_squared_32f.h:105
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:60
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:19