Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_atan2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
62 #define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
63 
64 #include <inttypes.h>
65 #include <math.h>
66 #include <stdio.h>
67 
68 #ifdef LV_HAVE_SSE4_1
69 #include <smmintrin.h>
70 
71 #ifdef LV_HAVE_LIB_SIMDMATH
72 #include <simdmath.h>
73 #endif /* LV_HAVE_LIB_SIMDMATH */
74 
75 static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,
76  const lv_32fc_t* complexVector,
77  const float normalizeFactor,
78  unsigned int num_points)
79 {
80  const float* complexVectorPtr = (float*)complexVector;
81  float* outPtr = outputVector;
82 
83  unsigned int number = 0;
84  const float invNormalizeFactor = 1.0 / normalizeFactor;
85 
86 #ifdef LV_HAVE_LIB_SIMDMATH
87  const unsigned int quarterPoints = num_points / 4;
88  __m128 testVector = _mm_set_ps1(2 * M_PI);
89  __m128 correctVector = _mm_set_ps1(M_PI);
90  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
91  __m128 phase;
92  __m128 complex1, complex2, iValue, qValue;
93  __m128 keepMask;
94 
95  for (; number < quarterPoints; number++) {
96  // Load IQ data:
97  complex1 = _mm_load_ps(complexVectorPtr);
98  complexVectorPtr += 4;
99  complex2 = _mm_load_ps(complexVectorPtr);
100  complexVectorPtr += 4;
101  // Deinterleave IQ data:
102  iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
103  qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
104  // Arctan to get phase:
105  phase = atan2f4(qValue, iValue);
106  // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
107  // Compare to 2pi:
108  keepMask = _mm_cmpneq_ps(phase, testVector);
109  phase = _mm_blendv_ps(correctVector, phase, keepMask);
110  // done with above correction.
111  phase = _mm_mul_ps(phase, vNormalizeFactor);
112  _mm_store_ps((float*)outPtr, phase);
113  outPtr += 4;
114  }
115  number = quarterPoints * 4;
116 #endif /* LV_HAVE_LIB_SIMDMATH */
117 
118  for (; number < num_points; number++) {
119  const float real = *complexVectorPtr++;
120  const float imag = *complexVectorPtr++;
121  *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
122  }
123 }
124 #endif /* LV_HAVE_SSE4_1 */
125 
126 
127 #ifdef LV_HAVE_SSE
128 #include <xmmintrin.h>
129 
130 #ifdef LV_HAVE_LIB_SIMDMATH
131 #include <simdmath.h>
132 #endif /* LV_HAVE_LIB_SIMDMATH */
133 
134 static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,
135  const lv_32fc_t* complexVector,
136  const float normalizeFactor,
137  unsigned int num_points)
138 {
139  const float* complexVectorPtr = (float*)complexVector;
140  float* outPtr = outputVector;
141 
142  unsigned int number = 0;
143  const float invNormalizeFactor = 1.0 / normalizeFactor;
144 
145 #ifdef LV_HAVE_LIB_SIMDMATH
146  const unsigned int quarterPoints = num_points / 4;
147  __m128 testVector = _mm_set_ps1(2 * M_PI);
148  __m128 correctVector = _mm_set_ps1(M_PI);
149  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
150  __m128 phase;
151  __m128 complex1, complex2, iValue, qValue;
152  __m128 mask;
153  __m128 keepMask;
154 
155  for (; number < quarterPoints; number++) {
156  // Load IQ data:
157  complex1 = _mm_load_ps(complexVectorPtr);
158  complexVectorPtr += 4;
159  complex2 = _mm_load_ps(complexVectorPtr);
160  complexVectorPtr += 4;
161  // Deinterleave IQ data:
162  iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
163  qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
164  // Arctan to get phase:
165  phase = atan2f4(qValue, iValue);
166  // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
167  // Compare to 2pi:
168  keepMask = _mm_cmpneq_ps(phase, testVector);
169  phase = _mm_and_ps(phase, keepMask);
170  mask = _mm_andnot_ps(keepMask, correctVector);
171  phase = _mm_or_ps(phase, mask);
172  // done with above correction.
173  phase = _mm_mul_ps(phase, vNormalizeFactor);
174  _mm_store_ps((float*)outPtr, phase);
175  outPtr += 4;
176  }
177  number = quarterPoints * 4;
178 #endif /* LV_HAVE_LIB_SIMDMATH */
179 
180  for (; number < num_points; number++) {
181  const float real = *complexVectorPtr++;
182  const float imag = *complexVectorPtr++;
183  *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
184  }
185 }
186 #endif /* LV_HAVE_SSE */
187 
188 #ifdef LV_HAVE_GENERIC
189 
190 static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
191  const lv_32fc_t* inputVector,
192  const float normalizeFactor,
193  unsigned int num_points)
194 {
195  float* outPtr = outputVector;
196  const float* inPtr = (float*)inputVector;
197  const float invNormalizeFactor = 1.0 / normalizeFactor;
198  unsigned int number;
199  for (number = 0; number < num_points; number++) {
200  const float real = *inPtr++;
201  const float imag = *inPtr++;
202  *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
203  }
204 }
205 #endif /* LV_HAVE_GENERIC */
206 
207 
208 #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32fc_s32f_atan2_32f_generic(float *outputVector, const lv_32fc_t *inputVector, const float normalizeFactor, unsigned int num_points)
Definition: volk_32fc_s32f_atan2_32f.h:190
static void volk_32fc_s32f_atan2_32f_a_sse(float *outputVector, const lv_32fc_t *complexVector, const float normalizeFactor, unsigned int num_points)
Definition: volk_32fc_s32f_atan2_32f.h:134
float complex lv_32fc_t
Definition: volk_complex.h:74