Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
56 #define INCLUDED_volk_32fc_conjugate_32fc_u_H
57 
58 #include <float.h>
59 #include <inttypes.h>
60 #include <stdio.h>
61 #include <volk/volk_complex.h>
62 
63 #ifdef LV_HAVE_AVX
64 #include <immintrin.h>
65 
66 static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
67  const lv_32fc_t* aVector,
68  unsigned int num_points)
69 {
70  unsigned int number = 0;
71  const unsigned int quarterPoints = num_points / 4;
72 
73  __m256 x;
74  lv_32fc_t* c = cVector;
75  const lv_32fc_t* a = aVector;
76 
77  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
78 
79  for (; number < quarterPoints; number++) {
80 
81  x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
82 
83  x = _mm256_xor_ps(x, conjugator); // conjugate register
84 
85  _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
86 
87  a += 4;
88  c += 4;
89  }
90 
91  number = quarterPoints * 4;
92 
93  for (; number < num_points; number++) {
94  *c++ = lv_conj(*a++);
95  }
96 }
97 #endif /* LV_HAVE_AVX */
98 
99 #ifdef LV_HAVE_SSE3
100 #include <pmmintrin.h>
101 
102 static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
103  const lv_32fc_t* aVector,
104  unsigned int num_points)
105 {
106  unsigned int number = 0;
107  const unsigned int halfPoints = num_points / 2;
108 
109  __m128 x;
110  lv_32fc_t* c = cVector;
111  const lv_32fc_t* a = aVector;
112 
113  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
114 
115  for (; number < halfPoints; number++) {
116 
117  x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
118 
119  x = _mm_xor_ps(x, conjugator); // conjugate register
120 
121  _mm_storeu_ps((float*)c, x); // Store the results back into the C container
122 
123  a += 2;
124  c += 2;
125  }
126 
127  if ((num_points % 2) != 0) {
128  *c = lv_conj(*a);
129  }
130 }
131 #endif /* LV_HAVE_SSE3 */
132 
133 #ifdef LV_HAVE_GENERIC
134 
135 static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
136  const lv_32fc_t* aVector,
137  unsigned int num_points)
138 {
139  lv_32fc_t* cPtr = cVector;
140  const lv_32fc_t* aPtr = aVector;
141  unsigned int number = 0;
142 
143  for (number = 0; number < num_points; number++) {
144  *cPtr++ = lv_conj(*aPtr++);
145  }
146 }
147 #endif /* LV_HAVE_GENERIC */
148 
149 
150 #endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
151 #ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
152 #define INCLUDED_volk_32fc_conjugate_32fc_a_H
153 
154 #include <float.h>
155 #include <inttypes.h>
156 #include <stdio.h>
157 #include <volk/volk_complex.h>
158 
159 #ifdef LV_HAVE_AVX
160 #include <immintrin.h>
161 
162 static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
163  const lv_32fc_t* aVector,
164  unsigned int num_points)
165 {
166  unsigned int number = 0;
167  const unsigned int quarterPoints = num_points / 4;
168 
169  __m256 x;
170  lv_32fc_t* c = cVector;
171  const lv_32fc_t* a = aVector;
172 
173  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
174 
175  for (; number < quarterPoints; number++) {
176 
177  x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
178 
179  x = _mm256_xor_ps(x, conjugator); // conjugate register
180 
181  _mm256_store_ps((float*)c, x); // Store the results back into the C container
182 
183  a += 4;
184  c += 4;
185  }
186 
187  number = quarterPoints * 4;
188 
189  for (; number < num_points; number++) {
190  *c++ = lv_conj(*a++);
191  }
192 }
193 #endif /* LV_HAVE_AVX */
194 
195 #ifdef LV_HAVE_SSE3
196 #include <pmmintrin.h>
197 
198 static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
199  const lv_32fc_t* aVector,
200  unsigned int num_points)
201 {
202  unsigned int number = 0;
203  const unsigned int halfPoints = num_points / 2;
204 
205  __m128 x;
206  lv_32fc_t* c = cVector;
207  const lv_32fc_t* a = aVector;
208 
209  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
210 
211  for (; number < halfPoints; number++) {
212 
213  x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
214 
215  x = _mm_xor_ps(x, conjugator); // conjugate register
216 
217  _mm_store_ps((float*)c, x); // Store the results back into the C container
218 
219  a += 2;
220  c += 2;
221  }
222 
223  if ((num_points % 2) != 0) {
224  *c = lv_conj(*a);
225  }
226 }
227 #endif /* LV_HAVE_SSE3 */
228 
229 #ifdef LV_HAVE_NEON
230 #include <arm_neon.h>
231 
232 static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
233  const lv_32fc_t* aVector,
234  unsigned int num_points)
235 {
236  unsigned int number;
237  const unsigned int quarterPoints = num_points / 4;
238 
239  float32x4x2_t x;
240  lv_32fc_t* c = cVector;
241  const lv_32fc_t* a = aVector;
242 
243  for (number = 0; number < quarterPoints; number++) {
244  __VOLK_PREFETCH(a + 4);
245  x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
246 
247  // xor the imaginary lane
248  x.val[1] = vnegq_f32(x.val[1]);
249 
250  vst2q_f32((float*)c, x); // Store the results back into the C container
251 
252  a += 4;
253  c += 4;
254  }
255 
256  for (number = quarterPoints * 4; number < num_points; number++) {
257  *c++ = lv_conj(*a++);
258  }
259 }
260 #endif /* LV_HAVE_NEON */
261 
262 
263 #ifdef LV_HAVE_GENERIC
264 
265 static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector,
266  const lv_32fc_t* aVector,
267  unsigned int num_points)
268 {
269  lv_32fc_t* cPtr = cVector;
270  const lv_32fc_t* aPtr = aVector;
271  unsigned int number = 0;
272 
273  for (number = 0; number < num_points; number++) {
274  *cPtr++ = lv_conj(*aPtr++);
275  }
276 }
277 #endif /* LV_HAVE_GENERIC */
278 
279 
280 #endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2958
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2523
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:162
static void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:102
static void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:198
static void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:232
static void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:66
static void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:265
static void volk_32fc_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, unsigned int num_points)
Definition: volk_32fc_conjugate_32fc.h:135
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_conj(x)
Definition: volk_complex.h:100
float complex lv_32fc_t
Definition: volk_complex.h:74