Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46 
47 #include <inttypes.h>
48 #include <stdio.h>
49 #include <volk/volk_complex.h>
50 
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53 
54 static inline void
55 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
56  const lv_8sc_t* aVector,
57  const lv_8sc_t* bVector,
58  const float scalar,
59  unsigned int num_points)
60 {
61  unsigned int number = 0;
62  const unsigned int oneEigthPoints = num_points / 8;
63 
64  __m256i x, y, realz, imagz;
65  __m256 ret, retlo, rethi;
66  lv_32fc_t* c = cVector;
67  const lv_8sc_t* a = aVector;
68  const lv_8sc_t* b = bVector;
69  __m256i conjugateSign =
70  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
71 
72  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73 
74  for (; number < oneEigthPoints; number++) {
75  // Convert 8 bit values into 16 bit values
76  x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77  y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
78 
79  // Calculate the ar*cr - ai*(-ci) portions
80  realz = _mm256_madd_epi16(x, y);
81 
82  // Calculate the complex conjugate of the cr + ci j values
83  y = _mm256_sign_epi16(y, conjugateSign);
84 
85  // Shift the order of the cr and ci values
86  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87  _MM_SHUFFLE(2, 3, 0, 1));
88 
89  // Calculate the ar*(-ci) + cr*(ai)
90  imagz = _mm256_madd_epi16(x, y);
91 
92  // Interleave real and imaginary and then convert to float values
93  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
94 
95  // Normalize the floating point values
96  retlo = _mm256_mul_ps(retlo, invScalar);
97 
98  // Interleave real and imaginary and then convert to float values
99  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
100 
101  // Normalize the floating point values
102  rethi = _mm256_mul_ps(rethi, invScalar);
103 
104  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105  _mm256_store_ps((float*)c, ret);
106  c += 4;
107 
108  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109  _mm256_store_ps((float*)c, ret);
110  c += 4;
111 
112  a += 8;
113  b += 8;
114  }
115 
116  number = oneEigthPoints * 8;
117  float* cFloatPtr = (float*)&cVector[number];
118  int8_t* a8Ptr = (int8_t*)&aVector[number];
119  int8_t* b8Ptr = (int8_t*)&bVector[number];
120  for (; number < num_points; number++) {
121  float aReal = (float)*a8Ptr++;
122  float aImag = (float)*a8Ptr++;
123  lv_32fc_t aVal = lv_cmake(aReal, aImag);
124  float bReal = (float)*b8Ptr++;
125  float bImag = (float)*b8Ptr++;
126  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
127  lv_32fc_t temp = aVal * bVal;
128 
129  *cFloatPtr++ = lv_creal(temp) / scalar;
130  *cFloatPtr++ = lv_cimag(temp) / scalar;
131  }
132 }
133 #endif /* LV_HAVE_AVX2*/
134 
135 
136 #ifdef LV_HAVE_SSE4_1
137 #include <smmintrin.h>
138 
139 static inline void
140 volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
141  const lv_8sc_t* aVector,
142  const lv_8sc_t* bVector,
143  const float scalar,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int quarterPoints = num_points / 4;
148 
149  __m128i x, y, realz, imagz;
150  __m128 ret;
151  lv_32fc_t* c = cVector;
152  const lv_8sc_t* a = aVector;
153  const lv_8sc_t* b = bVector;
154  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
155 
156  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157 
158  for (; number < quarterPoints; number++) {
159  // Convert into 8 bit values into 16 bit values
162 
163  // Calculate the ar*cr - ai*(-ci) portions
164  realz = _mm_madd_epi16(x, y);
165 
166  // Calculate the complex conjugate of the cr + ci j values
167  y = _mm_sign_epi16(y, conjugateSign);
168 
169  // Shift the order of the cr and ci values
171  _MM_SHUFFLE(2, 3, 0, 1));
172 
173  // Calculate the ar*(-ci) + cr*(ai)
174  imagz = _mm_madd_epi16(x, y);
175 
176  // Interleave real and imaginary and then convert to float values
177  ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
178 
179  // Normalize the floating point values
180  ret = _mm_mul_ps(ret, invScalar);
181 
182  // Store the floating point values
183  _mm_store_ps((float*)c, ret);
184  c += 2;
185 
186  // Interleave real and imaginary and then convert to float values
187  ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
188 
189  // Normalize the floating point values
190  ret = _mm_mul_ps(ret, invScalar);
191 
192  // Store the floating point values
193  _mm_store_ps((float*)c, ret);
194  c += 2;
195 
196  a += 4;
197  b += 4;
198  }
199 
200  number = quarterPoints * 4;
201  float* cFloatPtr = (float*)&cVector[number];
202  int8_t* a8Ptr = (int8_t*)&aVector[number];
203  int8_t* b8Ptr = (int8_t*)&bVector[number];
204  for (; number < num_points; number++) {
205  float aReal = (float)*a8Ptr++;
206  float aImag = (float)*a8Ptr++;
207  lv_32fc_t aVal = lv_cmake(aReal, aImag);
208  float bReal = (float)*b8Ptr++;
209  float bImag = (float)*b8Ptr++;
210  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
211  lv_32fc_t temp = aVal * bVal;
212 
213  *cFloatPtr++ = lv_creal(temp) / scalar;
214  *cFloatPtr++ = lv_cimag(temp) / scalar;
215  }
216 }
217 #endif /* LV_HAVE_SSE4_1 */
218 
219 
220 #ifdef LV_HAVE_GENERIC
221 
222 static inline void
224  const lv_8sc_t* aVector,
225  const lv_8sc_t* bVector,
226  const float scalar,
227  unsigned int num_points)
228 {
229  unsigned int number = 0;
230  float* cPtr = (float*)cVector;
231  const float invScalar = 1.0 / scalar;
232  int8_t* a8Ptr = (int8_t*)aVector;
233  int8_t* b8Ptr = (int8_t*)bVector;
234  for (number = 0; number < num_points; number++) {
235  float aReal = (float)*a8Ptr++;
236  float aImag = (float)*a8Ptr++;
237  lv_32fc_t aVal = lv_cmake(aReal, aImag);
238  float bReal = (float)*b8Ptr++;
239  float bImag = (float)*b8Ptr++;
240  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241  lv_32fc_t temp = aVal * bVal;
242 
243  *cPtr++ = (lv_creal(temp) * invScalar);
244  *cPtr++ = (lv_cimag(temp) * invScalar);
245  }
246 }
247 #endif /* LV_HAVE_GENERIC */
248 
249 
250 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251 
252 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254 
255 #include <inttypes.h>
256 #include <stdio.h>
257 #include <volk/volk_complex.h>
258 
259 #ifdef LV_HAVE_AVX2
260 #include <immintrin.h>
261 
262 static inline void
263 volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
264  const lv_8sc_t* aVector,
265  const lv_8sc_t* bVector,
266  const float scalar,
267  unsigned int num_points)
268 {
269  unsigned int number = 0;
270  const unsigned int oneEigthPoints = num_points / 8;
271 
272  __m256i x, y, realz, imagz;
273  __m256 ret, retlo, rethi;
274  lv_32fc_t* c = cVector;
275  const lv_8sc_t* a = aVector;
276  const lv_8sc_t* b = bVector;
277  __m256i conjugateSign =
278  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
279 
280  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
281 
282  for (; number < oneEigthPoints; number++) {
283  // Convert 8 bit values into 16 bit values
284  x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
285  y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
286 
287  // Calculate the ar*cr - ai*(-ci) portions
288  realz = _mm256_madd_epi16(x, y);
289 
290  // Calculate the complex conjugate of the cr + ci j values
291  y = _mm256_sign_epi16(y, conjugateSign);
292 
293  // Shift the order of the cr and ci values
294  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
295  _MM_SHUFFLE(2, 3, 0, 1));
296 
297  // Calculate the ar*(-ci) + cr*(ai)
298  imagz = _mm256_madd_epi16(x, y);
299 
300  // Interleave real and imaginary and then convert to float values
301  retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
302 
303  // Normalize the floating point values
304  retlo = _mm256_mul_ps(retlo, invScalar);
305 
306  // Interleave real and imaginary and then convert to float values
307  rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
308 
309  // Normalize the floating point values
310  rethi = _mm256_mul_ps(rethi, invScalar);
311 
312  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
313  _mm256_storeu_ps((float*)c, ret);
314  c += 4;
315 
316  ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
317  _mm256_storeu_ps((float*)c, ret);
318  c += 4;
319 
320  a += 8;
321  b += 8;
322  }
323 
324  number = oneEigthPoints * 8;
325  float* cFloatPtr = (float*)&cVector[number];
326  int8_t* a8Ptr = (int8_t*)&aVector[number];
327  int8_t* b8Ptr = (int8_t*)&bVector[number];
328  for (; number < num_points; number++) {
329  float aReal = (float)*a8Ptr++;
330  float aImag = (float)*a8Ptr++;
331  lv_32fc_t aVal = lv_cmake(aReal, aImag);
332  float bReal = (float)*b8Ptr++;
333  float bImag = (float)*b8Ptr++;
334  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
335  lv_32fc_t temp = aVal * bVal;
336 
337  *cFloatPtr++ = lv_creal(temp) / scalar;
338  *cFloatPtr++ = lv_cimag(temp) / scalar;
339  }
340 }
341 #endif /* LV_HAVE_AVX2*/
342 
343 
344 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6373
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6263
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4595
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
#define _mm_shufflelo_epi16(a, imm)
Definition: sse2neon.h:5459
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition: sse2neon.h:4513
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition: sse2neon.h:5100
#define _mm_shufflehi_epi16(a, imm)
Definition: sse2neon.h:5444
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:7132
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:223
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74