Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_multiply_conjugate_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
11 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
12 
13 #include <inttypes.h>
14 #include <stdio.h>
15 #include <volk/volk_complex.h>
16 
17 #ifdef LV_HAVE_AVX2
18 #include <immintrin.h>
27 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
28  const lv_8sc_t* aVector,
29  const lv_8sc_t* bVector,
30  unsigned int num_points)
31 {
32  unsigned int number = 0;
33  const unsigned int quarterPoints = num_points / 8;
34 
35  __m256i x, y, realz, imagz;
36  lv_16sc_t* c = cVector;
37  const lv_8sc_t* a = aVector;
38  const lv_8sc_t* b = bVector;
39  __m256i conjugateSign =
40  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
41 
42  for (; number < quarterPoints; number++) {
43  // Convert 8 bit values into 16 bit values
44  x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
45  y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
46 
47  // Calculate the ar*cr - ai*(-ci) portions
48  realz = _mm256_madd_epi16(x, y);
49 
50  // Calculate the complex conjugate of the cr + ci j values
51  y = _mm256_sign_epi16(y, conjugateSign);
52 
53  // Shift the order of the cr and ci values
54  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
55  _MM_SHUFFLE(2, 3, 0, 1));
56 
57  // Calculate the ar*(-ci) + cr*(ai)
58  imagz = _mm256_madd_epi16(x, y);
59 
60  // Perform the addition of products
61 
62  _mm256_store_si256((__m256i*)c,
63  _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
64  _mm256_unpackhi_epi32(realz, imagz)));
65 
66  a += 8;
67  b += 8;
68  c += 8;
69  }
70 
71  number = quarterPoints * 8;
72  int16_t* c16Ptr = (int16_t*)&cVector[number];
73  int8_t* a8Ptr = (int8_t*)&aVector[number];
74  int8_t* b8Ptr = (int8_t*)&bVector[number];
75  for (; number < num_points; number++) {
76  float aReal = (float)*a8Ptr++;
77  float aImag = (float)*a8Ptr++;
78  lv_32fc_t aVal = lv_cmake(aReal, aImag);
79  float bReal = (float)*b8Ptr++;
80  float bImag = (float)*b8Ptr++;
81  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
82  lv_32fc_t temp = aVal * bVal;
83 
84  *c16Ptr++ = (int16_t)lv_creal(temp);
85  *c16Ptr++ = (int16_t)lv_cimag(temp);
86  }
87 }
88 #endif /* LV_HAVE_AVX2 */
89 
90 
91 #ifdef LV_HAVE_SSE4_1
92 #include <smmintrin.h>
101 static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
102  const lv_8sc_t* aVector,
103  const lv_8sc_t* bVector,
104  unsigned int num_points)
105 {
106  unsigned int number = 0;
107  const unsigned int quarterPoints = num_points / 4;
108 
109  __m128i x, y, realz, imagz;
110  lv_16sc_t* c = cVector;
111  const lv_8sc_t* a = aVector;
112  const lv_8sc_t* b = bVector;
113  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
114 
115  for (; number < quarterPoints; number++) {
116  // Convert into 8 bit values into 16 bit values
119 
120  // Calculate the ar*cr - ai*(-ci) portions
121  realz = _mm_madd_epi16(x, y);
122 
123  // Calculate the complex conjugate of the cr + ci j values
124  y = _mm_sign_epi16(y, conjugateSign);
125 
126  // Shift the order of the cr and ci values
128  _MM_SHUFFLE(2, 3, 0, 1));
129 
130  // Calculate the ar*(-ci) + cr*(ai)
131  imagz = _mm_madd_epi16(x, y);
132 
134  _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
135  _mm_unpackhi_epi32(realz, imagz)));
136 
137  a += 4;
138  b += 4;
139  c += 4;
140  }
141 
142  number = quarterPoints * 4;
143  int16_t* c16Ptr = (int16_t*)&cVector[number];
144  int8_t* a8Ptr = (int8_t*)&aVector[number];
145  int8_t* b8Ptr = (int8_t*)&bVector[number];
146  for (; number < num_points; number++) {
147  float aReal = (float)*a8Ptr++;
148  float aImag = (float)*a8Ptr++;
149  lv_32fc_t aVal = lv_cmake(aReal, aImag);
150  float bReal = (float)*b8Ptr++;
151  float bImag = (float)*b8Ptr++;
152  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
153  lv_32fc_t temp = aVal * bVal;
154 
155  *c16Ptr++ = (int16_t)lv_creal(temp);
156  *c16Ptr++ = (int16_t)lv_cimag(temp);
157  }
158 }
159 #endif /* LV_HAVE_SSE4_1 */
160 
161 #ifdef LV_HAVE_GENERIC
171  const lv_8sc_t* aVector,
172  const lv_8sc_t* bVector,
173  unsigned int num_points)
174 {
175  unsigned int number = 0;
176  int16_t* c16Ptr = (int16_t*)cVector;
177  int8_t* a8Ptr = (int8_t*)aVector;
178  int8_t* b8Ptr = (int8_t*)bVector;
179  for (number = 0; number < num_points; number++) {
180  float aReal = (float)*a8Ptr++;
181  float aImag = (float)*a8Ptr++;
182  lv_32fc_t aVal = lv_cmake(aReal, aImag);
183  float bReal = (float)*b8Ptr++;
184  float bImag = (float)*b8Ptr++;
185  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
186  lv_32fc_t temp = aVal * bVal;
187 
188  *c16Ptr++ = (int16_t)lv_creal(temp);
189  *c16Ptr++ = (int16_t)lv_cimag(temp);
190  }
191 }
192 #endif /* LV_HAVE_GENERIC */
193 
194 #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
195 
196 #ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
197 #define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
198 
199 #include <inttypes.h>
200 #include <stdio.h>
201 #include <volk/volk_complex.h>
202 
203 #ifdef LV_HAVE_AVX2
204 #include <immintrin.h>
213 static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
214  const lv_8sc_t* aVector,
215  const lv_8sc_t* bVector,
216  unsigned int num_points)
217 {
218  unsigned int number = 0;
219  const unsigned int oneEigthPoints = num_points / 8;
220 
221  __m256i x, y, realz, imagz;
222  lv_16sc_t* c = cVector;
223  const lv_8sc_t* a = aVector;
224  const lv_8sc_t* b = bVector;
225  __m256i conjugateSign =
226  _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
227 
228  for (; number < oneEigthPoints; number++) {
229  // Convert 8 bit values into 16 bit values
230  x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
231  y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
232 
233  // Calculate the ar*cr - ai*(-ci) portions
234  realz = _mm256_madd_epi16(x, y);
235 
236  // Calculate the complex conjugate of the cr + ci j values
237  y = _mm256_sign_epi16(y, conjugateSign);
238 
239  // Shift the order of the cr and ci values
240  y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
241  _MM_SHUFFLE(2, 3, 0, 1));
242 
243  // Calculate the ar*(-ci) + cr*(ai)
244  imagz = _mm256_madd_epi16(x, y);
245 
246  // Perform the addition of products
247 
248  _mm256_storeu_si256((__m256i*)c,
249  _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
250  _mm256_unpackhi_epi32(realz, imagz)));
251 
252  a += 8;
253  b += 8;
254  c += 8;
255  }
256 
257  number = oneEigthPoints * 8;
258  int16_t* c16Ptr = (int16_t*)&cVector[number];
259  int8_t* a8Ptr = (int8_t*)&aVector[number];
260  int8_t* b8Ptr = (int8_t*)&bVector[number];
261  for (; number < num_points; number++) {
262  float aReal = (float)*a8Ptr++;
263  float aImag = (float)*a8Ptr++;
264  lv_32fc_t aVal = lv_cmake(aReal, aImag);
265  float bReal = (float)*b8Ptr++;
266  float bImag = (float)*b8Ptr++;
267  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
268  lv_32fc_t temp = aVal * bVal;
269 
270  *c16Ptr++ = (int16_t)lv_creal(temp);
271  *c16Ptr++ = (int16_t)lv_cimag(temp);
272  }
273 }
274 #endif /* LV_HAVE_AVX2 */
275 
276 #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6373
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6263
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4595
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
#define _mm_shufflelo_epi16(a, imm)
Definition: sse2neon.h:5459
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition: sse2neon.h:4513
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition: sse2neon.h:5100
#define _mm_shufflehi_epi16(a, imm)
Definition: sse2neon.h:5444
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:7132
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, unsigned int num_points)
Multiplies one complex vector by the complex conjugate of the second complex vector and stores the results in the third vector.
Definition: volk_8ic_x2_multiply_conjugate_16ic.h:170
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
short complex lv_16sc_t
Definition: volk_complex.h:71