Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
42 #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
50 static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
51  int16_t* qBuffer,
52  const lv_8sc_t* complexVector,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const int8_t* complexVectorPtr = (int8_t*)complexVector;
57  int16_t* iBufferPtr = iBuffer;
58  int16_t* qBufferPtr = qBuffer;
59  __m256i MoveMask = _mm256_set_epi8(15,
60  13,
61  11,
62  9,
63  7,
64  5,
65  3,
66  1,
67  14,
68  12,
69  10,
70  8,
71  6,
72  4,
73  2,
74  0,
75  15,
76  13,
77  11,
78  9,
79  7,
80  5,
81  3,
82  1,
83  14,
84  12,
85  10,
86  8,
87  6,
88  4,
89  2,
90  0);
91  __m256i complexVal, iOutputVal, qOutputVal;
92  __m128i iOutputVal0, qOutputVal0;
93 
94  unsigned int sixteenthPoints = num_points / 16;
95 
96  for (number = 0; number < sixteenthPoints; number++) {
97  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
98  complexVectorPtr += 32;
99 
100  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
101  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
102 
103  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
104  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
105 
106  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
107  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
108 
109  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
110  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
111 
112  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
113  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
114 
115  iBufferPtr += 16;
116  qBufferPtr += 16;
117  }
118 
119  number = sixteenthPoints * 16;
120  for (; number < num_points; number++) {
121  *iBufferPtr++ =
122  ((int16_t)*complexVectorPtr++) *
123  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
124  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
125  }
126 }
127 #endif /* LV_HAVE_AVX2 */
128 
129 #ifdef LV_HAVE_SSE4_1
130 #include <smmintrin.h>
131 
132 static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
133  int16_t* qBuffer,
134  const lv_8sc_t* complexVector,
135  unsigned int num_points)
136 {
137  unsigned int number = 0;
138  const int8_t* complexVectorPtr = (int8_t*)complexVector;
139  int16_t* iBufferPtr = iBuffer;
140  int16_t* qBufferPtr = qBuffer;
141  __m128i iMoveMask = _mm_set_epi8(0x80,
142  0x80,
143  0x80,
144  0x80,
145  0x80,
146  0x80,
147  0x80,
148  0x80,
149  14,
150  12,
151  10,
152  8,
153  6,
154  4,
155  2,
156  0); // set 16 byte values
157  __m128i qMoveMask = _mm_set_epi8(
158  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
159  __m128i complexVal, iOutputVal, qOutputVal;
160 
161  unsigned int eighthPoints = num_points / 8;
162 
163  for (number = 0; number < eighthPoints; number++) {
164  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
165  complexVectorPtr += 16; // aligned load
166 
167  iOutputVal = _mm_shuffle_epi8(complexVal,
168  iMoveMask); // shuffle 16 bytes of 128bit complexVal
169  qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
170 
171  iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
172  // of lower 8 bytes of input to output
173  iOutputVal =
174  _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
175  // 16-bit integers, shift in with zeros
176 
177  qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
178  qOutputVal = _mm_slli_epi16(qOutputVal, 8);
179 
180  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
181  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
182 
183  iBufferPtr += 8;
184  qBufferPtr += 8;
185  }
186 
187  number = eighthPoints * 8;
188  for (; number < num_points; number++) {
189  *iBufferPtr++ =
190  ((int16_t)*complexVectorPtr++) *
191  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
192  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
193  }
194 }
195 #endif /* LV_HAVE_SSE4_1 */
196 
197 
198 #ifdef LV_HAVE_AVX
199 #include <immintrin.h>
200 
201 static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
202  int16_t* qBuffer,
203  const lv_8sc_t* complexVector,
204  unsigned int num_points)
205 {
206  unsigned int number = 0;
207  const int8_t* complexVectorPtr = (int8_t*)complexVector;
208  int16_t* iBufferPtr = iBuffer;
209  int16_t* qBufferPtr = qBuffer;
210  __m128i iMoveMask = _mm_set_epi8(0x80,
211  0x80,
212  0x80,
213  0x80,
214  0x80,
215  0x80,
216  0x80,
217  0x80,
218  14,
219  12,
220  10,
221  8,
222  6,
223  4,
224  2,
225  0); // set 16 byte values
226  __m128i qMoveMask = _mm_set_epi8(
227  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
228  __m256i complexVal, iOutputVal, qOutputVal;
229  __m128i complexVal1, complexVal0;
230  __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
231 
232  unsigned int sixteenthPoints = num_points / 16;
233 
234  for (number = 0; number < sixteenthPoints; number++) {
235  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
236  complexVectorPtr += 32; // aligned load
237 
238  // Extract from complexVal to iOutputVal and qOutputVal
239  complexVal1 = _mm256_extractf128_si256(complexVal, 1);
240  complexVal0 = _mm256_extractf128_si256(complexVal, 0);
241 
242  iOutputVal1 = _mm_shuffle_epi8(
243  complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
244  iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
245  qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
246  qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
247 
248  iOutputVal1 =
249  _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
250  // lower 8 bytes of input to output
251  iOutputVal1 =
252  _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
253  // 16-bit integers, shift in with zeros
254  iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
255  iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
256 
257  qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
258  qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
259  qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
260  qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
261 
262  // Pack iOutputVal0,1 to iOutputVal
263  __m256i dummy = _mm256_setzero_si256();
264  iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
265  iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
266  qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
267  qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
268 
269  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
270  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
271 
272  iBufferPtr += 16;
273  qBufferPtr += 16;
274  }
275 
276  number = sixteenthPoints * 16;
277  for (; number < num_points; number++) {
278  *iBufferPtr++ =
279  ((int16_t)*complexVectorPtr++) *
280  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
281  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
282  }
283 }
284 #endif /* LV_HAVE_AVX */
285 
286 
287 #ifdef LV_HAVE_GENERIC
288 
289 static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
290  int16_t* qBuffer,
291  const lv_8sc_t* complexVector,
292  unsigned int num_points)
293 {
294  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
295  int16_t* iBufferPtr = iBuffer;
296  int16_t* qBufferPtr = qBuffer;
297  unsigned int number;
298  for (number = 0; number < num_points; number++) {
299  *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
300  *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
301  }
302 }
303 #endif /* LV_HAVE_GENERIC */
304 
305 
306 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
307 
308 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
309 #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
310 
311 #include <inttypes.h>
312 #include <stdio.h>
313 
314 #ifdef LV_HAVE_AVX2
315 #include <immintrin.h>
316 
317 static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
318  int16_t* qBuffer,
319  const lv_8sc_t* complexVector,
320  unsigned int num_points)
321 {
322  unsigned int number = 0;
323  const int8_t* complexVectorPtr = (int8_t*)complexVector;
324  int16_t* iBufferPtr = iBuffer;
325  int16_t* qBufferPtr = qBuffer;
326  __m256i MoveMask = _mm256_set_epi8(15,
327  13,
328  11,
329  9,
330  7,
331  5,
332  3,
333  1,
334  14,
335  12,
336  10,
337  8,
338  6,
339  4,
340  2,
341  0,
342  15,
343  13,
344  11,
345  9,
346  7,
347  5,
348  3,
349  1,
350  14,
351  12,
352  10,
353  8,
354  6,
355  4,
356  2,
357  0);
358  __m256i complexVal, iOutputVal, qOutputVal;
359  __m128i iOutputVal0, qOutputVal0;
360 
361  unsigned int sixteenthPoints = num_points / 16;
362 
363  for (number = 0; number < sixteenthPoints; number++) {
364  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
365  complexVectorPtr += 32;
366 
367  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
368  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
369 
370  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
371  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
372 
373  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
374  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
375 
376  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
377  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
378 
379  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
380  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
381 
382  iBufferPtr += 16;
383  qBufferPtr += 16;
384  }
385 
386  number = sixteenthPoints * 16;
387  for (; number < num_points; number++) {
388  *iBufferPtr++ =
389  ((int16_t)*complexVectorPtr++) *
390  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
391  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
392  }
393 }
394 #endif /* LV_HAVE_AVX2 */
395 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5544
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:289
static void volk_8ic_deinterleave_16i_x2_a_avx(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:201
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70