Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
42 #define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
49 static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
50  int16_t* qBuffer,
51  const lv_16sc_t* complexVector,
52  unsigned int num_points)
53 {
54  unsigned int number = 0;
55  const int8_t* complexVectorPtr = (int8_t*)complexVector;
56  int16_t* iBufferPtr = iBuffer;
57  int16_t* qBufferPtr = qBuffer;
58 
59  __m256i MoveMask = _mm256_set_epi8(15,
60  14,
61  11,
62  10,
63  7,
64  6,
65  3,
66  2,
67  13,
68  12,
69  9,
70  8,
71  5,
72  4,
73  1,
74  0,
75  15,
76  14,
77  11,
78  10,
79  7,
80  6,
81  3,
82  2,
83  13,
84  12,
85  9,
86  8,
87  5,
88  4,
89  1,
90  0);
91 
92  __m256i iMove2, iMove1;
93  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
94 
95  unsigned int sixteenthPoints = num_points / 16;
96 
97  for (number = 0; number < sixteenthPoints; number++) {
98  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
99  complexVectorPtr += 32;
100  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
101  complexVectorPtr += 32;
102 
103  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
104  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
105 
106  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
107  _mm256_permute4x64_epi64(iMove2, 0x80),
108  0x30);
109  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
110  _mm256_permute4x64_epi64(iMove2, 0xd0),
111  0x30);
112 
113  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
114  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
115 
116  iBufferPtr += 16;
117  qBufferPtr += 16;
118  }
119 
120  number = sixteenthPoints * 16;
121  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
122  for (; number < num_points; number++) {
123  *iBufferPtr++ = *int16ComplexVectorPtr++;
124  *qBufferPtr++ = *int16ComplexVectorPtr++;
125  }
126 }
127 #endif /* LV_HAVE_AVX2 */
128 
129 #ifdef LV_HAVE_SSSE3
130 #include <tmmintrin.h>
131 
132 static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
133  int16_t* qBuffer,
134  const lv_16sc_t* complexVector,
135  unsigned int num_points)
136 {
137  unsigned int number = 0;
138  const int8_t* complexVectorPtr = (int8_t*)complexVector;
139  int16_t* iBufferPtr = iBuffer;
140  int16_t* qBufferPtr = qBuffer;
141 
142  __m128i iMoveMask1 = _mm_set_epi8(
143  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
144  __m128i iMoveMask2 = _mm_set_epi8(
145  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
146 
147  __m128i qMoveMask1 = _mm_set_epi8(
148  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
149  __m128i qMoveMask2 = _mm_set_epi8(
150  15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
151 
152  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
153 
154  unsigned int eighthPoints = num_points / 8;
155 
156  for (number = 0; number < eighthPoints; number++) {
157  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
158  complexVectorPtr += 16;
159  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
160  complexVectorPtr += 16;
161 
162  iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
163  _mm_shuffle_epi8(complexVal2, iMoveMask2));
164  qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
165  _mm_shuffle_epi8(complexVal2, qMoveMask2));
166 
167  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
168  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
169 
170  iBufferPtr += 8;
171  qBufferPtr += 8;
172  }
173 
174  number = eighthPoints * 8;
175  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
176  for (; number < num_points; number++) {
177  *iBufferPtr++ = *int16ComplexVectorPtr++;
178  *qBufferPtr++ = *int16ComplexVectorPtr++;
179  }
180 }
181 #endif /* LV_HAVE_SSSE3 */
182 
183 #ifdef LV_HAVE_SSE2
184 #include <emmintrin.h>
185 
186 static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
187  int16_t* qBuffer,
188  const lv_16sc_t* complexVector,
189  unsigned int num_points)
190 {
191  unsigned int number = 0;
192  const int16_t* complexVectorPtr = (int16_t*)complexVector;
193  int16_t* iBufferPtr = iBuffer;
194  int16_t* qBufferPtr = qBuffer;
195  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
196  qComplexVal2, iOutputVal, qOutputVal;
197  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
198  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
199 
200  unsigned int eighthPoints = num_points / 8;
201 
202  for (number = 0; number < eighthPoints; number++) {
203  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
204  complexVectorPtr += 8;
205  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
206  complexVectorPtr += 8;
207 
208  iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
209 
210  iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
211 
212  iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
213 
214  iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
215 
216  iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
217 
218  iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
219 
220  iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
221  _mm_and_si128(iComplexVal2, highMask));
222 
223  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
224 
225  qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
226 
227  qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
228 
229  qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
230 
231  qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
232 
233  qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
234 
235  qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
236 
237  qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
238  _mm_and_si128(qComplexVal2, highMask));
239 
240  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
241 
242  iBufferPtr += 8;
243  qBufferPtr += 8;
244  }
245 
246  number = eighthPoints * 8;
247  for (; number < num_points; number++) {
248  *iBufferPtr++ = *complexVectorPtr++;
249  *qBufferPtr++ = *complexVectorPtr++;
250  }
251 }
252 #endif /* LV_HAVE_SSE2 */
253 
254 #ifdef LV_HAVE_GENERIC
255 
256 static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
257  int16_t* qBuffer,
258  const lv_16sc_t* complexVector,
259  unsigned int num_points)
260 {
261  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
262  int16_t* iBufferPtr = iBuffer;
263  int16_t* qBufferPtr = qBuffer;
264  unsigned int number;
265  for (number = 0; number < num_points; number++) {
266  *iBufferPtr++ = *complexVectorPtr++;
267  *qBufferPtr++ = *complexVectorPtr++;
268  }
269 }
270 #endif /* LV_HAVE_GENERIC */
271 
272 #ifdef LV_HAVE_ORC
273 
274 extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
275  int16_t* qBuffer,
276  const lv_16sc_t* complexVector,
277  unsigned int num_points);
278 static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
279  int16_t* qBuffer,
280  const lv_16sc_t* complexVector,
281  unsigned int num_points)
282 {
283  volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
284 }
285 #endif /* LV_HAVE_ORC */
286 
287 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
288 
289 
290 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
291 #define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
292 
293 #include <inttypes.h>
294 #include <stdio.h>
295 #ifdef LV_HAVE_AVX2
296 #include <immintrin.h>
297 
298 static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
299  int16_t* qBuffer,
300  const lv_16sc_t* complexVector,
301  unsigned int num_points)
302 {
303  unsigned int number = 0;
304  const int8_t* complexVectorPtr = (int8_t*)complexVector;
305  int16_t* iBufferPtr = iBuffer;
306  int16_t* qBufferPtr = qBuffer;
307 
308  __m256i MoveMask = _mm256_set_epi8(15,
309  14,
310  11,
311  10,
312  7,
313  6,
314  3,
315  2,
316  13,
317  12,
318  9,
319  8,
320  5,
321  4,
322  1,
323  0,
324  15,
325  14,
326  11,
327  10,
328  7,
329  6,
330  3,
331  2,
332  13,
333  12,
334  9,
335  8,
336  5,
337  4,
338  1,
339  0);
340 
341  __m256i iMove2, iMove1;
342  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
343 
344  unsigned int sixteenthPoints = num_points / 16;
345 
346  for (number = 0; number < sixteenthPoints; number++) {
347  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
348  complexVectorPtr += 32;
349  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
350  complexVectorPtr += 32;
351 
352  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
353  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
354 
355  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
356  _mm256_permute4x64_epi64(iMove2, 0x80),
357  0x30);
358  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
359  _mm256_permute4x64_epi64(iMove2, 0xd0),
360  0x30);
361 
362  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
363  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
364 
365  iBufferPtr += 16;
366  qBufferPtr += 16;
367  }
368 
369  number = sixteenthPoints * 16;
370  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
371  for (; number < num_points; number++) {
372  *iBufferPtr++ = *int16ComplexVectorPtr++;
373  *qBufferPtr++ = *int16ComplexVectorPtr++;
374  }
375 }
376 #endif /* LV_HAVE_AVX2 */
377 
378 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int)
Definition: sse2neon.h:5115
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
#define _mm_shufflelo_epi16(a, imm)
Definition: sse2neon.h:5459
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
#define _mm_shufflehi_epi16(a, imm)
Definition: sse2neon.h:5444
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:256
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:186
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:132
short complex lv_16sc_t
Definition: volk_complex.h:71