Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 
48 #ifdef LV_HAVE_AVX2
49 #include <immintrin.h>
50 
51 static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
52  const lv_16sc_t* complexVector,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const int16_t* complexVectorPtr = (int16_t*)complexVector;
57  int16_t* iBufferPtr = iBuffer;
58 
59  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  0x80,
65  0x80,
66  0x80,
67  13,
68  12,
69  9,
70  8,
71  5,
72  4,
73  1,
74  0,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  0x80,
81  0x80,
82  0x80,
83  13,
84  12,
85  9,
86  8,
87  5,
88  4,
89  1,
90  0);
91  __m256i iMoveMask2 = _mm256_set_epi8(13,
92  12,
93  9,
94  8,
95  5,
96  4,
97  1,
98  0,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  0x80,
105  0x80,
106  0x80,
107  13,
108  12,
109  9,
110  8,
111  5,
112  4,
113  1,
114  0,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80,
120  0x80,
121  0x80,
122  0x80);
123 
124  __m256i complexVal1, complexVal2, iOutputVal;
125 
126  unsigned int sixteenthPoints = num_points / 16;
127 
128  for (number = 0; number < sixteenthPoints; number++) {
129  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
130  complexVectorPtr += 16;
131  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
132  complexVectorPtr += 16;
133 
134  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
135  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
136 
137  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
138  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
139 
140  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
141 
142  iBufferPtr += 16;
143  }
144 
145  number = sixteenthPoints * 16;
146  for (; number < num_points; number++) {
147  *iBufferPtr++ = *complexVectorPtr++;
148  complexVectorPtr++;
149  }
150 }
151 #endif /* LV_HAVE_AVX2 */
152 
153 #ifdef LV_HAVE_SSSE3
154 #include <tmmintrin.h>
155 
156 static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
157  const lv_16sc_t* complexVector,
158  unsigned int num_points)
159 {
160  unsigned int number = 0;
161  const int16_t* complexVectorPtr = (int16_t*)complexVector;
162  int16_t* iBufferPtr = iBuffer;
163 
164  __m128i iMoveMask1 = _mm_set_epi8(
165  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
166  __m128i iMoveMask2 = _mm_set_epi8(
167  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
168 
169  __m128i complexVal1, complexVal2, iOutputVal;
170 
171  unsigned int eighthPoints = num_points / 8;
172 
173  for (number = 0; number < eighthPoints; number++) {
174  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
175  complexVectorPtr += 8;
176  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
177  complexVectorPtr += 8;
178 
179  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
180  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
181 
182  iOutputVal = _mm_or_si128(complexVal1, complexVal2);
183 
184  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
185 
186  iBufferPtr += 8;
187  }
188 
189  number = eighthPoints * 8;
190  for (; number < num_points; number++) {
191  *iBufferPtr++ = *complexVectorPtr++;
192  complexVectorPtr++;
193  }
194 }
195 #endif /* LV_HAVE_SSSE3 */
196 
197 
198 #ifdef LV_HAVE_SSE2
199 #include <emmintrin.h>
200 
201 static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
202  const lv_16sc_t* complexVector,
203  unsigned int num_points)
204 {
205  unsigned int number = 0;
206  const int16_t* complexVectorPtr = (int16_t*)complexVector;
207  int16_t* iBufferPtr = iBuffer;
208  __m128i complexVal1, complexVal2, iOutputVal;
209  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
210  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
211 
212  unsigned int eighthPoints = num_points / 8;
213 
214  for (number = 0; number < eighthPoints; number++) {
215  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
216  complexVectorPtr += 8;
217  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
218  complexVectorPtr += 8;
219 
220  complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
221 
222  complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
223 
224  complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
225 
226  complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
227 
228  complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
229 
230  complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
231 
232  iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
233  _mm_and_si128(complexVal2, highMask));
234 
235  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
236 
237  iBufferPtr += 8;
238  }
239 
240  number = eighthPoints * 8;
241  for (; number < num_points; number++) {
242  *iBufferPtr++ = *complexVectorPtr++;
243  complexVectorPtr++;
244  }
245 }
246 #endif /* LV_HAVE_SSE2 */
247 
248 #ifdef LV_HAVE_GENERIC
249 
250 static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
251  const lv_16sc_t* complexVector,
252  unsigned int num_points)
253 {
254  unsigned int number = 0;
255  const int16_t* complexVectorPtr = (int16_t*)complexVector;
256  int16_t* iBufferPtr = iBuffer;
257  for (number = 0; number < num_points; number++) {
258  *iBufferPtr++ = *complexVectorPtr++;
259  complexVectorPtr++;
260  }
261 }
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
266 
267 
268 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
269 #define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
270 
271 #include <inttypes.h>
272 #include <stdio.h>
273 
274 
275 #ifdef LV_HAVE_AVX2
276 #include <immintrin.h>
277 
278 static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
279  const lv_16sc_t* complexVector,
280  unsigned int num_points)
281 {
282  unsigned int number = 0;
283  const int16_t* complexVectorPtr = (int16_t*)complexVector;
284  int16_t* iBufferPtr = iBuffer;
285 
286  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
287  0x80,
288  0x80,
289  0x80,
290  0x80,
291  0x80,
292  0x80,
293  0x80,
294  13,
295  12,
296  9,
297  8,
298  5,
299  4,
300  1,
301  0,
302  0x80,
303  0x80,
304  0x80,
305  0x80,
306  0x80,
307  0x80,
308  0x80,
309  0x80,
310  13,
311  12,
312  9,
313  8,
314  5,
315  4,
316  1,
317  0);
318  __m256i iMoveMask2 = _mm256_set_epi8(13,
319  12,
320  9,
321  8,
322  5,
323  4,
324  1,
325  0,
326  0x80,
327  0x80,
328  0x80,
329  0x80,
330  0x80,
331  0x80,
332  0x80,
333  0x80,
334  13,
335  12,
336  9,
337  8,
338  5,
339  4,
340  1,
341  0,
342  0x80,
343  0x80,
344  0x80,
345  0x80,
346  0x80,
347  0x80,
348  0x80,
349  0x80);
350 
351  __m256i complexVal1, complexVal2, iOutputVal;
352 
353  unsigned int sixteenthPoints = num_points / 16;
354 
355  for (number = 0; number < sixteenthPoints; number++) {
356  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
357  complexVectorPtr += 16;
358  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
359  complexVectorPtr += 16;
360 
361  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
362  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
363 
364  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
365  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
366 
367  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
368 
369  iBufferPtr += 16;
370  }
371 
372  number = sixteenthPoints * 16;
373  for (; number < num_points; number++) {
374  *iBufferPtr++ = *complexVectorPtr++;
375  complexVectorPtr++;
376  }
377 }
378 #endif /* LV_HAVE_AVX2 */
379 
380 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int)
Definition: sse2neon.h:5115
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
#define _mm_shufflelo_epi16(a, imm)
Definition: sse2neon.h:5459
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
#define _mm_shufflehi_epi16(a, imm)
Definition: sse2neon.h:5444
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
static void volk_16ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:250
static void volk_16ic_deinterleave_real_16i_a_sse2(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:201
static void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:156
short complex lv_16sc_t
Definition: volk_complex.h:71