Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 
48 #ifdef LV_HAVE_AVX2
49 #include <immintrin.h>
50 
51 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
52  const lv_16sc_t* complexVector,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const int8_t* complexVectorPtr = (int8_t*)complexVector;
57  int8_t* iBufferPtr = iBuffer;
58  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
59  0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  0x80,
65  0x80,
66  13,
67  12,
68  9,
69  8,
70  5,
71  4,
72  1,
73  0,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  0x80,
81  0x80,
82  13,
83  12,
84  9,
85  8,
86  5,
87  4,
88  1,
89  0);
90  __m256i iMoveMask2 = _mm256_set_epi8(13,
91  12,
92  9,
93  8,
94  5,
95  4,
96  1,
97  0,
98  0x80,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  0x80,
105  0x80,
106  13,
107  12,
108  9,
109  8,
110  5,
111  4,
112  1,
113  0,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80,
120  0x80,
121  0x80);
122  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
123 
124  unsigned int thirtysecondPoints = num_points / 32;
125 
126  for (number = 0; number < thirtysecondPoints; number++) {
127  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128  complexVectorPtr += 32;
129  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130  complexVectorPtr += 32;
131 
132  complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133  complexVectorPtr += 32;
134  complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135  complexVectorPtr += 32;
136 
137  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
139 
140  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
142 
143  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
145 
146  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
148 
149  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
151 
152  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
154 
155  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
156 
157  iBufferPtr += 32;
158  }
159 
160  number = thirtysecondPoints * 32;
161  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162  for (; number < num_points; number++) {
163  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164  int16ComplexVectorPtr++;
165  }
166 }
167 #endif /* LV_HAVE_AVX2 */
168 
169 
170 #ifdef LV_HAVE_SSSE3
171 #include <tmmintrin.h>
172 
173 static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
174  const lv_16sc_t* complexVector,
175  unsigned int num_points)
176 {
177  unsigned int number = 0;
178  const int8_t* complexVectorPtr = (int8_t*)complexVector;
179  int8_t* iBufferPtr = iBuffer;
180  __m128i iMoveMask1 = _mm_set_epi8(
181  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
182  __m128i iMoveMask2 = _mm_set_epi8(
183  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
185 
186  unsigned int sixteenthPoints = num_points / 16;
187 
188  for (number = 0; number < sixteenthPoints; number++) {
189  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190  complexVectorPtr += 16;
191  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192  complexVectorPtr += 16;
193 
194  complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195  complexVectorPtr += 16;
196  complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197  complexVectorPtr += 16;
198 
199  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
201 
202  complexVal1 = _mm_or_si128(complexVal1, complexVal2);
203 
204  complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205  complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
206 
207  complexVal3 = _mm_or_si128(complexVal3, complexVal4);
208 
209 
210  complexVal1 = _mm_srai_epi16(complexVal1, 8);
211  complexVal3 = _mm_srai_epi16(complexVal3, 8);
212 
213  iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
214 
215  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
216 
217  iBufferPtr += 16;
218  }
219 
220  number = sixteenthPoints * 16;
221  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222  for (; number < num_points; number++) {
223  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224  int16ComplexVectorPtr++;
225  }
226 }
227 #endif /* LV_HAVE_SSSE3 */
228 
229 #ifdef LV_HAVE_GENERIC
230 
231 static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
232  const lv_16sc_t* complexVector,
233  unsigned int num_points)
234 {
235  unsigned int number = 0;
236  int16_t* complexVectorPtr = (int16_t*)complexVector;
237  int8_t* iBufferPtr = iBuffer;
238  for (number = 0; number < num_points; number++) {
239  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
240  complexVectorPtr++;
241  }
242 }
243 #endif /* LV_HAVE_GENERIC */
244 
245 #ifdef LV_HAVE_NEON
246 #include <arm_neon.h>
247 
248 static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
249  const lv_16sc_t* complexVector,
250  unsigned int num_points)
251 {
252  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
253  int8_t* iBufferPtr = iBuffer;
254  unsigned int eighth_points = num_points / 8;
255  unsigned int number;
256 
257  int16x8x2_t complexInput;
258  int8x8_t realOutput;
259  for (number = 0; number < eighth_points; number++) {
260  complexInput = vld2q_s16(complexVectorPtr);
261  realOutput = vshrn_n_s16(complexInput.val[0], 8);
262  vst1_s8(iBufferPtr, realOutput);
263  complexVectorPtr += 16;
264  iBufferPtr += 8;
265  }
266 
267  for (number = eighth_points * 8; number < num_points; number++) {
268  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
269  complexVectorPtr++;
270  }
271 }
272 #endif
273 
274 #ifdef LV_HAVE_ORC
275 
276 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
277  const lv_16sc_t* complexVector,
278  unsigned int num_points);
279 
280 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
281  const lv_16sc_t* complexVector,
282  unsigned int num_points)
283 {
284  volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
285 }
286 #endif /* LV_HAVE_ORC */
287 
288 
289 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
290 
291 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
292 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
293 
294 #include <inttypes.h>
295 #include <stdio.h>
296 
297 
298 #ifdef LV_HAVE_AVX2
299 #include <immintrin.h>
300 
301 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
302  const lv_16sc_t* complexVector,
303  unsigned int num_points)
304 {
305  unsigned int number = 0;
306  const int8_t* complexVectorPtr = (int8_t*)complexVector;
307  int8_t* iBufferPtr = iBuffer;
308  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
309  0x80,
310  0x80,
311  0x80,
312  0x80,
313  0x80,
314  0x80,
315  0x80,
316  13,
317  12,
318  9,
319  8,
320  5,
321  4,
322  1,
323  0,
324  0x80,
325  0x80,
326  0x80,
327  0x80,
328  0x80,
329  0x80,
330  0x80,
331  0x80,
332  13,
333  12,
334  9,
335  8,
336  5,
337  4,
338  1,
339  0);
340  __m256i iMoveMask2 = _mm256_set_epi8(13,
341  12,
342  9,
343  8,
344  5,
345  4,
346  1,
347  0,
348  0x80,
349  0x80,
350  0x80,
351  0x80,
352  0x80,
353  0x80,
354  0x80,
355  0x80,
356  13,
357  12,
358  9,
359  8,
360  5,
361  4,
362  1,
363  0,
364  0x80,
365  0x80,
366  0x80,
367  0x80,
368  0x80,
369  0x80,
370  0x80,
371  0x80);
372  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
373 
374  unsigned int thirtysecondPoints = num_points / 32;
375 
376  for (number = 0; number < thirtysecondPoints; number++) {
377  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378  complexVectorPtr += 32;
379  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
380  complexVectorPtr += 32;
381 
382  complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
383  complexVectorPtr += 32;
384  complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
385  complexVectorPtr += 32;
386 
387  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
388  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
389 
390  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
391  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
392 
393  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
394  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
395 
396  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
397  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
398 
399  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
400  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
401 
402  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
403  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
404 
405  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
406 
407  iBufferPtr += 32;
408  }
409 
410  number = thirtysecondPoints * 32;
411  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
412  for (; number < num_points; number++) {
413  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
414  int16ComplexVectorPtr++;
415  }
416 }
417 #endif /* LV_HAVE_AVX2 */
418 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition: sse2neon.h:5695
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:231
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:248
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:173
short complex lv_16sc_t
Definition: volk_complex.h:71