Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
41 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
49 static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
50  const lv_8sc_t* complexVector,
51  unsigned int num_points)
52 {
53  unsigned int number = 0;
54  const int8_t* complexVectorPtr = (int8_t*)complexVector;
55  int8_t* iBufferPtr = iBuffer;
56  __m256i moveMask1 = _mm256_set_epi8(0x80,
57  0x80,
58  0x80,
59  0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  14,
65  12,
66  10,
67  8,
68  6,
69  4,
70  2,
71  0,
72  0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  14,
81  12,
82  10,
83  8,
84  6,
85  4,
86  2,
87  0);
88  __m256i moveMask2 = _mm256_set_epi8(14,
89  12,
90  10,
91  8,
92  6,
93  4,
94  2,
95  0,
96  0x80,
97  0x80,
98  0x80,
99  0x80,
100  0x80,
101  0x80,
102  0x80,
103  0x80,
104  14,
105  12,
106  10,
107  8,
108  6,
109  4,
110  2,
111  0,
112  0x80,
113  0x80,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80);
120  __m256i complexVal1, complexVal2, outputVal;
121 
122  unsigned int thirtysecondPoints = num_points / 32;
123 
124  for (number = 0; number < thirtysecondPoints; number++) {
125 
126  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
127  complexVectorPtr += 32;
128  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
129  complexVectorPtr += 32;
130 
131  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
132  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
133  outputVal = _mm256_or_si256(complexVal1, complexVal2);
134  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
135 
136  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
137  iBufferPtr += 32;
138  }
139 
140  number = thirtysecondPoints * 32;
141  for (; number < num_points; number++) {
142  *iBufferPtr++ = *complexVectorPtr++;
143  complexVectorPtr++;
144  }
145 }
146 #endif /* LV_HAVE_AVX2 */
147 
148 
149 #ifdef LV_HAVE_SSSE3
150 #include <tmmintrin.h>
151 
152 static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
153  const lv_8sc_t* complexVector,
154  unsigned int num_points)
155 {
156  unsigned int number = 0;
157  const int8_t* complexVectorPtr = (int8_t*)complexVector;
158  int8_t* iBufferPtr = iBuffer;
159  __m128i moveMask1 = _mm_set_epi8(
160  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
161  __m128i moveMask2 = _mm_set_epi8(
162  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
163  __m128i complexVal1, complexVal2, outputVal;
164 
165  unsigned int sixteenthPoints = num_points / 16;
166 
167  for (number = 0; number < sixteenthPoints; number++) {
168  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
169  complexVectorPtr += 16;
170  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
171  complexVectorPtr += 16;
172 
173  complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
174  complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
175 
176  outputVal = _mm_or_si128(complexVal1, complexVal2);
177 
178  _mm_store_si128((__m128i*)iBufferPtr, outputVal);
179  iBufferPtr += 16;
180  }
181 
182  number = sixteenthPoints * 16;
183  for (; number < num_points; number++) {
184  *iBufferPtr++ = *complexVectorPtr++;
185  complexVectorPtr++;
186  }
187 }
188 #endif /* LV_HAVE_SSSE3 */
189 
190 
191 #ifdef LV_HAVE_AVX
192 #include <immintrin.h>
193 
194 static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
195  const lv_8sc_t* complexVector,
196  unsigned int num_points)
197 {
198  unsigned int number = 0;
199  const int8_t* complexVectorPtr = (int8_t*)complexVector;
200  int8_t* iBufferPtr = iBuffer;
201  __m128i moveMaskL = _mm_set_epi8(
202  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
203  __m128i moveMaskH = _mm_set_epi8(
204  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
205  __m256i complexVal1, complexVal2, outputVal;
206  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
207  outputVal2;
208 
209  unsigned int thirtysecondPoints = num_points / 32;
210 
211  for (number = 0; number < thirtysecondPoints; number++) {
212 
213  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
214  complexVectorPtr += 32;
215  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
216  complexVectorPtr += 32;
217 
218  complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
219  complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
220  complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
221  complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
222 
223  complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
224  complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
225  outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
226 
227 
228  complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
229  complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
230  outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
231 
232  __m256i dummy = _mm256_setzero_si256();
233  outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
234  outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
235 
236 
237  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
238  iBufferPtr += 32;
239  }
240 
241  number = thirtysecondPoints * 32;
242  for (; number < num_points; number++) {
243  *iBufferPtr++ = *complexVectorPtr++;
244  complexVectorPtr++;
245  }
246 }
247 #endif /* LV_HAVE_AVX */
248 
249 
250 #ifdef LV_HAVE_GENERIC
251 
252 static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
253  const lv_8sc_t* complexVector,
254  unsigned int num_points)
255 {
256  unsigned int number = 0;
257  const int8_t* complexVectorPtr = (int8_t*)complexVector;
258  int8_t* iBufferPtr = iBuffer;
259  for (number = 0; number < num_points; number++) {
260  *iBufferPtr++ = *complexVectorPtr++;
261  complexVectorPtr++;
262  }
263 }
264 #endif /* LV_HAVE_GENERIC */
265 
266 
267 #ifdef LV_HAVE_NEON
268 #include <arm_neon.h>
269 
270 static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
271  const lv_8sc_t* complexVector,
272  unsigned int num_points)
273 {
274  unsigned int number;
275  unsigned int sixteenth_points = num_points / 16;
276 
277  int8x16x2_t input_vector;
278  for (number = 0; number < sixteenth_points; ++number) {
279  input_vector = vld2q_s8((int8_t*)complexVector);
280  vst1q_s8(iBuffer, input_vector.val[0]);
281  iBuffer += 16;
282  complexVector += 16;
283  }
284 
285  const int8_t* complexVectorPtr = (int8_t*)complexVector;
286  int8_t* iBufferPtr = iBuffer;
287  for (number = sixteenth_points * 16; number < num_points; number++) {
288  *iBufferPtr++ = *complexVectorPtr++;
289  complexVectorPtr++;
290  }
291 }
292 #endif /* LV_HAVE_NEON */
293 
294 
295 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
296 
297 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
298 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
299 
300 #include <inttypes.h>
301 #include <stdio.h>
302 
303 #ifdef LV_HAVE_AVX2
304 #include <immintrin.h>
305 
306 static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
307  const lv_8sc_t* complexVector,
308  unsigned int num_points)
309 {
310  unsigned int number = 0;
311  const int8_t* complexVectorPtr = (int8_t*)complexVector;
312  int8_t* iBufferPtr = iBuffer;
313  __m256i moveMask1 = _mm256_set_epi8(0x80,
314  0x80,
315  0x80,
316  0x80,
317  0x80,
318  0x80,
319  0x80,
320  0x80,
321  14,
322  12,
323  10,
324  8,
325  6,
326  4,
327  2,
328  0,
329  0x80,
330  0x80,
331  0x80,
332  0x80,
333  0x80,
334  0x80,
335  0x80,
336  0x80,
337  14,
338  12,
339  10,
340  8,
341  6,
342  4,
343  2,
344  0);
345  __m256i moveMask2 = _mm256_set_epi8(14,
346  12,
347  10,
348  8,
349  6,
350  4,
351  2,
352  0,
353  0x80,
354  0x80,
355  0x80,
356  0x80,
357  0x80,
358  0x80,
359  0x80,
360  0x80,
361  14,
362  12,
363  10,
364  8,
365  6,
366  4,
367  2,
368  0,
369  0x80,
370  0x80,
371  0x80,
372  0x80,
373  0x80,
374  0x80,
375  0x80,
376  0x80);
377  __m256i complexVal1, complexVal2, outputVal;
378 
379  unsigned int thirtysecondPoints = num_points / 32;
380 
381  for (number = 0; number < thirtysecondPoints; number++) {
382 
383  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
384  complexVectorPtr += 32;
385  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
386  complexVectorPtr += 32;
387 
388  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
389  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
390  outputVal = _mm256_or_si256(complexVal1, complexVal2);
391  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
392 
393  _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
394  iBufferPtr += 32;
395  }
396 
397  number = thirtysecondPoints * 32;
398  for (; number < num_points; number++) {
399  *iBufferPtr++ = *complexVectorPtr++;
400  complexVectorPtr++;
401  }
402 }
403 #endif /* LV_HAVE_AVX2 */
404 
405 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:152
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:194
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:270
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:252
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70