Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H
41 #define INCLUDED_volk_8ic_deinterleave_real_16i_a_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
50 static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
51  const lv_8sc_t* complexVector,
52  unsigned int num_points)
53 {
54  unsigned int number = 0;
55  const int8_t* complexVectorPtr = (int8_t*)complexVector;
56  int16_t* iBufferPtr = iBuffer;
57  __m256i moveMask = _mm256_set_epi8(0x80,
58  0x80,
59  0x80,
60  0x80,
61  0x80,
62  0x80,
63  0x80,
64  0x80,
65  14,
66  12,
67  10,
68  8,
69  6,
70  4,
71  2,
72  0,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  0x80,
81  14,
82  12,
83  10,
84  8,
85  6,
86  4,
87  2,
88  0);
89  __m256i complexVal, outputVal;
90  __m128i outputVal0;
91 
92  unsigned int sixteenthPoints = num_points / 16;
93 
94  for (number = 0; number < sixteenthPoints; number++) {
95  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
96  complexVectorPtr += 32;
97 
98  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
99  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
100 
101  outputVal0 = _mm256_extractf128_si256(complexVal, 0);
102 
103  outputVal = _mm256_cvtepi8_epi16(outputVal0);
104  outputVal = _mm256_slli_epi16(outputVal, 7);
105 
106  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
107 
108  iBufferPtr += 16;
109  }
110 
111  number = sixteenthPoints * 16;
112  for (; number < num_points; number++) {
113  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
114  complexVectorPtr++;
115  }
116 }
117 #endif /* LV_HAVE_AVX2 */
118 
119 #ifdef LV_HAVE_SSE4_1
120 #include <smmintrin.h>
121 
122 static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer,
123  const lv_8sc_t* complexVector,
124  unsigned int num_points)
125 {
126  unsigned int number = 0;
127  const int8_t* complexVectorPtr = (int8_t*)complexVector;
128  int16_t* iBufferPtr = iBuffer;
129  __m128i moveMask = _mm_set_epi8(
130  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
131  __m128i complexVal, outputVal;
132 
133  unsigned int eighthPoints = num_points / 8;
134 
135  for (number = 0; number < eighthPoints; number++) {
136  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
137  complexVectorPtr += 16;
138 
139  complexVal = _mm_shuffle_epi8(complexVal, moveMask);
140 
141  outputVal = _mm_cvtepi8_epi16(complexVal);
142  outputVal = _mm_slli_epi16(outputVal, 7);
143 
144  _mm_store_si128((__m128i*)iBufferPtr, outputVal);
145  iBufferPtr += 8;
146  }
147 
148  number = eighthPoints * 8;
149  for (; number < num_points; number++) {
150  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
151  complexVectorPtr++;
152  }
153 }
154 #endif /* LV_HAVE_SSE4_1 */
155 
156 
157 #ifdef LV_HAVE_AVX
158 #include <immintrin.h>
159 
160 static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
161  const lv_8sc_t* complexVector,
162  unsigned int num_points)
163 {
164  unsigned int number = 0;
165  const int8_t* complexVectorPtr = (int8_t*)complexVector;
166  int16_t* iBufferPtr = iBuffer;
167  __m128i moveMask = _mm_set_epi8(
168  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
169  __m256i complexVal, outputVal;
170  __m128i complexVal1, complexVal0, outputVal1, outputVal0;
171 
172  unsigned int sixteenthPoints = num_points / 16;
173 
174  for (number = 0; number < sixteenthPoints; number++) {
175  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
176  complexVectorPtr += 32;
177 
178  complexVal1 = _mm256_extractf128_si256(complexVal, 1);
179  complexVal0 = _mm256_extractf128_si256(complexVal, 0);
180 
181  outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
182  outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
183 
184  outputVal1 = _mm_cvtepi8_epi16(outputVal1);
185  outputVal1 = _mm_slli_epi16(outputVal1, 7);
186  outputVal0 = _mm_cvtepi8_epi16(outputVal0);
187  outputVal0 = _mm_slli_epi16(outputVal0, 7);
188 
189  __m256i dummy = _mm256_setzero_si256();
190  outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
191  outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
192  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
193 
194  iBufferPtr += 16;
195  }
196 
197  number = sixteenthPoints * 16;
198  for (; number < num_points; number++) {
199  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
200  complexVectorPtr++;
201  }
202 }
203 #endif /* LV_HAVE_AVX */
204 
205 
206 #ifdef LV_HAVE_GENERIC
207 
208 static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer,
209  const lv_8sc_t* complexVector,
210  unsigned int num_points)
211 {
212  unsigned int number = 0;
213  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
214  int16_t* iBufferPtr = iBuffer;
215  for (number = 0; number < num_points; number++) {
216  *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
217  complexVectorPtr++;
218  }
219 }
220 #endif /* LV_HAVE_GENERIC */
221 
222 
223 #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */
224 
225 #ifndef INCLUDED_volk_8ic_deinterleave_real_16i_u_H
226 #define INCLUDED_volk_8ic_deinterleave_real_16i_u_H
227 
228 #include <inttypes.h>
229 #include <stdio.h>
230 
231 
232 #ifdef LV_HAVE_AVX2
233 #include <immintrin.h>
234 
235 static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
236  const lv_8sc_t* complexVector,
237  unsigned int num_points)
238 {
239  unsigned int number = 0;
240  const int8_t* complexVectorPtr = (int8_t*)complexVector;
241  int16_t* iBufferPtr = iBuffer;
242  __m256i moveMask = _mm256_set_epi8(0x80,
243  0x80,
244  0x80,
245  0x80,
246  0x80,
247  0x80,
248  0x80,
249  0x80,
250  14,
251  12,
252  10,
253  8,
254  6,
255  4,
256  2,
257  0,
258  0x80,
259  0x80,
260  0x80,
261  0x80,
262  0x80,
263  0x80,
264  0x80,
265  0x80,
266  14,
267  12,
268  10,
269  8,
270  6,
271  4,
272  2,
273  0);
274  __m256i complexVal, outputVal;
275  __m128i outputVal0;
276 
277  unsigned int sixteenthPoints = num_points / 16;
278 
279  for (number = 0; number < sixteenthPoints; number++) {
280  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
281  complexVectorPtr += 32;
282 
283  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
284  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
285 
286  outputVal0 = _mm256_extractf128_si256(complexVal, 0);
287 
288  outputVal = _mm256_cvtepi8_epi16(outputVal0);
289  outputVal = _mm256_slli_epi16(outputVal, 7);
290 
291  _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
292 
293  iBufferPtr += 16;
294  }
295 
296  number = sixteenthPoints * 16;
297  for (; number < num_points; number++) {
298  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
299  complexVectorPtr++;
300  }
301 }
302 #endif /* LV_HAVE_AVX2 */
303 #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5544
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8ic_deinterleave_real_16i_a_avx(int16_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_16i.h:160
static void volk_8ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_16i.h:208
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70