41 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
49 #include <immintrin.h>
/*
 * AVX2 kernel using the aligned load/store intrinsics. For every pair of
 * int16 values in the input it writes one int8 to iBuffer: the high byte
 * (>> 8) of the first int16 of the pair (see the scalar tail at the end).
 *
 * NOTE(review): the embedded line numbers jump (51 -> 53, 58 -> 90, ...), so
 * this listing omits original lines: the complexVector parameter, the braces,
 * most of the shuffle-mask arguments and anything between the vector store and
 * the tail. Comments below describe only what is visible.
 */
51 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
53 unsigned int num_points)
55 unsigned int number = 0;
/* Byte-granular cursor over the input; advanced 32 bytes per vector load. */
56 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 int8_t* iBufferPtr = iBuffer;
/* Byte-shuffle control vectors; only the first argument of each is visible here. */
58 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
90 __m256i iMoveMask2 = _mm256_set_epi8(13,
122 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full vector iterations; each one consumes 128 input bytes. */
124 unsigned int thirtysecondPoints = num_points / 32;
126 for (number = 0; number < thirtysecondPoints; number++) {
/* Four consecutive 32-byte aligned loads. */
127 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128 complexVectorPtr += 32;
129 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 32;
132 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133 complexVectorPtr += 32;
134 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135 complexVectorPtr += 32;
/* Shuffle vectors 1 and 2 with the two masks and merge them with OR;
 * presumably this gathers the real int16 parts - the masks are not fully
 * visible, TODO confirm. 0xd8 reorders the 64-bit lanes as 0,2,1,3. */
137 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
140 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
/* Same shuffle/merge/reorder sequence for vectors 3 and 4. */
143 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
146 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 leaves the sign-extended high byte of each 16-bit element. */
149 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Narrow both vectors to 8-bit elements, then reorder the 64-bit lanes again. */
152 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* One 32-byte aligned store of the packed result.
 * NOTE(review): no advance of iBufferPtr is visible after this store - confirm
 * against the full file whether that line was lost from the listing. */
155 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail: handles the num_points % 32 samples left after the vector loop. */
160 number = thirtysecondPoints * 32;
161 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162 for (; number < num_points; number++) {
/* Store the high byte of one int16, then skip the int16 that follows it. */
163 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164 int16ComplexVectorPtr++;
171 #include <tmmintrin.h>
/*
 * Fragment of an SSSE3 kernel; presumably volk_16ic_deinterleave_real_8i_a_ssse3,
 * which the cross-reference at the end of this listing places at line 173.
 *
 * NOTE(review): the start of the signature, the braces, the mask declarations'
 * left-hand sides and the whole load/shuffle/store sequence of the vector loop
 * are missing from this listing. Comments describe only what is visible.
 */
175 unsigned int num_points)
177 unsigned int number = 0;
/* Byte-granular cursor over the input; advanced 16 bytes at a time below. */
178 const int8_t* complexVectorPtr = (int8_t*)complexVector;
179 int8_t* iBufferPtr = iBuffer;
/* Presumably the argument lists of two byte-shuffle masks (declarations not
 * visible): indices 13,12,9,8,5,4,1,0 pick the first two bytes of every
 * 4-byte group, and 0x80 entries fill the other half of each mask. */
181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
183 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full vector iterations; each one advances the input by 64 bytes. */
186 unsigned int sixteenthPoints = num_points / 16;
188 for (number = 0; number < sixteenthPoints; number++) {
/* Four 16-byte steps through the input; the loads between them are not shown. */
190 complexVectorPtr += 16;
192 complexVectorPtr += 16;
195 complexVectorPtr += 16;
197 complexVectorPtr += 16;
/* Scalar tail: handles the num_points % 16 samples left after the vector loop. */
220 number = sixteenthPoints * 16;
221 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222 for (; number < num_points; number++) {
/* Store the high byte of one int16, then skip the int16 that follows it. */
223 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224 int16ComplexVectorPtr++;
229 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable scalar kernel; presumably
 * volk_16ic_deinterleave_real_8i_generic, which the cross-reference at the end
 * of this listing places at line 231.
 *
 * NOTE(review): the start of the signature, the braces and whatever follows
 * the store inside the loop are missing from this listing.
 */
233 unsigned int num_points)
235 unsigned int number = 0;
/* View the input as a flat sequence of int16 values. */
236 int16_t* complexVectorPtr = (int16_t*)complexVector;
237 int8_t* iBufferPtr = iBuffer;
238 for (number = 0; number < num_points; number++) {
/* Write the high byte of the current int16 and advance both pointers by one.
 * NOTE(review): the sibling tail loops skip a second int16 per sample; the
 * matching line is not visible here - confirm against the full file. */
239 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
246 #include <arm_neon.h>
/*
 * Fragment of a NEON kernel; presumably volk_16ic_deinterleave_real_8i_neon,
 * which the cross-reference at the end of this listing places at line 248.
 *
 * NOTE(review): the start of the signature, the braces and the declarations of
 * `number` and `realOutput` are missing from this listing, and the first
 * assignment below is split across two lines.
 */
250 unsigned int num_points)
252 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
253 int8_t* iBufferPtr = iBuffer;
/* Number of full vector iterations; each one consumes 16 int16 values. */
254 unsigned int eighth_points = num_points / 8;
257 int16x8x2_t complexInput;
259 for (number = 0; number < eighth_points; number++) {
/* De-interleaving load: val[0] receives every other int16, starting with the first. */
260 complexInput = vld2q_s16(complexVectorPtr);
/* Shift right by 8 and narrow to 8 bits, keeping the high byte of each element. */
261 realOutput = vshrn_n_s16(complexInput.val[0], 8);
262 vst1_s8(iBufferPtr, realOutput);
263 complexVectorPtr += 16;
/* NOTE(review): no advance of iBufferPtr is visible in the loop above -
 * confirm against the full file whether that line was lost from the listing. */
/* Scalar tail: handles the num_points % 8 samples left after the vector loop. */
267 for (number = eighth_points * 8; number < num_points; number++) {
268 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/*
 * Declaration of the externally defined ORC implementation; its definition is
 * not part of this listing.
 * NOTE(review): the middle parameter line (original line 277) is missing here.
 */
276 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
278 unsigned int num_points);
/*
 * Thin wrapper: forwards iBuffer, complexVector and num_points unchanged to
 * volk_16ic_deinterleave_real_8i_a_orc_impl.
 * NOTE(review): the middle parameter line and the braces are missing from this
 * listing.
 */
280 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
282 unsigned int num_points)
284 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
291 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
292 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
294 #include <inttypes.h>
299 #include <immintrin.h>
/*
 * AVX2 kernel using the unaligned load/store intrinsics. For every pair of
 * int16 values in the input it writes one int8 to iBuffer: the high byte
 * (>> 8) of the first int16 of the pair (see the scalar tail at the end).
 *
 * NOTE(review): the embedded line numbers jump (301 -> 303, 308 -> 340, ...),
 * so this listing omits original lines: the complexVector parameter, the
 * braces, most of the shuffle-mask arguments and anything between the vector
 * store and the tail. Comments below describe only what is visible.
 */
301 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
303 unsigned int num_points)
305 unsigned int number = 0;
/* Byte-granular cursor over the input; advanced 32 bytes per vector load. */
306 const int8_t* complexVectorPtr = (int8_t*)complexVector;
307 int8_t* iBufferPtr = iBuffer;
/* Byte-shuffle control vectors; only the first argument of each is visible here. */
308 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
340 __m256i iMoveMask2 = _mm256_set_epi8(13,
372 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full vector iterations; each one consumes 128 input bytes. */
374 unsigned int thirtysecondPoints = num_points / 32;
376 for (number = 0; number < thirtysecondPoints; number++) {
/* Four consecutive 32-byte unaligned loads. */
377 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378 complexVectorPtr += 32;
379 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
380 complexVectorPtr += 32;
382 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
383 complexVectorPtr += 32;
384 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
385 complexVectorPtr += 32;
/* Shuffle vectors 1 and 2 with the two masks and merge them with OR;
 * presumably this gathers the real int16 parts - the masks are not fully
 * visible, TODO confirm. 0xd8 reorders the 64-bit lanes as 0,2,1,3. */
387 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
388 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
390 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
391 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
/* Same shuffle/merge/reorder sequence for vectors 3 and 4. */
393 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
394 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
396 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
397 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 leaves the sign-extended high byte of each 16-bit element. */
399 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
400 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Narrow both vectors to 8-bit elements, then reorder the 64-bit lanes again. */
402 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
403 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* One 32-byte unaligned store of the packed result.
 * NOTE(review): no advance of iBufferPtr is visible after this store - confirm
 * against the full file whether that line was lost from the listing. */
405 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail: handles the num_points % 32 samples left after the vector loop. */
410 number = thirtysecondPoints * 32;
411 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
412 for (; number < num_points; number++) {
/* Store the high byte of one int16, then skip the int16 that follows it. */
413 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
414 int16ComplexVectorPtr++;
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition: sse2neon.h:5695
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:231
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:248
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:173
short complex lv_16sc_t
Definition: volk_complex.h:71