doxygen/volk__8ic__s32f__deinterleave__32f__x2_8h_source.html

 /* -*- c++ -*- */

 /*

  * Copyright 2012, 2014 Free Software Foundation, Inc.

  *

  * This file is part of VOLK

  *

  * SPDX-License-Identifier: LGPL-3.0-or-later

  */


 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H

 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H


 #include <inttypes.h>

 #include <stdio.h>

 #include <volk/volk_common.h>


 #ifdef LV_HAVE_SSE4_1

 #include <smmintrin.h>


 static inline void

 volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,

                                            float* qBuffer,

                                            const lv_8sc_t* complexVector,

                                            const float scalar,

                                            unsigned int num_points)

 {

     float* iBufferPtr = iBuffer;

     float* qBufferPtr = qBuffer;


     unsigned int number = 0;

     const unsigned int eighthPoints = num_points / 8;

     __m128 iFloatValue, qFloatValue;


     const float iScalar = 1.0 / scalar;

     __m128 invScalar = _mm_set_ps1(iScalar);

     __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;

     int8_t* complexVectorPtr = (int8_t*)complexVector;


     __m128i iMoveMask = _mm_set_epi8(

         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);

     __m128i qMoveMask = _mm_set_epi8(

         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);


     for (; number < eighthPoints; number++) {

         complexVal = _mm_load_si128((__m128i*)complexVectorPtr);

         complexVectorPtr += 16;

         iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);

         qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);


         iIntVal = _mm_cvtepi8_epi32(iComplexVal);

         iFloatValue = _mm_cvtepi32_ps(iIntVal);

         iFloatValue = _mm_mul_ps(iFloatValue, invScalar);

         _mm_store_ps(iBufferPtr, iFloatValue);

         iBufferPtr += 4;


         iComplexVal = _mm_srli_si128(iComplexVal, 4);


         iIntVal = _mm_cvtepi8_epi32(iComplexVal);

         iFloatValue = _mm_cvtepi32_ps(iIntVal);

         iFloatValue = _mm_mul_ps(iFloatValue, invScalar);

         _mm_store_ps(iBufferPtr, iFloatValue);

         iBufferPtr += 4;


         qIntVal = _mm_cvtepi8_epi32(qComplexVal);

         qFloatValue = _mm_cvtepi32_ps(qIntVal);

         qFloatValue = _mm_mul_ps(qFloatValue, invScalar);

         _mm_store_ps(qBufferPtr, qFloatValue);

         qBufferPtr += 4;


         qComplexVal = _mm_srli_si128(qComplexVal, 4);


         qIntVal = _mm_cvtepi8_epi32(qComplexVal);

         qFloatValue = _mm_cvtepi32_ps(qIntVal);

         qFloatValue = _mm_mul_ps(qFloatValue, invScalar);

         _mm_store_ps(qBufferPtr, qFloatValue);


         qBufferPtr += 4;

     }


     number = eighthPoints * 8;

     for (; number < num_points; number++) {

         *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;

         *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;

     }

 }

 #endif /* LV_HAVE_SSE4_1 */


 #ifdef LV_HAVE_SSE

 #include <xmmintrin.h>


 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,

                                                            float* qBuffer,

                                                            const lv_8sc_t* complexVector,

                                                            const float scalar,

                                                            unsigned int num_points)

 {

     float* iBufferPtr = iBuffer;

     float* qBufferPtr = qBuffer;


     unsigned int number = 0;

     const unsigned int quarterPoints = num_points / 4;

     __m128 cplxValue1, cplxValue2, iValue, qValue;


     __m128 invScalar = _mm_set_ps1(1.0 / scalar);

     int8_t* complexVectorPtr = (int8_t*)complexVector;


     __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];


     for (; number < quarterPoints; number++) {

         floatBuffer[0] = (float)(complexVectorPtr[0]);

         floatBuffer[1] = (float)(complexVectorPtr[1]);

         floatBuffer[2] = (float)(complexVectorPtr[2]);

         floatBuffer[3] = (float)(complexVectorPtr[3]);


         floatBuffer[4] = (float)(complexVectorPtr[4]);

         floatBuffer[5] = (float)(complexVectorPtr[5]);

         floatBuffer[6] = (float)(complexVectorPtr[6]);

         floatBuffer[7] = (float)(complexVectorPtr[7]);


         cplxValue1 = _mm_load_ps(&floatBuffer[0]);

         cplxValue2 = _mm_load_ps(&floatBuffer[4]);


         complexVectorPtr += 8;


         cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);

         cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);


         // Arrange in i1i2i3i4 format

         iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));

         qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));


         _mm_store_ps(iBufferPtr, iValue);

         _mm_store_ps(qBufferPtr, qValue);


         iBufferPtr += 4;

         qBufferPtr += 4;

     }


     number = quarterPoints * 4;

     complexVectorPtr = (int8_t*)&complexVector[number];

     for (; number < num_points; number++) {

         *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;

         *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;

     }

 }

 #endif /* LV_HAVE_SSE */


 #ifdef LV_HAVE_AVX2

 #include <immintrin.h>


 /*
  * AVX2 kernel (aligned): splits interleaved 8-bit complex samples into
  * separate I and Q float buffers, multiplying every value by 1/scalar.
  *
  * iBuffer       - receives num_points scaled in-phase values
  * qBuffer       - receives num_points scaled quadrature values
  * complexVector - num_points interleaved (I, Q) signed 8-bit pairs
  * scalar        - divisor applied to every sample (via its reciprocal)
  * num_points    - number of complex samples to convert
  *
  * The vector loop uses aligned loads/stores (_mm256_load_si256,
  * _mm256_store_ps), so all three buffers must be 32-byte aligned.
  */
 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
                                                             float* qBuffer,
                                                             const lv_8sc_t* complexVector,
                                                             const float scalar,
                                                             unsigned int num_points)
 {
     float* iBufferPtr = iBuffer;
     float* qBufferPtr = qBuffer;

     unsigned int number = 0;
     const unsigned int sixteenthPoints = num_points / 16;
     __m256 iFloatValue, qFloatValue;

     /* One reciprocal shared by the vector loop and the scalar tail. */
     const float iScalar = 1.0 / scalar;
     const __m256 invScalar = _mm256_set1_ps(iScalar);
     __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
     const int8_t* complexVectorPtr = (const int8_t*)complexVector;

     /* Shuffle controls, applied independently to each 128-bit lane: gather
      * the even (I) or odd (Q) bytes of the lane into its low eight byte
      * positions; 0x80 entries zero the lane's upper eight bytes. */
     const __m256i iMoveMask = _mm256_set_epi8(
         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0,
         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
     const __m256i qMoveMask = _mm256_set_epi8(
         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1,
         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);

     /* Each pass consumes 32 bytes = 16 complex samples. */
     for (; number < sixteenthPoints; number++) {
         complexVal = _mm256_load_si256((const __m256i*)complexVectorPtr);
         complexVectorPtr += 32;
         iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
         qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);

         /* I samples 0..7 sit in the low 64 bits of the low lane. */
         iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
         iFloatValue = _mm256_cvtepi32_ps(iIntVal);
         iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
         _mm256_store_ps(iBufferPtr, iFloatValue);
         iBufferPtr += 8;

         /* 0xC6 picks source quadwords (2, 1, 0, 3): it swaps quadwords 0 and
          * 2 so that I samples 8..15 move into the low 64 bits. Written in hex
          * because binary literals are not part of C11. */
         iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0xC6);
         iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
         iFloatValue = _mm256_cvtepi32_ps(iIntVal);
         iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
         _mm256_store_ps(iBufferPtr, iFloatValue);
         iBufferPtr += 8;

         /* Q samples 0..7. */
         qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
         qFloatValue = _mm256_cvtepi32_ps(qIntVal);
         qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
         _mm256_store_ps(qBufferPtr, qFloatValue);
         qBufferPtr += 8;

         /* Q samples 8..15, brought down with the same quadword swap. */
         qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0xC6);
         qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
         qFloatValue = _mm256_cvtepi32_ps(qIntVal);
         qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
         _mm256_store_ps(qBufferPtr, qFloatValue);
         qBufferPtr += 8;
     }

     /* Scalar tail for the num_points % 16 samples left over. */
     for (number = sixteenthPoints * 16; number < num_points; number++) {
         *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
         *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
     }
 }

 #endif /* LV_HAVE_AVX2 */


 #ifdef LV_HAVE_GENERIC


 /*
  * Portable reference kernel: splits interleaved 8-bit complex samples into
  * separate I and Q float buffers, multiplying every value by 1/scalar.
  *
  * iBuffer       - receives num_points scaled in-phase values
  * qBuffer       - receives num_points scaled quadrature values
  * complexVector - num_points interleaved (I, Q) signed 8-bit pairs
  * scalar        - divisor applied to every sample (via its reciprocal)
  * num_points    - number of complex samples to convert
  */
 static inline void
 volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
                                           float* qBuffer,
                                           const lv_8sc_t* complexVector,
                                           const float scalar,
                                           unsigned int num_points)
 {
     /* View the complex input as a flat run of signed bytes: I, Q, I, Q, ... */
     const int8_t* pair = (const int8_t*)complexVector;
     const float invScalar = 1.0 / scalar;
     unsigned int idx;

     /* Outputs are indexed; the source pointer steps one (I, Q) pair per pass. */
     for (idx = 0; idx < num_points; idx++, pair += 2) {
         iBuffer[idx] = (float)(pair[0]) * invScalar;
         qBuffer[idx] = (float)(pair[1]) * invScalar;
     }
 }

 #endif /* LV_HAVE_GENERIC */


 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */


 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H

 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H


 #include <inttypes.h>

 #include <stdio.h>

 #include <volk/volk_common.h>


 #ifdef LV_HAVE_AVX2

 #include <immintrin.h>


 /*
  * AVX2 kernel (unaligned): splits interleaved 8-bit complex samples into
  * separate I and Q float buffers, multiplying every value by 1/scalar.
  *
  * iBuffer       - receives num_points scaled in-phase values
  * qBuffer       - receives num_points scaled quadrature values
  * complexVector - num_points interleaved (I, Q) signed 8-bit pairs
  * scalar        - divisor applied to every sample (via its reciprocal)
  * num_points    - number of complex samples to convert
  *
  * Only unaligned loads/stores (_mm256_loadu_si256, _mm256_storeu_ps) are
  * used, so the buffers need no particular alignment.
  */
 static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
                                                             float* qBuffer,
                                                             const lv_8sc_t* complexVector,
                                                             const float scalar,
                                                             unsigned int num_points)
 {
     /* One reciprocal shared by the vector loop and the scalar tail. */
     const float iScalar = 1.0 / scalar;
     const __m256 scale = _mm256_set1_ps(iScalar);

     /* Shuffle control, applied independently to each 128-bit lane: even (I)
      * bytes go to the lane's low eight positions, odd (Q) bytes to its high
      * eight positions. */
     const __m256i splitIQ = _mm256_set_epi8(
         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);

     int8_t* src = (int8_t*)complexVector;
     float* iOut = iBuffer;
     float* qOut = qBuffer;

     const unsigned int blocks = num_points / 16;
     unsigned int n;

     /* Each pass consumes 32 bytes = 16 complex samples. */
     for (n = 0; n < blocks; n++) {
         __m256i packed = _mm256_loadu_si256((__m256i*)src);
         __m128i iBytes, qBytes;
         __m256 widened;

         src += 32;

         /* After the byte shuffle the quadwords hold I0-7, Q0-7, I8-15,
          * Q8-15; 0xd8 reorders them to I0-7, I8-15, Q0-7, Q8-15 so the low
          * lane is all I and the high lane is all Q. */
         packed = _mm256_shuffle_epi8(packed, splitIQ);
         packed = _mm256_permute4x64_epi64(packed, 0xd8);

         /* Samples 0..7 of each component (low eight bytes of each lane). */
         iBytes = _mm256_extractf128_si256(packed, 0);
         qBytes = _mm256_extractf128_si256(packed, 1);

         widened = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(iBytes));
         _mm256_storeu_ps(iOut, _mm256_mul_ps(widened, scale));

         widened = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(qBytes));
         _mm256_storeu_ps(qOut, _mm256_mul_ps(widened, scale));

         /* Shift each lane right by eight bytes to expose samples 8..15. */
         packed = _mm256_srli_si256(packed, 8);
         iBytes = _mm256_extractf128_si256(packed, 0);
         qBytes = _mm256_extractf128_si256(packed, 1);

         widened = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(iBytes));
         _mm256_storeu_ps(iOut + 8, _mm256_mul_ps(widened, scale));

         widened = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(qBytes));
         _mm256_storeu_ps(qOut + 8, _mm256_mul_ps(widened, scale));

         iOut += 16;
         qOut += 16;
     }

     /* Scalar tail for the num_points % 16 samples left over. */
     for (n = blocks * 16; n < num_points; n++) {
         *iOut++ = (float)(*src++) * iScalar;
         *qOut++ = (float)(*src++) * iScalar;
     }
 }

 #endif /* LV_HAVE_AVX2 */


 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */

__m128
float32x4_t __m128
Definition: sse2neon.h:235

_mm_set_epi8
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140

_mm_shuffle_ps
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586

_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205

_mm_set_ps1
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437

_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069

_mm_cvtepi8_epi32
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition: sse2neon.h:7574

_mm_load_si128
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471

_mm_srli_si128
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885

_MM_SHUFFLE
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195

_mm_load_ps
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858

__m128i
int64x2_t __m128i
Definition: sse2neon.h:244

_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704

_mm_cvtepi32_ps
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937

volk_8ic_s32f_deinterleave_32f_x2_a_sse
static void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:126

volk_8ic_s32f_deinterleave_32f_x2_generic
static void volk_8ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:315

volk_common.h

__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65

lv_8sc_t
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70