Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_real_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
44 #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void
54 volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
55  const lv_16sc_t* complexVector,
56  const float scalar,
57  unsigned int num_points)
58 {
59  float* iBufferPtr = iBuffer;
60 
61  unsigned int number = 0;
62  const unsigned int eighthPoints = num_points / 8;
63 
64  __m256 iFloatValue;
65 
66  const float iScalar = 1.0 / scalar;
67  __m256 invScalar = _mm256_set1_ps(iScalar);
68  __m256i complexVal, iIntVal;
69  __m128i complexVal128;
70  int8_t* complexVectorPtr = (int8_t*)complexVector;
71 
72  __m256i moveMask = _mm256_set_epi8(0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  13,
81  12,
82  9,
83  8,
84  5,
85  4,
86  1,
87  0,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  0x80,
94  0x80,
95  0x80,
96  13,
97  12,
98  9,
99  8,
100  5,
101  4,
102  1,
103  0);
104 
105  for (; number < eighthPoints; number++) {
106  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
107  complexVectorPtr += 32;
108  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
109  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
110  complexVal128 = _mm256_extracti128_si256(complexVal, 0);
111 
112  iIntVal = _mm256_cvtepi16_epi32(complexVal128);
113  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
114 
115  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
116 
117  _mm256_store_ps(iBufferPtr, iFloatValue);
118 
119  iBufferPtr += 8;
120  }
121 
122  number = eighthPoints * 8;
123  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
124  for (; number < num_points; number++) {
125  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
126  sixteenTComplexVectorPtr++;
127  }
128 }
129 #endif /* LV_HAVE_AVX2 */
130 
131 #ifdef LV_HAVE_SSE4_1
132 #include <smmintrin.h>
133 
134 static inline void
135 volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
136  const lv_16sc_t* complexVector,
137  const float scalar,
138  unsigned int num_points)
139 {
140  float* iBufferPtr = iBuffer;
141 
142  unsigned int number = 0;
143  const unsigned int quarterPoints = num_points / 4;
144 
145  __m128 iFloatValue;
146 
147  const float iScalar = 1.0 / scalar;
148  __m128 invScalar = _mm_set_ps1(iScalar);
149  __m128i complexVal, iIntVal;
150  int8_t* complexVectorPtr = (int8_t*)complexVector;
151 
152  __m128i moveMask = _mm_set_epi8(
153  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
154 
155  for (; number < quarterPoints; number++) {
156  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
157  complexVectorPtr += 16;
158  complexVal = _mm_shuffle_epi8(complexVal, moveMask);
159 
160  iIntVal = _mm_cvtepi16_epi32(complexVal);
161  iFloatValue = _mm_cvtepi32_ps(iIntVal);
162 
163  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
164 
165  _mm_store_ps(iBufferPtr, iFloatValue);
166 
167  iBufferPtr += 4;
168  }
169 
170  number = quarterPoints * 4;
171  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
172  for (; number < num_points; number++) {
173  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
174  sixteenTComplexVectorPtr++;
175  }
176 }
177 #endif /* LV_HAVE_SSE4_1 */
178 
179 #ifdef LV_HAVE_SSE
180 #include <xmmintrin.h>
181 
182 static inline void
184  const lv_16sc_t* complexVector,
185  const float scalar,
186  unsigned int num_points)
187 {
188  float* iBufferPtr = iBuffer;
189 
190  unsigned int number = 0;
191  const unsigned int quarterPoints = num_points / 4;
192  __m128 iValue;
193 
194  const float iScalar = 1.0 / scalar;
195  __m128 invScalar = _mm_set_ps1(iScalar);
196  int16_t* complexVectorPtr = (int16_t*)complexVector;
197 
198  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
199 
200  for (; number < quarterPoints; number++) {
201  floatBuffer[0] = (float)(*complexVectorPtr);
202  complexVectorPtr += 2;
203  floatBuffer[1] = (float)(*complexVectorPtr);
204  complexVectorPtr += 2;
205  floatBuffer[2] = (float)(*complexVectorPtr);
206  complexVectorPtr += 2;
207  floatBuffer[3] = (float)(*complexVectorPtr);
208  complexVectorPtr += 2;
209 
210  iValue = _mm_load_ps(floatBuffer);
211 
212  iValue = _mm_mul_ps(iValue, invScalar);
213 
214  _mm_store_ps(iBufferPtr, iValue);
215 
216  iBufferPtr += 4;
217  }
218 
219  number = quarterPoints * 4;
220  complexVectorPtr = (int16_t*)&complexVector[number];
221  for (; number < num_points; number++) {
222  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
223  complexVectorPtr++;
224  }
225 }
226 #endif /* LV_HAVE_SSE */
227 
228 #ifdef LV_HAVE_GENERIC
229 static inline void
231  const lv_16sc_t* complexVector,
232  const float scalar,
233  unsigned int num_points)
234 {
235  unsigned int number = 0;
236  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
237  float* iBufferPtr = iBuffer;
238  const float invScalar = 1.0 / scalar;
239  for (number = 0; number < num_points; number++) {
240  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
241  complexVectorPtr++;
242  }
243 }
244 #endif /* LV_HAVE_GENERIC */
245 
246 
247 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
248 
249 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
250 #define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
251 
252 #include <inttypes.h>
253 #include <stdio.h>
254 #include <volk/volk_common.h>
255 
256 #ifdef LV_HAVE_AVX2
257 #include <immintrin.h>
258 
259 static inline void
260 volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
261  const lv_16sc_t* complexVector,
262  const float scalar,
263  unsigned int num_points)
264 {
265  float* iBufferPtr = iBuffer;
266 
267  unsigned int number = 0;
268  const unsigned int eighthPoints = num_points / 8;
269 
270  __m256 iFloatValue;
271 
272  const float iScalar = 1.0 / scalar;
273  __m256 invScalar = _mm256_set1_ps(iScalar);
274  __m256i complexVal, iIntVal;
275  __m128i complexVal128;
276  int8_t* complexVectorPtr = (int8_t*)complexVector;
277 
278  __m256i moveMask = _mm256_set_epi8(0x80,
279  0x80,
280  0x80,
281  0x80,
282  0x80,
283  0x80,
284  0x80,
285  0x80,
286  13,
287  12,
288  9,
289  8,
290  5,
291  4,
292  1,
293  0,
294  0x80,
295  0x80,
296  0x80,
297  0x80,
298  0x80,
299  0x80,
300  0x80,
301  0x80,
302  13,
303  12,
304  9,
305  8,
306  5,
307  4,
308  1,
309  0);
310 
311  for (; number < eighthPoints; number++) {
312  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
313  complexVectorPtr += 32;
314  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
315  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
316  complexVal128 = _mm256_extracti128_si256(complexVal, 0);
317 
318  iIntVal = _mm256_cvtepi16_epi32(complexVal128);
319  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
320 
321  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
322 
323  _mm256_storeu_ps(iBufferPtr, iFloatValue);
324 
325  iBufferPtr += 8;
326  }
327 
328  number = eighthPoints * 8;
329  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
330  for (; number < num_points; number++) {
331  *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
332  sixteenTComplexVectorPtr++;
333  }
334 }
335 #endif /* LV_HAVE_AVX2 */
336 
337 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition: sse2neon.h:7539
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_16ic_s32f_deinterleave_real_32f_generic(float *iBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_real_32f.h:230
static void volk_16ic_s32f_deinterleave_real_32f_a_sse(float *iBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_real_32f.h:183
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
short complex lv_16sc_t
Definition: volk_complex.h:71