Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
44 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 
51 #ifdef LV_HAVE_SSE4_1
52 #include <smmintrin.h>
53 
54 static inline void
55 volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
56  float* qBuffer,
57  const lv_8sc_t* complexVector,
58  const float scalar,
59  unsigned int num_points)
60 {
61  float* iBufferPtr = iBuffer;
62  float* qBufferPtr = qBuffer;
63 
64  unsigned int number = 0;
65  const unsigned int eighthPoints = num_points / 8;
66  __m128 iFloatValue, qFloatValue;
67 
68  const float iScalar = 1.0 / scalar;
69  __m128 invScalar = _mm_set_ps1(iScalar);
70  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
71  int8_t* complexVectorPtr = (int8_t*)complexVector;
72 
73  __m128i iMoveMask = _mm_set_epi8(
74  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
75  __m128i qMoveMask = _mm_set_epi8(
76  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
77 
78  for (; number < eighthPoints; number++) {
79  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
80  complexVectorPtr += 16;
81  iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
82  qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
83 
84  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
85  iFloatValue = _mm_cvtepi32_ps(iIntVal);
86  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
87  _mm_store_ps(iBufferPtr, iFloatValue);
88  iBufferPtr += 4;
89 
90  iComplexVal = _mm_srli_si128(iComplexVal, 4);
91 
92  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
93  iFloatValue = _mm_cvtepi32_ps(iIntVal);
94  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
95  _mm_store_ps(iBufferPtr, iFloatValue);
96  iBufferPtr += 4;
97 
98  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
99  qFloatValue = _mm_cvtepi32_ps(qIntVal);
100  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
101  _mm_store_ps(qBufferPtr, qFloatValue);
102  qBufferPtr += 4;
103 
104  qComplexVal = _mm_srli_si128(qComplexVal, 4);
105 
106  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
107  qFloatValue = _mm_cvtepi32_ps(qIntVal);
108  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
109  _mm_store_ps(qBufferPtr, qFloatValue);
110 
111  qBufferPtr += 4;
112  }
113 
114  number = eighthPoints * 8;
115  for (; number < num_points; number++) {
116  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
117  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
118  }
119 }
120 #endif /* LV_HAVE_SSE4_1 */
121 
122 
123 #ifdef LV_HAVE_SSE
124 #include <xmmintrin.h>
125 
126 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
127  float* qBuffer,
128  const lv_8sc_t* complexVector,
129  const float scalar,
130  unsigned int num_points)
131 {
132  float* iBufferPtr = iBuffer;
133  float* qBufferPtr = qBuffer;
134 
135  unsigned int number = 0;
136  const unsigned int quarterPoints = num_points / 4;
137  __m128 cplxValue1, cplxValue2, iValue, qValue;
138 
139  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
140  int8_t* complexVectorPtr = (int8_t*)complexVector;
141 
142  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
143 
144  for (; number < quarterPoints; number++) {
145  floatBuffer[0] = (float)(complexVectorPtr[0]);
146  floatBuffer[1] = (float)(complexVectorPtr[1]);
147  floatBuffer[2] = (float)(complexVectorPtr[2]);
148  floatBuffer[3] = (float)(complexVectorPtr[3]);
149 
150  floatBuffer[4] = (float)(complexVectorPtr[4]);
151  floatBuffer[5] = (float)(complexVectorPtr[5]);
152  floatBuffer[6] = (float)(complexVectorPtr[6]);
153  floatBuffer[7] = (float)(complexVectorPtr[7]);
154 
155  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
156  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
157 
158  complexVectorPtr += 8;
159 
160  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
161  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
162 
163  // Arrange in i1i2i3i4 format
164  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
165  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
166 
167  _mm_store_ps(iBufferPtr, iValue);
168  _mm_store_ps(qBufferPtr, qValue);
169 
170  iBufferPtr += 4;
171  qBufferPtr += 4;
172  }
173 
174  number = quarterPoints * 4;
175  complexVectorPtr = (int8_t*)&complexVector[number];
176  for (; number < num_points; number++) {
177  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
178  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
179  }
180 }
181 #endif /* LV_HAVE_SSE */
182 
183 
184 #ifdef LV_HAVE_AVX2
185 #include <immintrin.h>
186 
187 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
188  float* qBuffer,
189  const lv_8sc_t* complexVector,
190  const float scalar,
191  unsigned int num_points)
192 {
193  float* iBufferPtr = iBuffer;
194  float* qBufferPtr = qBuffer;
195 
196  unsigned int number = 0;
197  const unsigned int sixteenthPoints = num_points / 16;
198  __m256 iFloatValue, qFloatValue;
199 
200  const float iScalar = 1.0 / scalar;
201  __m256 invScalar = _mm256_set1_ps(iScalar);
202  __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
203  int8_t* complexVectorPtr = (int8_t*)complexVector;
204 
205  __m256i iMoveMask = _mm256_set_epi8(0x80,
206  0x80,
207  0x80,
208  0x80,
209  0x80,
210  0x80,
211  0x80,
212  0x80,
213  14,
214  12,
215  10,
216  8,
217  6,
218  4,
219  2,
220  0,
221  0x80,
222  0x80,
223  0x80,
224  0x80,
225  0x80,
226  0x80,
227  0x80,
228  0x80,
229  14,
230  12,
231  10,
232  8,
233  6,
234  4,
235  2,
236  0);
237  __m256i qMoveMask = _mm256_set_epi8(0x80,
238  0x80,
239  0x80,
240  0x80,
241  0x80,
242  0x80,
243  0x80,
244  0x80,
245  15,
246  13,
247  11,
248  9,
249  7,
250  5,
251  3,
252  1,
253  0x80,
254  0x80,
255  0x80,
256  0x80,
257  0x80,
258  0x80,
259  0x80,
260  0x80,
261  15,
262  13,
263  11,
264  9,
265  7,
266  5,
267  3,
268  1);
269 
270  for (; number < sixteenthPoints; number++) {
271  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
272  complexVectorPtr += 32;
273  iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
274  qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
275 
276  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
277  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
278  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
279  _mm256_store_ps(iBufferPtr, iFloatValue);
280  iBufferPtr += 8;
281 
282  iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
283  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
284  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
285  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
286  _mm256_store_ps(iBufferPtr, iFloatValue);
287  iBufferPtr += 8;
288 
289  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
290  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
291  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
292  _mm256_store_ps(qBufferPtr, qFloatValue);
293  qBufferPtr += 8;
294 
295  qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
296  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
297  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
298  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
299  _mm256_store_ps(qBufferPtr, qFloatValue);
300  qBufferPtr += 8;
301  }
302 
303  number = sixteenthPoints * 16;
304  for (; number < num_points; number++) {
305  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
306  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
307  }
308 }
309 #endif /* LV_HAVE_AVX2 */
310 
311 
312 #ifdef LV_HAVE_GENERIC
313 
314 static inline void
316  float* qBuffer,
317  const lv_8sc_t* complexVector,
318  const float scalar,
319  unsigned int num_points)
320 {
321  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
322  float* iBufferPtr = iBuffer;
323  float* qBufferPtr = qBuffer;
324  unsigned int number;
325  const float invScalar = 1.0 / scalar;
326  for (number = 0; number < num_points; number++) {
327  *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
328  *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
329  }
330 }
331 #endif /* LV_HAVE_GENERIC */
332 
333 
334 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
335 
336 
337 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
338 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
339 
340 #include <inttypes.h>
341 #include <stdio.h>
342 #include <volk/volk_common.h>
343 
344 #ifdef LV_HAVE_AVX2
345 #include <immintrin.h>
346 
347 static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
348  float* qBuffer,
349  const lv_8sc_t* complexVector,
350  const float scalar,
351  unsigned int num_points)
352 {
353  float* iBufferPtr = iBuffer;
354  float* qBufferPtr = qBuffer;
355 
356  unsigned int number = 0;
357  const unsigned int sixteenthPoints = num_points / 16;
358  __m256 iFloatValue, qFloatValue;
359 
360  const float iScalar = 1.0 / scalar;
361  __m256 invScalar = _mm256_set1_ps(iScalar);
362  __m256i complexVal, iIntVal, qIntVal;
363  __m128i iComplexVal, qComplexVal;
364  int8_t* complexVectorPtr = (int8_t*)complexVector;
365 
366  __m256i MoveMask = _mm256_set_epi8(15,
367  13,
368  11,
369  9,
370  7,
371  5,
372  3,
373  1,
374  14,
375  12,
376  10,
377  8,
378  6,
379  4,
380  2,
381  0,
382  15,
383  13,
384  11,
385  9,
386  7,
387  5,
388  3,
389  1,
390  14,
391  12,
392  10,
393  8,
394  6,
395  4,
396  2,
397  0);
398 
399  for (; number < sixteenthPoints; number++) {
400  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
401  complexVectorPtr += 32;
402  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
403  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
404  iComplexVal = _mm256_extractf128_si256(complexVal, 0);
405  qComplexVal = _mm256_extractf128_si256(complexVal, 1);
406 
407  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
408  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
409  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
410  _mm256_storeu_ps(iBufferPtr, iFloatValue);
411  iBufferPtr += 8;
412 
413  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
414  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
415  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
416  _mm256_storeu_ps(qBufferPtr, qFloatValue);
417  qBufferPtr += 8;
418 
419  complexVal = _mm256_srli_si256(complexVal, 8);
420  iComplexVal = _mm256_extractf128_si256(complexVal, 0);
421  qComplexVal = _mm256_extractf128_si256(complexVal, 1);
422 
423  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
424  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
425  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
426  _mm256_storeu_ps(iBufferPtr, iFloatValue);
427  iBufferPtr += 8;
428 
429  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
430  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
431  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
432  _mm256_storeu_ps(qBufferPtr, qFloatValue);
433  qBufferPtr += 8;
434  }
435 
436  number = sixteenthPoints * 16;
437  for (; number < num_points; number++) {
438  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
439  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
440  }
441 }
442 #endif /* LV_HAVE_AVX2 */
443 
444 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition: sse2neon.h:7574
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:126
static void volk_8ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:315
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70