Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_real_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
43 #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
44 
45 #include <inttypes.h>
46 #include <stdio.h>
47 #include <volk/volk_common.h>
48 
49 #ifdef LV_HAVE_AVX2
50 #include <immintrin.h>
51 
52 static inline void
53 volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
54  const lv_8sc_t* complexVector,
55  const float scalar,
56  unsigned int num_points)
57 {
58  float* iBufferPtr = iBuffer;
59 
60  unsigned int number = 0;
61  const unsigned int sixteenthPoints = num_points / 16;
62  __m256 iFloatValue;
63 
64  const float iScalar = 1.0 / scalar;
65  __m256 invScalar = _mm256_set1_ps(iScalar);
66  __m256i complexVal, iIntVal;
67  int8_t* complexVectorPtr = (int8_t*)complexVector;
68 
69  __m256i moveMask = _mm256_set_epi8(0x80,
70  0x80,
71  0x80,
72  0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  14,
78  12,
79  10,
80  8,
81  6,
82  4,
83  2,
84  0,
85  0x80,
86  0x80,
87  0x80,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  14,
94  12,
95  10,
96  8,
97  6,
98  4,
99  2,
100  0);
101  for (; number < sixteenthPoints; number++) {
102  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
103  complexVectorPtr += 32;
104  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
105 
106  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
107  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
108  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
109  _mm256_store_ps(iBufferPtr, iFloatValue);
110  iBufferPtr += 8;
111 
112  complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
113  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
114  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
115  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
116  _mm256_store_ps(iBufferPtr, iFloatValue);
117  iBufferPtr += 8;
118  }
119 
120  number = sixteenthPoints * 16;
121  for (; number < num_points; number++) {
122  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
123  complexVectorPtr++;
124  }
125 }
126 #endif /* LV_HAVE_AVX2 */
127 
128 
129 #ifdef LV_HAVE_SSE4_1
130 #include <smmintrin.h>
131 
132 static inline void
133 volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
134  const lv_8sc_t* complexVector,
135  const float scalar,
136  unsigned int num_points)
137 {
138  float* iBufferPtr = iBuffer;
139 
140  unsigned int number = 0;
141  const unsigned int eighthPoints = num_points / 8;
142  __m128 iFloatValue;
143 
144  const float iScalar = 1.0 / scalar;
145  __m128 invScalar = _mm_set_ps1(iScalar);
146  __m128i complexVal, iIntVal;
147  int8_t* complexVectorPtr = (int8_t*)complexVector;
148 
149  __m128i moveMask = _mm_set_epi8(
150  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
151 
152  for (; number < eighthPoints; number++) {
153  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
154  complexVectorPtr += 16;
155  complexVal = _mm_shuffle_epi8(complexVal, moveMask);
156 
157  iIntVal = _mm_cvtepi8_epi32(complexVal);
158  iFloatValue = _mm_cvtepi32_ps(iIntVal);
159 
160  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
161 
162  _mm_store_ps(iBufferPtr, iFloatValue);
163 
164  iBufferPtr += 4;
165 
166  complexVal = _mm_srli_si128(complexVal, 4);
167  iIntVal = _mm_cvtepi8_epi32(complexVal);
168  iFloatValue = _mm_cvtepi32_ps(iIntVal);
169 
170  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
171 
172  _mm_store_ps(iBufferPtr, iFloatValue);
173 
174  iBufferPtr += 4;
175  }
176 
177  number = eighthPoints * 8;
178  for (; number < num_points; number++) {
179  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
180  complexVectorPtr++;
181  }
182 }
183 #endif /* LV_HAVE_SSE4_1 */
184 
185 
186 #ifdef LV_HAVE_SSE
187 #include <xmmintrin.h>
188 
189 static inline void
191  const lv_8sc_t* complexVector,
192  const float scalar,
193  unsigned int num_points)
194 {
195  float* iBufferPtr = iBuffer;
196 
197  unsigned int number = 0;
198  const unsigned int quarterPoints = num_points / 4;
199  __m128 iValue;
200 
201  const float iScalar = 1.0 / scalar;
202  __m128 invScalar = _mm_set_ps1(iScalar);
203  int8_t* complexVectorPtr = (int8_t*)complexVector;
204 
205  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
206 
207  for (; number < quarterPoints; number++) {
208  floatBuffer[0] = (float)(*complexVectorPtr);
209  complexVectorPtr += 2;
210  floatBuffer[1] = (float)(*complexVectorPtr);
211  complexVectorPtr += 2;
212  floatBuffer[2] = (float)(*complexVectorPtr);
213  complexVectorPtr += 2;
214  floatBuffer[3] = (float)(*complexVectorPtr);
215  complexVectorPtr += 2;
216 
217  iValue = _mm_load_ps(floatBuffer);
218 
219  iValue = _mm_mul_ps(iValue, invScalar);
220 
221  _mm_store_ps(iBufferPtr, iValue);
222 
223  iBufferPtr += 4;
224  }
225 
226  number = quarterPoints * 4;
227  for (; number < num_points; number++) {
228  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
229  complexVectorPtr++;
230  }
231 }
232 #endif /* LV_HAVE_SSE */
233 
234 
235 #ifdef LV_HAVE_GENERIC
236 
237 static inline void
239  const lv_8sc_t* complexVector,
240  const float scalar,
241  unsigned int num_points)
242 {
243  unsigned int number = 0;
244  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
245  float* iBufferPtr = iBuffer;
246  const float invScalar = 1.0 / scalar;
247  for (number = 0; number < num_points; number++) {
248  *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
249  complexVectorPtr++;
250  }
251 }
252 #endif /* LV_HAVE_GENERIC */
253 
254 
255 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
256 
257 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
258 #define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
259 
260 #include <inttypes.h>
261 #include <stdio.h>
262 #include <volk/volk_common.h>
263 
264 #ifdef LV_HAVE_AVX2
265 #include <immintrin.h>
266 
267 static inline void
268 volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
269  const lv_8sc_t* complexVector,
270  const float scalar,
271  unsigned int num_points)
272 {
273  float* iBufferPtr = iBuffer;
274 
275  unsigned int number = 0;
276  const unsigned int sixteenthPoints = num_points / 16;
277  __m256 iFloatValue;
278 
279  const float iScalar = 1.0 / scalar;
280  __m256 invScalar = _mm256_set1_ps(iScalar);
281  __m256i complexVal, iIntVal;
282  __m128i hcomplexVal;
283  int8_t* complexVectorPtr = (int8_t*)complexVector;
284 
285  __m256i moveMask = _mm256_set_epi8(0x80,
286  0x80,
287  0x80,
288  0x80,
289  0x80,
290  0x80,
291  0x80,
292  0x80,
293  14,
294  12,
295  10,
296  8,
297  6,
298  4,
299  2,
300  0,
301  0x80,
302  0x80,
303  0x80,
304  0x80,
305  0x80,
306  0x80,
307  0x80,
308  0x80,
309  14,
310  12,
311  10,
312  8,
313  6,
314  4,
315  2,
316  0);
317 
318  for (; number < sixteenthPoints; number++) {
319  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
320  complexVectorPtr += 32;
321  complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
322 
323  hcomplexVal = _mm256_extracti128_si256(complexVal, 0);
324  iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
325  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
326 
327  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
328 
329  _mm256_storeu_ps(iBufferPtr, iFloatValue);
330 
331  iBufferPtr += 8;
332 
333  hcomplexVal = _mm256_extracti128_si256(complexVal, 1);
334  iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
335  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
336 
337  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
338 
339  _mm256_storeu_ps(iBufferPtr, iFloatValue);
340 
341  iBufferPtr += 8;
342  }
343 
344  number = sixteenthPoints * 16;
345  for (; number < num_points; number++) {
346  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
347  complexVectorPtr++;
348  }
349 }
350 #endif /* LV_HAVE_AVX2 */
351 
352 
353 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition: sse2neon.h:7574
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_8ic_s32f_deinterleave_real_32f_generic(float *iBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_real_32f.h:238
static void volk_8ic_s32f_deinterleave_real_32f_a_sse(float *iBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_real_32f.h:190
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70