Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
43 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
44 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
45 
46 #include <inttypes.h>
47 #include <stdio.h>
48 #include <volk/volk_common.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void
54 volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
55  float* qBuffer,
56  const lv_16sc_t* complexVector,
57  const float scalar,
58  unsigned int num_points)
59 {
60  float* iBufferPtr = iBuffer;
61  float* qBufferPtr = qBuffer;
62 
63  uint64_t number = 0;
64  const uint64_t eighthPoints = num_points / 8;
65  __m256 cplxValue1, cplxValue2, iValue, qValue;
66  __m256i cplxValueA, cplxValueB;
67  __m128i cplxValue128;
68 
69  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
70  int16_t* complexVectorPtr = (int16_t*)complexVector;
71  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
72 
73  for (; number < eighthPoints; number++) {
74 
75  cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
76  complexVectorPtr += 16;
77 
78  // cvt
79  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
80  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
81  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
82  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
83  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
84  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
85 
86  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
87  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
88 
89  // Arrange in i1i2i3i4 format
90  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
91  iValue = _mm256_permutevar8x32_ps(iValue, idx);
92  // Arrange in q1q2q3q4 format
93  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
94  qValue = _mm256_permutevar8x32_ps(qValue, idx);
95 
96  _mm256_store_ps(iBufferPtr, iValue);
97  _mm256_store_ps(qBufferPtr, qValue);
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  complexVectorPtr = (int16_t*)&complexVector[number];
105  for (; number < num_points; number++) {
106  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
107  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
108  }
109 }
110 #endif /* LV_HAVE_AVX2 */
111 
112 #ifdef LV_HAVE_SSE
113 #include <xmmintrin.h>
114 
115 static inline void
117  float* qBuffer,
118  const lv_16sc_t* complexVector,
119  const float scalar,
120  unsigned int num_points)
121 {
122  float* iBufferPtr = iBuffer;
123  float* qBufferPtr = qBuffer;
124 
125  uint64_t number = 0;
126  const uint64_t quarterPoints = num_points / 4;
127  __m128 cplxValue1, cplxValue2, iValue, qValue;
128 
129  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
130  int16_t* complexVectorPtr = (int16_t*)complexVector;
131 
132  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
133 
134  for (; number < quarterPoints; number++) {
135 
136  floatBuffer[0] = (float)(complexVectorPtr[0]);
137  floatBuffer[1] = (float)(complexVectorPtr[1]);
138  floatBuffer[2] = (float)(complexVectorPtr[2]);
139  floatBuffer[3] = (float)(complexVectorPtr[3]);
140 
141  floatBuffer[4] = (float)(complexVectorPtr[4]);
142  floatBuffer[5] = (float)(complexVectorPtr[5]);
143  floatBuffer[6] = (float)(complexVectorPtr[6]);
144  floatBuffer[7] = (float)(complexVectorPtr[7]);
145 
146  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
147  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
148 
149  complexVectorPtr += 8;
150 
151  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
152  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
153 
154  // Arrange in i1i2i3i4 format
155  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
156  // Arrange in q1q2q3q4 format
157  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
158 
159  _mm_store_ps(iBufferPtr, iValue);
160  _mm_store_ps(qBufferPtr, qValue);
161 
162  iBufferPtr += 4;
163  qBufferPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  complexVectorPtr = (int16_t*)&complexVector[number];
168  for (; number < num_points; number++) {
169  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
170  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
171  }
172 }
173 #endif /* LV_HAVE_SSE */
174 
175 #ifdef LV_HAVE_GENERIC
176 
177 static inline void
179  float* qBuffer,
180  const lv_16sc_t* complexVector,
181  const float scalar,
182  unsigned int num_points)
183 {
184  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
185  float* iBufferPtr = iBuffer;
186  float* qBufferPtr = qBuffer;
187  unsigned int number;
188  for (number = 0; number < num_points; number++) {
189  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
190  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
191  }
192 }
193 #endif /* LV_HAVE_GENERIC */
194 
195 #ifdef LV_HAVE_NEON
196 #include <arm_neon.h>
/*
 * Split interleaved 16-bit complex samples into two float buffers while
 * scaling every component by 1/scalar (ARM NEON variant).
 *
 * iBuffer        output, receives the first int16 of every sample pair
 * qBuffer        output, receives the second int16 of every sample pair
 * complexVector  input, num_points interleaved int16 pairs
 * scalar         divisor applied to every component
 * num_points     number of complex samples to convert
 *
 * Four samples are handled per iteration; the remaining (num_points % 4)
 * samples go through a scalar loop.
 *
 * Note: the vector loop multiplies by a precomputed reciprocal whereas the
 * scalar tail divides by `scalar` directly.
 */
static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
                                                           float* qBuffer,
                                                           const lv_16sc_t* complexVector,
                                                           const float scalar,
                                                           unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    float* iOut = iBuffer;
    float* qOut = qBuffer;

    const unsigned int quarter_points = num_points / 4;
    const float reciprocal = 1.f / scalar;
    const float32x4_t scale = vld1q_dup_f32(&reciprocal);
    unsigned int n;

    for (n = 0; n < quarter_points; n++) {
        // The 2-way structured load already separates even and odd elements.
        const int16x4x2_t split = vld2_s16(src);

        // Widen int16 -> int32, then convert to float.
        const float32x4_t iVals = vcvtq_f32_s32(vmovl_s16(split.val[0]));
        const float32x4_t qVals = vcvtq_f32_s32(vmovl_s16(split.val[1]));

        vst1q_f32(iOut, vmulq_f32(iVals, scale));
        vst1q_f32(qOut, vmulq_f32(qVals, scale));

        src += 8;
        iOut += 4;
        qOut += 4;
    }

    // Scalar tail for the samples that do not fill a group of four.
    for (n = quarter_points * 4; n < num_points; n++) {
        *iOut++ = (float)(*src++) / scalar;
        *qOut++ = (float)(*src++) / scalar;
    }
}
236 #endif /* LV_HAVE_NEON */
237 
238 #ifdef LV_HAVE_ORC
239 extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
240  float* qBuffer,
241  const lv_16sc_t* complexVector,
242  const float scalar,
243  unsigned int num_points);
244 
245 static inline void
246 volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
247  float* qBuffer,
248  const lv_16sc_t* complexVector,
249  const float scalar,
250  unsigned int num_points)
251 {
252  volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
253  iBuffer, qBuffer, complexVector, scalar, num_points);
254 }
255 #endif /* LV_HAVE_ORC */
256 
257 
258 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
259 
260 
261 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
262 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
263 
264 #include <inttypes.h>
265 #include <stdio.h>
266 #include <volk/volk_common.h>
267 
268 #ifdef LV_HAVE_AVX2
269 #include <immintrin.h>
270 
271 static inline void
272 volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
273  float* qBuffer,
274  const lv_16sc_t* complexVector,
275  const float scalar,
276  unsigned int num_points)
277 {
278  float* iBufferPtr = iBuffer;
279  float* qBufferPtr = qBuffer;
280 
281  uint64_t number = 0;
282  const uint64_t eighthPoints = num_points / 8;
283  __m256 cplxValue1, cplxValue2, iValue, qValue;
284  __m256i cplxValueA, cplxValueB;
285  __m128i cplxValue128;
286 
287  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
288  int16_t* complexVectorPtr = (int16_t*)complexVector;
289  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
290 
291  for (; number < eighthPoints; number++) {
292 
293  cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
294  complexVectorPtr += 16;
295 
296  // cvt
297  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
298  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
299  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
300  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
301  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
302  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
303 
304  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
305  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
306 
307  // Arrange in i1i2i3i4 format
308  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
309  iValue = _mm256_permutevar8x32_ps(iValue, idx);
310  // Arrange in q1q2q3q4 format
311  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
312  qValue = _mm256_permutevar8x32_ps(qValue, idx);
313 
314  _mm256_storeu_ps(iBufferPtr, iValue);
315  _mm256_storeu_ps(qBufferPtr, qValue);
316 
317  iBufferPtr += 8;
318  qBufferPtr += 8;
319  }
320 
321  number = eighthPoints * 8;
322  complexVectorPtr = (int16_t*)&complexVector[number];
323  for (; number < num_points; number++) {
324  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
325  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
326  }
327 }
328 #endif /* LV_HAVE_AVX2 */
329 
330 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_16ic_s32f_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:197
static void volk_16ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:178
static void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:116
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
short complex lv_16sc_t
Definition: volk_complex.h:71