Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
45 #ifndef INCLUDED_volk_16ic_convert_32fc_a_H
46 #define INCLUDED_volk_16ic_convert_32fc_a_H
47 
48 #include <volk/volk_complex.h>
49 
50 #ifdef LV_HAVE_AVX2
51 #include <immintrin.h>
52 
53 static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
54  const lv_16sc_t* inputVector,
55  unsigned int num_points)
56 {
57  const unsigned int avx_iters = num_points / 8;
58  unsigned int number = 0;
59  const int16_t* complexVectorPtr = (int16_t*)inputVector;
60  float* outputVectorPtr = (float*)outputVector;
61  __m256 outVal;
62  __m256i outValInt;
63  __m128i cplxValue;
64 
65  for (number = 0; number < avx_iters; number++) {
66  cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
67  complexVectorPtr += 8;
68 
69  outValInt = _mm256_cvtepi16_epi32(cplxValue);
70  outVal = _mm256_cvtepi32_ps(outValInt);
71  _mm256_store_ps((float*)outputVectorPtr, outVal);
72 
73  outputVectorPtr += 8;
74  }
75 
76  number = avx_iters * 8;
77  for (; number < num_points * 2; number++) {
78  *outputVectorPtr++ = (float)*complexVectorPtr++;
79  }
80 }
81 
82 #endif /* LV_HAVE_AVX2 */
83 
84 #ifdef LV_HAVE_GENERIC
85 
86 static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
87  const lv_16sc_t* inputVector,
88  unsigned int num_points)
89 {
90  unsigned int i;
91  for (i = 0; i < num_points; i++) {
92  outputVector[i] =
93  lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
94  }
95 }
96 
97 #endif /* LV_HAVE_GENERIC */
98 
99 
100 #ifdef LV_HAVE_SSE2
101 #include <emmintrin.h>
102 
103 static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
104  const lv_16sc_t* inputVector,
105  unsigned int num_points)
106 {
107  const unsigned int sse_iters = num_points / 2;
108 
109  const lv_16sc_t* _in = inputVector;
110  lv_32fc_t* _out = outputVector;
111  __m128 a;
112  unsigned int number;
113 
114  for (number = 0; number < sse_iters; number++) {
115  a = _mm_set_ps(
116  (float)(lv_cimag(_in[1])),
117  (float)(lv_creal(_in[1])),
118  (float)(lv_cimag(_in[0])),
119  (float)(lv_creal(
120  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
121  _mm_store_ps((float*)_out, a);
122  _in += 2;
123  _out += 2;
124  }
125  if (num_points & 1) {
126  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
127  _in++;
128  }
129 }
130 
131 #endif /* LV_HAVE_SSE2 */
132 
133 #ifdef LV_HAVE_AVX
134 #include <immintrin.h>
135 
136 static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
137  const lv_16sc_t* inputVector,
138  unsigned int num_points)
139 {
140  const unsigned int sse_iters = num_points / 4;
141 
142  const lv_16sc_t* _in = inputVector;
143  lv_32fc_t* _out = outputVector;
144  __m256 a;
145  unsigned int i, number;
146 
147  for (number = 0; number < sse_iters; number++) {
148  a = _mm256_set_ps(
149  (float)(lv_cimag(_in[3])),
150  (float)(lv_creal(_in[3])),
151  (float)(lv_cimag(_in[2])),
152  (float)(lv_creal(_in[2])),
153  (float)(lv_cimag(_in[1])),
154  (float)(lv_creal(_in[1])),
155  (float)(lv_cimag(_in[0])),
156  (float)(lv_creal(
157  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
158  _mm256_store_ps((float*)_out, a);
159  _in += 4;
160  _out += 4;
161  }
162 
163  for (i = 0; i < (num_points % 4); ++i) {
164  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
165  _in++;
166  }
167 }
168 
169 #endif /* LV_HAVE_AVX */
170 
171 
172 #ifdef LV_HAVE_NEON
173 #include <arm_neon.h>
174 
/*
 * Converts 16-bit integer complex samples to 32-bit float complex samples,
 * two samples per iteration, using NEON. Four int16 values are loaded,
 * widened to int32, converted to float and stored.
 *
 * outputVector: destination buffer.
 * inputVector:  source buffer.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const lv_16sc_t* src = inputVector;
    lv_32fc_t* dst = outputVector;
    unsigned int pairs_left;

    for (pairs_left = num_points / 2; pairs_left != 0; --pairs_left) {
        /* 4 x int16 == 2 complex samples. */
        const int16x4_t narrow = vld1_s16((const int16_t*)src);
        __VOLK_PREFETCH(src + 4);
        const int32x4_t wide = vmovl_s16(narrow);
        vst1q_f32((float32_t*)dst, vcvtq_f32_s32(wide));
        src += 2;
        dst += 2;
    }

    /* An odd count leaves exactly one trailing sample for the scalar path. */
    if ((num_points & 1u) != 0) {
        *dst = lv_cmake((float)lv_creal(*src), (float)lv_cimag(*src));
    }
}
203 #endif /* LV_HAVE_NEON */
204 
205 #endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
206 
207 #ifndef INCLUDED_volk_16ic_convert_32fc_u_H
208 #define INCLUDED_volk_16ic_convert_32fc_u_H
209 
210 #include <volk/volk_complex.h>
211 
212 
213 #ifdef LV_HAVE_AVX2
214 #include <immintrin.h>
215 
216 static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
217  const lv_16sc_t* inputVector,
218  unsigned int num_points)
219 {
220  const unsigned int avx_iters = num_points / 8;
221  unsigned int number = 0;
222  const int16_t* complexVectorPtr = (int16_t*)inputVector;
223  float* outputVectorPtr = (float*)outputVector;
224  __m256 outVal;
225  __m256i outValInt;
226  __m128i cplxValue;
227 
228  for (number = 0; number < avx_iters; number++) {
229  cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
230  complexVectorPtr += 8;
231 
232  outValInt = _mm256_cvtepi16_epi32(cplxValue);
233  outVal = _mm256_cvtepi32_ps(outValInt);
234  _mm256_storeu_ps((float*)outputVectorPtr, outVal);
235 
236  outputVectorPtr += 8;
237  }
238 
239  number = avx_iters * 8;
240  for (; number < num_points * 2; number++) {
241  *outputVectorPtr++ = (float)*complexVectorPtr++;
242  }
243 }
244 
245 #endif /* LV_HAVE_AVX2 */
246 
247 #ifdef LV_HAVE_SSE2
248 #include <emmintrin.h>
249 
250 static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
251  const lv_16sc_t* inputVector,
252  unsigned int num_points)
253 {
254  const unsigned int sse_iters = num_points / 2;
255 
256  const lv_16sc_t* _in = inputVector;
257  lv_32fc_t* _out = outputVector;
258  __m128 a;
259  unsigned int number;
260 
261  for (number = 0; number < sse_iters; number++) {
262  a = _mm_set_ps(
263  (float)(lv_cimag(_in[1])),
264  (float)(lv_creal(_in[1])),
265  (float)(lv_cimag(_in[0])),
266  (float)(lv_creal(
267  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
268  _mm_storeu_ps((float*)_out, a);
269  _in += 2;
270  _out += 2;
271  }
272  if (num_points & 1) {
273  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
274  _in++;
275  }
276 }
277 
278 #endif /* LV_HAVE_SSE2 */
279 
280 
281 #ifdef LV_HAVE_AVX
282 #include <immintrin.h>
283 
284 static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
285  const lv_16sc_t* inputVector,
286  unsigned int num_points)
287 {
288  const unsigned int sse_iters = num_points / 4;
289 
290  const lv_16sc_t* _in = inputVector;
291  lv_32fc_t* _out = outputVector;
292  __m256 a;
293  unsigned int i, number;
294 
295  for (number = 0; number < sse_iters; number++) {
296  a = _mm256_set_ps(
297  (float)(lv_cimag(_in[3])),
298  (float)(lv_creal(_in[3])),
299  (float)(lv_cimag(_in[2])),
300  (float)(lv_creal(_in[2])),
301  (float)(lv_cimag(_in[1])),
302  (float)(lv_creal(_in[1])),
303  (float)(lv_cimag(_in[0])),
304  (float)(lv_creal(
305  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
306  _mm256_storeu_ps((float*)_out, a);
307  _in += 4;
308  _out += 4;
309  }
310 
311  for (i = 0; i < (num_points % 4); ++i) {
312  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
313  _in++;
314  }
315 }
316 
317 #endif /* LV_HAVE_AVX */
318 #endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:86
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:284
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:136
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:250
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:175
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:103
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
short complex lv_16sc_t
Definition: volk_complex.h:71
for i
Definition: volk_config_fixed.tmpl.h:13