Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
62 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
64 
65 #include <inttypes.h>
66 #include <stdio.h>
67 #include <volk/volk_common.h>
68 
69 #ifdef LV_HAVE_AVX2
70 #include <immintrin.h>
71 
72 static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
73  const float* iBuffer,
74  const float* qBuffer,
75  const float scalar,
76  unsigned int num_points)
77 {
78  unsigned int number = 0;
79  const float* iBufferPtr = iBuffer;
80  const float* qBufferPtr = qBuffer;
81 
82  __m256 vScalar = _mm256_set1_ps(scalar);
83 
84  const unsigned int eighthPoints = num_points / 8;
85 
86  __m256 iValue, qValue, cplxValue1, cplxValue2;
87  __m256i intValue1, intValue2;
88 
89  int16_t* complexVectorPtr = (int16_t*)complexVector;
90 
91  for (; number < eighthPoints; number++) {
92  iValue = _mm256_load_ps(iBufferPtr);
93  qValue = _mm256_load_ps(qBufferPtr);
94 
95  // Interleaves the lower two values in the i and q variables into one buffer
96  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
98 
99  // Interleaves the upper two values in the i and q variables into one buffer
100  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
102 
103  intValue1 = _mm256_cvtps_epi32(cplxValue1);
104  intValue2 = _mm256_cvtps_epi32(cplxValue2);
105 
106  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
107 
108  _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109  complexVectorPtr += 16;
110 
111  iBufferPtr += 8;
112  qBufferPtr += 8;
113  }
114 
115  number = eighthPoints * 8;
116  complexVectorPtr = (int16_t*)(&complexVector[number]);
117  for (; number < num_points; number++) {
118  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
120  }
121 }
122 #endif /* LV_HAVE_AVX2 */
123 
124 
125 #ifdef LV_HAVE_SSE2
126 #include <emmintrin.h>
127 
128 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
129  const float* iBuffer,
130  const float* qBuffer,
131  const float scalar,
132  unsigned int num_points)
133 {
134  unsigned int number = 0;
135  const float* iBufferPtr = iBuffer;
136  const float* qBufferPtr = qBuffer;
137 
138  __m128 vScalar = _mm_set_ps1(scalar);
139 
140  const unsigned int quarterPoints = num_points / 4;
141 
142  __m128 iValue, qValue, cplxValue1, cplxValue2;
143  __m128i intValue1, intValue2;
144 
145  int16_t* complexVectorPtr = (int16_t*)complexVector;
146 
147  for (; number < quarterPoints; number++) {
148  iValue = _mm_load_ps(iBufferPtr);
149  qValue = _mm_load_ps(qBufferPtr);
150 
151  // Interleaves the lower two values in the i and q variables into one buffer
152  cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153  cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
154 
155  // Interleaves the upper two values in the i and q variables into one buffer
156  cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157  cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
158 
159  intValue1 = _mm_cvtps_epi32(cplxValue1);
160  intValue2 = _mm_cvtps_epi32(cplxValue2);
161 
162  intValue1 = _mm_packs_epi32(intValue1, intValue2);
163 
164  _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165  complexVectorPtr += 8;
166 
167  iBufferPtr += 4;
168  qBufferPtr += 4;
169  }
170 
171  number = quarterPoints * 4;
172  complexVectorPtr = (int16_t*)(&complexVector[number]);
173  for (; number < num_points; number++) {
174  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
176  }
177 }
178 #endif /* LV_HAVE_SSE2 */
179 
180 
181 #ifdef LV_HAVE_SSE
182 #include <xmmintrin.h>
183 
184 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
185  const float* iBuffer,
186  const float* qBuffer,
187  const float scalar,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191  const float* iBufferPtr = iBuffer;
192  const float* qBufferPtr = qBuffer;
193 
194  __m128 vScalar = _mm_set_ps1(scalar);
195 
196  const unsigned int quarterPoints = num_points / 4;
197 
198  __m128 iValue, qValue, cplxValue;
199 
200  int16_t* complexVectorPtr = (int16_t*)complexVector;
201 
202  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
203 
204  for (; number < quarterPoints; number++) {
205  iValue = _mm_load_ps(iBufferPtr);
206  qValue = _mm_load_ps(qBufferPtr);
207 
208  // Interleaves the lower two values in the i and q variables into one buffer
209  cplxValue = _mm_unpacklo_ps(iValue, qValue);
210  cplxValue = _mm_mul_ps(cplxValue, vScalar);
211 
212  _mm_store_ps(floatBuffer, cplxValue);
213 
214  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
218 
219  // Interleaves the upper two values in the i and q variables into one buffer
220  cplxValue = _mm_unpackhi_ps(iValue, qValue);
221  cplxValue = _mm_mul_ps(cplxValue, vScalar);
222 
223  _mm_store_ps(floatBuffer, cplxValue);
224 
225  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
229 
230  iBufferPtr += 4;
231  qBufferPtr += 4;
232  }
233 
234  number = quarterPoints * 4;
235  complexVectorPtr = (int16_t*)(&complexVector[number]);
236  for (; number < num_points; number++) {
237  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
239  }
240 }
241 #endif /* LV_HAVE_SSE */
242 
243 
244 #ifdef LV_HAVE_GENERIC
245 
246 static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
247  const float* iBuffer,
248  const float* qBuffer,
249  const float scalar,
250  unsigned int num_points)
251 {
252  int16_t* complexVectorPtr = (int16_t*)complexVector;
253  const float* iBufferPtr = iBuffer;
254  const float* qBufferPtr = qBuffer;
255  unsigned int number = 0;
256 
257  for (number = 0; number < num_points; number++) {
258  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
260  }
261 }
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
266 
267 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
269 
270 #include <inttypes.h>
271 #include <stdio.h>
272 #include <volk/volk_common.h>
273 
274 #ifdef LV_HAVE_AVX2
275 #include <immintrin.h>
276 
277 static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
278  const float* iBuffer,
279  const float* qBuffer,
280  const float scalar,
281  unsigned int num_points)
282 {
283  unsigned int number = 0;
284  const float* iBufferPtr = iBuffer;
285  const float* qBufferPtr = qBuffer;
286 
287  __m256 vScalar = _mm256_set1_ps(scalar);
288 
289  const unsigned int eighthPoints = num_points / 8;
290 
291  __m256 iValue, qValue, cplxValue1, cplxValue2;
292  __m256i intValue1, intValue2;
293 
294  int16_t* complexVectorPtr = (int16_t*)complexVector;
295 
296  for (; number < eighthPoints; number++) {
297  iValue = _mm256_loadu_ps(iBufferPtr);
298  qValue = _mm256_loadu_ps(qBufferPtr);
299 
300  // Interleaves the lower two values in the i and q variables into one buffer
301  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
303 
304  // Interleaves the upper two values in the i and q variables into one buffer
305  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
307 
308  intValue1 = _mm256_cvtps_epi32(cplxValue1);
309  intValue2 = _mm256_cvtps_epi32(cplxValue2);
310 
311  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
312 
313  _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314  complexVectorPtr += 16;
315 
316  iBufferPtr += 8;
317  qBufferPtr += 8;
318  }
319 
320  number = eighthPoints * 8;
321  complexVectorPtr = (int16_t*)(&complexVector[number]);
322  for (; number < num_points; number++) {
323  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
325  }
326 }
327 #endif /* LV_HAVE_AVX2 */
328 
329 
330 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
static float rintf(float x)
Definition: config.h:45
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:128
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:184
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:246
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
short complex lv_16sc_t
Definition: volk_complex.h:71