Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
61 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 #include <volk/volk_common.h>
66 
67 
68 #ifdef LV_HAVE_AVX2
69 #include <immintrin.h>
70 
71 static inline void
72 volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
73  const lv_32fc_t* complexVector,
74  const float scalar,
75  unsigned int num_points)
76 {
77  unsigned int number = 0;
78  const unsigned int eighthPoints = num_points / 8;
79 
80  const float* complexVectorPtr = (float*)complexVector;
81  int16_t* iBufferPtr = iBuffer;
82 
83  __m256 vScalar = _mm256_set1_ps(scalar);
84 
85  __m256 cplxValue1, cplxValue2, iValue;
86  __m256i a;
87  __m128i b;
88 
89  __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
90 
91  for (; number < eighthPoints; number++) {
92  cplxValue1 = _mm256_load_ps(complexVectorPtr);
93  complexVectorPtr += 8;
94 
95  cplxValue2 = _mm256_load_ps(complexVectorPtr);
96  complexVectorPtr += 8;
97 
98  // Arrange in i1i2i3i4 format
99  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
100 
101  iValue = _mm256_mul_ps(iValue, vScalar);
102 
103  iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
104  a = _mm256_cvtps_epi32(iValue);
105  a = _mm256_packs_epi32(a, a);
106  a = _mm256_permutevar8x32_epi32(a, idx);
107  b = _mm256_extracti128_si256(a, 0);
108 
109  _mm_store_si128((__m128i*)iBufferPtr, b);
110  iBufferPtr += 8;
111  }
112 
113  number = eighthPoints * 8;
114  iBufferPtr = &iBuffer[number];
115  for (; number < num_points; number++) {
116  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
117  complexVectorPtr++;
118  }
119 }
120 
121 
122 #endif /* LV_HAVE_AVX2 */
123 
124 #ifdef LV_HAVE_SSE
125 #include <xmmintrin.h>
126 
127 static inline void
129  const lv_32fc_t* complexVector,
130  const float scalar,
131  unsigned int num_points)
132 {
133  unsigned int number = 0;
134  const unsigned int quarterPoints = num_points / 4;
135 
136  const float* complexVectorPtr = (float*)complexVector;
137  int16_t* iBufferPtr = iBuffer;
138 
139  __m128 vScalar = _mm_set_ps1(scalar);
140 
141  __m128 cplxValue1, cplxValue2, iValue;
142 
143  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
144 
145  for (; number < quarterPoints; number++) {
146  cplxValue1 = _mm_load_ps(complexVectorPtr);
147  complexVectorPtr += 4;
148 
149  cplxValue2 = _mm_load_ps(complexVectorPtr);
150  complexVectorPtr += 4;
151 
152  // Arrange in i1i2i3i4 format
153  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
154 
155  iValue = _mm_mul_ps(iValue, vScalar);
156 
157  _mm_store_ps(floatBuffer, iValue);
158  *iBufferPtr++ = (int16_t)(floatBuffer[0]);
159  *iBufferPtr++ = (int16_t)(floatBuffer[1]);
160  *iBufferPtr++ = (int16_t)(floatBuffer[2]);
161  *iBufferPtr++ = (int16_t)(floatBuffer[3]);
162  }
163 
164  number = quarterPoints * 4;
165  iBufferPtr = &iBuffer[number];
166  for (; number < num_points; number++) {
167  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
168  complexVectorPtr++;
169  }
170 }
171 
172 #endif /* LV_HAVE_SSE */
173 
174 
175 #ifdef LV_HAVE_GENERIC
176 
177 static inline void
179  const lv_32fc_t* complexVector,
180  const float scalar,
181  unsigned int num_points)
182 {
183  const float* complexVectorPtr = (float*)complexVector;
184  int16_t* iBufferPtr = iBuffer;
185  unsigned int number = 0;
186  for (number = 0; number < num_points; number++) {
187  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
188  complexVectorPtr++;
189  }
190 }
191 
192 #endif /* LV_HAVE_GENERIC */
193 
194 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
195 
196 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
197 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
198 
199 #include <inttypes.h>
200 #include <stdio.h>
201 #include <volk/volk_common.h>
202 
203 #ifdef LV_HAVE_AVX2
204 #include <immintrin.h>
205 
206 static inline void
207 volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
208  const lv_32fc_t* complexVector,
209  const float scalar,
210  unsigned int num_points)
211 {
212  unsigned int number = 0;
213  const unsigned int eighthPoints = num_points / 8;
214 
215  const float* complexVectorPtr = (float*)complexVector;
216  int16_t* iBufferPtr = iBuffer;
217 
218  __m256 vScalar = _mm256_set1_ps(scalar);
219 
220  __m256 cplxValue1, cplxValue2, iValue;
221  __m256i a;
222  __m128i b;
223 
224  __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
225 
226  for (; number < eighthPoints; number++) {
227  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
228  complexVectorPtr += 8;
229 
230  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
231  complexVectorPtr += 8;
232 
233  // Arrange in i1i2i3i4 format
234  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
235 
236  iValue = _mm256_mul_ps(iValue, vScalar);
237 
238  iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
239  a = _mm256_cvtps_epi32(iValue);
240  a = _mm256_packs_epi32(a, a);
241  a = _mm256_permutevar8x32_epi32(a, idx);
242  b = _mm256_extracti128_si256(a, 0);
243 
244  _mm_storeu_si128((__m128i*)iBufferPtr, b);
245  iBufferPtr += 8;
246  }
247 
248  number = eighthPoints * 8;
249  iBufferPtr = &iBuffer[number];
250  for (; number < num_points; number++) {
251  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
252  complexVectorPtr++;
253  }
254 }
255 
256 #endif /* LV_HAVE_AVX2 */
257 
258 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
#define _MM_FROUND_TO_ZERO
Definition: sse2neon.h:202
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:178
static void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:128
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
float complex lv_32fc_t
Definition: volk_complex.h:74