Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
61 #define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
69  float* qBuffer,
70  const lv_32fc_t* complexVector,
71  unsigned int num_points)
72 {
73  const float* complexVectorPtr = (float*)complexVector;
74  float* iBufferPtr = iBuffer;
75  float* qBufferPtr = qBuffer;
76 
77  unsigned int number = 0;
78  // Mask for real and imaginary parts
79  const unsigned int eighthPoints = num_points / 8;
80  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
81  for (; number < eighthPoints; number++) {
82  cplxValue1 = _mm256_load_ps(complexVectorPtr);
83  complexVectorPtr += 8;
84 
85  cplxValue2 = _mm256_load_ps(complexVectorPtr);
86  complexVectorPtr += 8;
87 
88  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
89  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
90 
91  // Arrange in i1i2i3i4 format
92  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
93  // Arrange in q1q2q3q4 format
94  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
95 
96  _mm256_store_ps(iBufferPtr, iValue);
97  _mm256_store_ps(qBufferPtr, qValue);
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  for (; number < num_points; number++) {
105  *iBufferPtr++ = *complexVectorPtr++;
106  *qBufferPtr++ = *complexVectorPtr++;
107  }
108 }
109 #endif /* LV_HAVE_AVX */
110 
111 #ifdef LV_HAVE_SSE
112 #include <xmmintrin.h>
113 
114 static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
115  float* qBuffer,
116  const lv_32fc_t* complexVector,
117  unsigned int num_points)
118 {
119  const float* complexVectorPtr = (float*)complexVector;
120  float* iBufferPtr = iBuffer;
121  float* qBufferPtr = qBuffer;
122 
123  unsigned int number = 0;
124  const unsigned int quarterPoints = num_points / 4;
125  __m128 cplxValue1, cplxValue2, iValue, qValue;
126  for (; number < quarterPoints; number++) {
127  cplxValue1 = _mm_load_ps(complexVectorPtr);
128  complexVectorPtr += 4;
129 
130  cplxValue2 = _mm_load_ps(complexVectorPtr);
131  complexVectorPtr += 4;
132 
133  // Arrange in i1i2i3i4 format
134  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
135  // Arrange in q1q2q3q4 format
136  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
137 
138  _mm_store_ps(iBufferPtr, iValue);
139  _mm_store_ps(qBufferPtr, qValue);
140 
141  iBufferPtr += 4;
142  qBufferPtr += 4;
143  }
144 
145  number = quarterPoints * 4;
146  for (; number < num_points; number++) {
147  *iBufferPtr++ = *complexVectorPtr++;
148  *qBufferPtr++ = *complexVectorPtr++;
149  }
150 }
151 #endif /* LV_HAVE_SSE */
152 
153 
154 #ifdef LV_HAVE_NEON
155 #include <arm_neon.h>
156 
/*!
 * \brief Deinterleaves a complex float vector into separate I and Q float
 * buffers, four complex samples per NEON pass.
 *
 * vld2q_f32 performs the de-interleave while loading, so each pass is one
 * structured load followed by two plain vector stores. The trailing
 * num_points % 4 samples are copied one at a time.
 *
 * \param iBuffer Output buffer receiving the first float of every sample
 * \param qBuffer Output buffer receiving the second float of every sample
 * \param complexVector Input samples, read as interleaved floats I0 Q0 I1 Q1 ...
 * \param num_points Number of complex samples to process
 */
static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
                                                      float* qBuffer,
                                                      const lv_32fc_t* complexVector,
                                                      unsigned int num_points)
{
    const float* src = (const float*)complexVector;
    float* realOut = iBuffer;
    float* imagOut = qBuffer;
    unsigned int remaining = num_points;

    while (remaining >= 4) {
        /* val[0] receives the even-indexed floats (I), val[1] the odd (Q). */
        const float32x4x2_t split = vld2q_f32(src);

        vst1q_f32(realOut, split.val[0]);
        vst1q_f32(imagOut, split.val[1]);

        src += 8;
        realOut += 4;
        imagOut += 4;
        remaining -= 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector pass. */
    while (remaining != 0) {
        *realOut = src[0];
        *imagOut = src[1];
        src += 2;
        realOut++;
        imagOut++;
        remaining--;
    }
}
183 #endif /* LV_HAVE_NEON */
184 
185 
186 #ifdef LV_HAVE_GENERIC
187 
188 static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
189  float* qBuffer,
190  const lv_32fc_t* complexVector,
191  unsigned int num_points)
192 {
193  const float* complexVectorPtr = (float*)complexVector;
194  float* iBufferPtr = iBuffer;
195  float* qBufferPtr = qBuffer;
196  unsigned int number;
197  for (number = 0; number < num_points; number++) {
198  *iBufferPtr++ = *complexVectorPtr++;
199  *qBufferPtr++ = *complexVectorPtr++;
200  }
201 }
202 #endif /* LV_HAVE_GENERIC */
203 
204 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
205 
206 
207 #ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
208 #define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
209 
210 #include <inttypes.h>
211 #include <stdio.h>
212 
213 #ifdef LV_HAVE_AVX
214 #include <immintrin.h>
215 static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
216  float* qBuffer,
217  const lv_32fc_t* complexVector,
218  unsigned int num_points)
219 {
220  const float* complexVectorPtr = (float*)complexVector;
221  float* iBufferPtr = iBuffer;
222  float* qBufferPtr = qBuffer;
223 
224  unsigned int number = 0;
225  // Mask for real and imaginary parts
226  const unsigned int eighthPoints = num_points / 8;
227  __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
228  for (; number < eighthPoints; number++) {
229  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
230  complexVectorPtr += 8;
231 
232  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
233  complexVectorPtr += 8;
234 
235  complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
236  complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
237 
238  // Arrange in i1i2i3i4 format
239  iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
240  // Arrange in q1q2q3q4 format
241  qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
242 
243  _mm256_storeu_ps(iBufferPtr, iValue);
244  _mm256_storeu_ps(qBufferPtr, qValue);
245 
246  iBufferPtr += 8;
247  qBufferPtr += 8;
248  }
249 
250  number = eighthPoints * 8;
251  for (; number < num_points; number++) {
252  *iBufferPtr++ = *complexVectorPtr++;
253  *qBufferPtr++ = *complexVectorPtr++;
254  }
255 }
256 #endif /* LV_HAVE_AVX */
257 #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:188
static void volk_32fc_deinterleave_32f_x2_a_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:68
static void volk_32fc_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:114
static void volk_32fc_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:157
static void volk_32fc_deinterleave_32f_x2_u_avx(float *iBuffer, float *qBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_32f_x2.h:215
float complex lv_32fc_t
Definition: volk_complex.h:74