Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_interleave_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
61 #define INCLUDED_volk_32f_x2_interleave_32fc_a_H
62 
63 #include <inttypes.h>
64 #include <stdio.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 
69 static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
70  const float* iBuffer,
71  const float* qBuffer,
72  unsigned int num_points)
73 {
74  unsigned int number = 0;
75  float* complexVectorPtr = (float*)complexVector;
76  const float* iBufferPtr = iBuffer;
77  const float* qBufferPtr = qBuffer;
78 
79  const uint64_t eighthPoints = num_points / 8;
80 
81  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
82  for (; number < eighthPoints; number++) {
83  iValue = _mm256_load_ps(iBufferPtr);
84  qValue = _mm256_load_ps(qBufferPtr);
85 
86  // Interleaves the lower two values in the i and q variables into one buffer
87  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
88  // Interleaves the upper two values in the i and q variables into one buffer
89  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
90 
91  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
92  _mm256_store_ps(complexVectorPtr, cplxValue);
93  complexVectorPtr += 8;
94 
95  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
96  _mm256_store_ps(complexVectorPtr, cplxValue);
97  complexVectorPtr += 8;
98 
99  iBufferPtr += 8;
100  qBufferPtr += 8;
101  }
102 
103  number = eighthPoints * 8;
104  for (; number < num_points; number++) {
105  *complexVectorPtr++ = *iBufferPtr++;
106  *complexVectorPtr++ = *qBufferPtr++;
107  }
108 }
109 
110 #endif /* LV_HAVE_AVX */
111 
112 #ifdef LV_HAVE_SSE
113 #include <xmmintrin.h>
114 
115 static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
116  const float* iBuffer,
117  const float* qBuffer,
118  unsigned int num_points)
119 {
120  unsigned int number = 0;
121  float* complexVectorPtr = (float*)complexVector;
122  const float* iBufferPtr = iBuffer;
123  const float* qBufferPtr = qBuffer;
124 
125  const uint64_t quarterPoints = num_points / 4;
126 
127  __m128 iValue, qValue, cplxValue;
128  for (; number < quarterPoints; number++) {
129  iValue = _mm_load_ps(iBufferPtr);
130  qValue = _mm_load_ps(qBufferPtr);
131 
132  // Interleaves the lower two values in the i and q variables into one buffer
133  cplxValue = _mm_unpacklo_ps(iValue, qValue);
134  _mm_store_ps(complexVectorPtr, cplxValue);
135  complexVectorPtr += 4;
136 
137  // Interleaves the upper two values in the i and q variables into one buffer
138  cplxValue = _mm_unpackhi_ps(iValue, qValue);
139  _mm_store_ps(complexVectorPtr, cplxValue);
140  complexVectorPtr += 4;
141 
142  iBufferPtr += 4;
143  qBufferPtr += 4;
144  }
145 
146  number = quarterPoints * 4;
147  for (; number < num_points; number++) {
148  *complexVectorPtr++ = *iBufferPtr++;
149  *complexVectorPtr++ = *qBufferPtr++;
150  }
151 }
152 #endif /* LV_HAVE_SSE */
153 
154 
155 #ifdef LV_HAVE_NEON
156 #include <arm_neon.h>
157 
/*
 * Interleave two float streams into one complex stream (NEON variant).
 *
 * For every k in [0, num_points) the output holds iBuffer[k] as the first
 * float of element k and qBuffer[k] as the second.
 *
 * complexVector  destination, written as 2 * num_points consecutive floats
 * iBuffer        source of the first float of each output pair
 * qBuffer        source of the second float of each output pair
 * num_points     number of pairs to produce
 *
 * Four points are handled per vector iteration; a scalar loop handles the
 * num_points % 4 leftover points.
 */
static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
                                                    const float* iBuffer,
                                                    const float* qBuffer,
                                                    unsigned int num_points)
{
    float* out = (float*)complexVector;
    const float* inI = iBuffer;
    const float* inQ = qBuffer;

    unsigned int blocksLeft = num_points / 4;
    unsigned int leftover = num_points % 4;

    while (blocksLeft > 0) {
        float32x4x2_t pair;
        pair.val[0] = vld1q_f32(inI);
        pair.val[1] = vld1q_f32(inQ);
        // vst2q_f32 performs the interleave itself: it writes
        // val[0][0], val[1][0], val[0][1], val[1][1], ...
        vst2q_f32(out, pair);

        inI += 4;
        inQ += 4;
        out += 8;
        --blocksLeft;
    }

    // Scalar tail for the points that do not fill a whole 4-wide block.
    while (leftover > 0) {
        *out++ = *inI++;
        *out++ = *inQ++;
        --leftover;
    }
}
182 #endif /* LV_HAVE_NEON */
183 
184 
185 #ifdef LV_HAVE_GENERIC
186 
187 static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
188  const float* iBuffer,
189  const float* qBuffer,
190  unsigned int num_points)
191 {
192  float* complexVectorPtr = (float*)complexVector;
193  const float* iBufferPtr = iBuffer;
194  const float* qBufferPtr = qBuffer;
195  unsigned int number;
196 
197  for (number = 0; number < num_points; number++) {
198  *complexVectorPtr++ = *iBufferPtr++;
199  *complexVectorPtr++ = *qBufferPtr++;
200  }
201 }
202 #endif /* LV_HAVE_GENERIC */
203 
204 
205 #endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
206 
207 #ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
208 #define INCLUDED_volk_32f_x2_interleave_32fc_u_H
209 
210 #include <inttypes.h>
211 #include <stdio.h>
212 
213 #ifdef LV_HAVE_AVX
214 #include <immintrin.h>
215 
216 static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
217  const float* iBuffer,
218  const float* qBuffer,
219  unsigned int num_points)
220 {
221  unsigned int number = 0;
222  float* complexVectorPtr = (float*)complexVector;
223  const float* iBufferPtr = iBuffer;
224  const float* qBufferPtr = qBuffer;
225 
226  const uint64_t eighthPoints = num_points / 8;
227 
228  __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
229  for (; number < eighthPoints; number++) {
230  iValue = _mm256_loadu_ps(iBufferPtr);
231  qValue = _mm256_loadu_ps(qBufferPtr);
232 
233  // Interleaves the lower two values in the i and q variables into one buffer
234  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
235  // Interleaves the upper two values in the i and q variables into one buffer
236  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
237 
238  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
239  _mm256_storeu_ps(complexVectorPtr, cplxValue);
240  complexVectorPtr += 8;
241 
242  cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
243  _mm256_storeu_ps(complexVectorPtr, cplxValue);
244  complexVectorPtr += 8;
245 
246  iBufferPtr += 8;
247  qBufferPtr += 8;
248  }
249 
250  number = eighthPoints * 8;
251  for (; number < num_points; number++) {
252  *complexVectorPtr++ = *iBufferPtr++;
253  *complexVectorPtr++ = *qBufferPtr++;
254  }
255 }
256 #endif /* LV_HAVE_AVX */
257 
258 #endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:69
static void volk_32f_x2_interleave_32fc_generic(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:187
static void volk_32f_x2_interleave_32fc_neon(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:158
static void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:216
static void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t *complexVector, const float *iBuffer, const float *qBuffer, unsigned int num_points)
Definition: volk_32f_x2_interleave_32fc.h:115
float complex lv_32fc_t
Definition: volk_complex.h:74