Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_deinterleave_real_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
59 #define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX2
65 #include <immintrin.h>
66 
67 static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
68  const lv_32fc_t* complexVector,
69  unsigned int num_points)
70 {
71  unsigned int number = 0;
72 
73  const float* complexVectorPtr = (float*)complexVector;
74  double* iBufferPtr = iBuffer;
75 
76  const unsigned int quarterPoints = num_points / 4;
77  __m256 cplxValue;
78  __m128 fVal;
79  __m256d dVal;
80  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
81  for (; number < quarterPoints; number++) {
82 
83  cplxValue = _mm256_load_ps(complexVectorPtr);
84  complexVectorPtr += 8;
85 
86  // Arrange in i1i2i1i2 format
87  cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
88  fVal = _mm256_extractf128_ps(cplxValue, 0);
89  dVal = _mm256_cvtps_pd(fVal);
90  _mm256_store_pd(iBufferPtr, dVal);
91 
92  iBufferPtr += 4;
93  }
94 
95  number = quarterPoints * 4;
96  for (; number < num_points; number++) {
97  *iBufferPtr++ = (double)*complexVectorPtr++;
98  complexVectorPtr++;
99  }
100 }
101 #endif /* LV_HAVE_AVX2 */
102 
103 #ifdef LV_HAVE_SSE2
104 #include <emmintrin.h>
105 
106 static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
107  const lv_32fc_t* complexVector,
108  unsigned int num_points)
109 {
110  unsigned int number = 0;
111 
112  const float* complexVectorPtr = (float*)complexVector;
113  double* iBufferPtr = iBuffer;
114 
115  const unsigned int halfPoints = num_points / 2;
116  __m128 cplxValue, fVal;
117  __m128d dVal;
118  for (; number < halfPoints; number++) {
119 
120  cplxValue = _mm_load_ps(complexVectorPtr);
121  complexVectorPtr += 4;
122 
123  // Arrange in i1i2i1i2 format
124  fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
125  dVal = _mm_cvtps_pd(fVal);
126  _mm_store_pd(iBufferPtr, dVal);
127 
128  iBufferPtr += 2;
129  }
130 
131  number = halfPoints * 2;
132  for (; number < num_points; number++) {
133  *iBufferPtr++ = (double)*complexVectorPtr++;
134  complexVectorPtr++;
135  }
136 }
137 #endif /* LV_HAVE_SSE2 */
138 
139 #ifdef LV_HAVE_GENERIC
140 
141 static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
142  const lv_32fc_t* complexVector,
143  unsigned int num_points)
144 {
145  unsigned int number = 0;
146  const float* complexVectorPtr = (float*)complexVector;
147  double* iBufferPtr = iBuffer;
148  for (number = 0; number < num_points; number++) {
149  *iBufferPtr++ = (double)*complexVectorPtr++;
150  complexVectorPtr++;
151  }
152 }
153 #endif /* LV_HAVE_GENERIC */
154 
155 #ifdef LV_HAVE_NEONV8
156 #include <arm_neon.h>
157 
/*!
 * \brief Copies the real part of every complex float sample into a buffer of
 *        doubles (AArch64 NEON).
 *
 * Four complex samples (eight floats) are converted per vector iteration; the
 * remaining num_points % 4 samples are handled by a scalar loop.
 *
 * \param iBuffer       Output buffer that receives num_points doubles.
 * \param complexVector Input of num_points complex floats, read as interleaved
 *                      (real, imaginary) float pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    /* View the complex input as raw floats without dropping its const qualifier. */
    const float* complexVectorPtr = (const float*)complexVector;
    double* iBufferPtr = iBuffer;
    float32x2x4_t complexInput;
    float64x2_t iVal1;
    float64x2_t iVal2;
    float64x2x2_t iVal;

    for (number = 0; number < quarter_points; number++) {
        /* Stride-4 load: val[0] = {i1, i3}, val[1] = {q1, q3},
         *                val[2] = {i2, i4}, val[3] = {q2, q4} */
        complexInput = vld4_f32(complexVectorPtr);

        /* Widen only the two real-part registers to double precision. */
        iVal1 = vcvt_f64_f32(complexInput.val[0]);
        iVal2 = vcvt_f64_f32(complexInput.val[2]);
        iVal.val[0] = iVal1;
        iVal.val[1] = iVal2;

        /* The interleaving store zips {i1, i3} with {i2, i4}, which puts the
         * samples back in their original order: i1 i2 i3 i4. */
        vst2q_f64(iBufferPtr, iVal);

        /* Advance by four outputs and four complex inputs. */
        iBufferPtr += 4;
        complexVectorPtr += 8;
    }

    /* Scalar tail: take the real float, then skip the imaginary one. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *iBufferPtr++ = (double)*complexVectorPtr++;
        complexVectorPtr++;
    }
}
194 #endif /* LV_HAVE_NEONV8 */
195 
196 #endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
197 
198 #ifndef INCLUDED_volk_32fc_deinterleave_real_64f_u_H
199 #define INCLUDED_volk_32fc_deinterleave_real_64f_u_H
200 
201 #include <inttypes.h>
202 #include <stdio.h>
203 
204 #ifdef LV_HAVE_AVX2
205 #include <immintrin.h>
206 
207 static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
208  const lv_32fc_t* complexVector,
209  unsigned int num_points)
210 {
211  unsigned int number = 0;
212 
213  const float* complexVectorPtr = (float*)complexVector;
214  double* iBufferPtr = iBuffer;
215 
216  const unsigned int quarterPoints = num_points / 4;
217  __m256 cplxValue;
218  __m128 fVal;
219  __m256d dVal;
220  __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
221  for (; number < quarterPoints; number++) {
222 
223  cplxValue = _mm256_loadu_ps(complexVectorPtr);
224  complexVectorPtr += 8;
225 
226  // Arrange in i1i2i1i2 format
227  cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
228  fVal = _mm256_extractf128_ps(cplxValue, 0);
229  dVal = _mm256_cvtps_pd(fVal);
230  _mm256_storeu_pd(iBufferPtr, dVal);
231 
232  iBufferPtr += 4;
233  }
234 
235  number = quarterPoints * 4;
236  for (; number < num_points; number++) {
237  *iBufferPtr++ = (double)*complexVectorPtr++;
238  complexVectorPtr++;
239  }
240 }
241 #endif /* LV_HAVE_AVX2 */
242 
243 #endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
Definition: sse2neon.h:4096
float32x4_t __m128d
Definition: sse2neon.h:242
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5897
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
static void volk_32fc_deinterleave_real_64f_a_sse2(double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_real_64f.h:106
static void volk_32fc_deinterleave_real_64f_generic(double *iBuffer, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_deinterleave_real_64f.h:141
float complex lv_32fc_t
Definition: volk_complex.h:74