Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
62 #define INCLUDED_volk_32fc_x2_add_32fc_u_H
63 
64 #ifdef LV_HAVE_AVX
65 #include <immintrin.h>
66 
67 static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
68  const lv_32fc_t* aVector,
69  const lv_32fc_t* bVector,
70  unsigned int num_points)
71 {
72  unsigned int number = 0;
73  const unsigned int quarterPoints = num_points / 4;
74 
75  lv_32fc_t* cPtr = cVector;
76  const lv_32fc_t* aPtr = aVector;
77  const lv_32fc_t* bPtr = bVector;
78 
79  __m256 aVal, bVal, cVal;
80  for (; number < quarterPoints; number++) {
81 
82  aVal = _mm256_loadu_ps((float*)aPtr);
83  bVal = _mm256_loadu_ps((float*)bPtr);
84 
85  cVal = _mm256_add_ps(aVal, bVal);
86 
87  _mm256_storeu_ps((float*)cPtr,
88  cVal); // Store the results back into the C container
89 
90  aPtr += 4;
91  bPtr += 4;
92  cPtr += 4;
93  }
94 
95  number = quarterPoints * 4;
96  for (; number < num_points; number++) {
97  *cPtr++ = (*aPtr++) + (*bPtr++);
98  }
99 }
100 #endif /* LV_HAVE_AVX */
101 
102 
103 #ifdef LV_HAVE_AVX
104 #include <immintrin.h>
105 
106 static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
107  const lv_32fc_t* aVector,
108  const lv_32fc_t* bVector,
109  unsigned int num_points)
110 {
111  unsigned int number = 0;
112  const unsigned int quarterPoints = num_points / 4;
113 
114  lv_32fc_t* cPtr = cVector;
115  const lv_32fc_t* aPtr = aVector;
116  const lv_32fc_t* bPtr = bVector;
117 
118  __m256 aVal, bVal, cVal;
119  for (; number < quarterPoints; number++) {
120 
121  aVal = _mm256_load_ps((float*)aPtr);
122  bVal = _mm256_load_ps((float*)bPtr);
123 
124  cVal = _mm256_add_ps(aVal, bVal);
125 
126  _mm256_store_ps((float*)cPtr,
127  cVal); // Store the results back into the C container
128 
129  aPtr += 4;
130  bPtr += 4;
131  cPtr += 4;
132  }
133 
134  number = quarterPoints * 4;
135  for (; number < num_points; number++) {
136  *cPtr++ = (*aPtr++) + (*bPtr++);
137  }
138 }
139 #endif /* LV_HAVE_AVX */
140 
141 
142 #ifdef LV_HAVE_SSE
143 #include <xmmintrin.h>
144 
145 static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
146  const lv_32fc_t* aVector,
147  const lv_32fc_t* bVector,
148  unsigned int num_points)
149 {
150  unsigned int number = 0;
151  const unsigned int halfPoints = num_points / 2;
152 
153  lv_32fc_t* cPtr = cVector;
154  const lv_32fc_t* aPtr = aVector;
155  const lv_32fc_t* bPtr = bVector;
156 
157  __m128 aVal, bVal, cVal;
158  for (; number < halfPoints; number++) {
159 
160  aVal = _mm_loadu_ps((float*)aPtr);
161  bVal = _mm_loadu_ps((float*)bPtr);
162 
163  cVal = _mm_add_ps(aVal, bVal);
164 
165  _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
166 
167  aPtr += 2;
168  bPtr += 2;
169  cPtr += 2;
170  }
171 
172  number = halfPoints * 2;
173  for (; number < num_points; number++) {
174  *cPtr++ = (*aPtr++) + (*bPtr++);
175  }
176 }
177 #endif /* LV_HAVE_SSE */
178 
179 
180 #ifdef LV_HAVE_GENERIC
181 
182 static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
183  const lv_32fc_t* aVector,
184  const lv_32fc_t* bVector,
185  unsigned int num_points)
186 {
187  lv_32fc_t* cPtr = cVector;
188  const lv_32fc_t* aPtr = aVector;
189  const lv_32fc_t* bPtr = bVector;
190  unsigned int number = 0;
191 
192  for (number = 0; number < num_points; number++) {
193  *cPtr++ = (*aPtr++) + (*bPtr++);
194  }
195 }
196 #endif /* LV_HAVE_GENERIC */
197 
198 
199 #ifdef LV_HAVE_SSE
200 #include <xmmintrin.h>
201 
202 static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
203  const lv_32fc_t* aVector,
204  const lv_32fc_t* bVector,
205  unsigned int num_points)
206 {
207  unsigned int number = 0;
208  const unsigned int halfPoints = num_points / 2;
209 
210  lv_32fc_t* cPtr = cVector;
211  const lv_32fc_t* aPtr = aVector;
212  const lv_32fc_t* bPtr = bVector;
213 
214  __m128 aVal, bVal, cVal;
215  for (; number < halfPoints; number++) {
216  aVal = _mm_load_ps((float*)aPtr);
217  bVal = _mm_load_ps((float*)bPtr);
218 
219  cVal = _mm_add_ps(aVal, bVal);
220 
221  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
222 
223  aPtr += 2;
224  bPtr += 2;
225  cPtr += 2;
226  }
227 
228  number = halfPoints * 2;
229  for (; number < num_points; number++) {
230  *cPtr++ = (*aPtr++) + (*bPtr++);
231  }
232 }
233 #endif /* LV_HAVE_SSE */
234 
235 
236 #ifdef LV_HAVE_NEON
237 #include <arm_neon.h>
238 
/* NEON kernel: cVector[i] = aVector[i] + bVector[i].
 * One 128-bit q register holds two lv_32fc_t values (four float32 lanes),
 * so the pointers advance by 2 complex points per iteration. */
static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    unsigned int number;
    for (number = 0; number < halfPoints; number++) {
        const float32x4_t a = vld1q_f32((const float32_t*)aPtr);
        const float32x4_t b = vld1q_f32((const float32_t*)bPtr);
        // Hint the next pair of complex points into cache.
        __VOLK_PREFETCH(aPtr + 2);
        __VOLK_PREFETCH(bPtr + 2);

        vst1q_f32((float*)cPtr, vaddq_f32(a, b));

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    // At most one leftover point when num_points is odd.
    for (number = halfPoints * 2; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
273 
274 #endif /* LV_HAVE_NEON */
275 
276 
#endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:202
static void volk_32fc_x2_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:182
static void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:145
static void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:67
static void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:106
static void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:239
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74