61 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62 #define INCLUDED_volk_32f_x2_add_32f_u_H
67 #ifdef LV_HAVE_AVX512F
68 #include <immintrin.h>
/*
 * Element-wise float addition, AVX-512F variant using the unaligned
 * load/store intrinsics. Each main-loop iteration loads one __m512 from aPtr
 * and one from bPtr, adds them, and stores the sum through cPtr; the scalar
 * loop then handles the remaining num_points % 16 elements.
 *
 * NOTE(review): this listing has lost lines (the aVector/bVector parameters,
 * the braces, and whatever sits between listing lines 90 and 97 -- presumably
 * the per-iteration pointer advances). Confirm against the complete header.
 */
70 static inline void volk_32f_x2_add_32f_u_avx512f(
float* cVector,
73 unsigned int num_points)
75 unsigned int number = 0;
/* Number of full 16-float vectors that fit in num_points. */
76 const unsigned int sixteenthPoints = num_points / 16;
78 float* cPtr = cVector;
79 const float* aPtr = aVector;
80 const float* bPtr = bVector;
82 __m512 aVal, bVal, cVal;
83 for (; number < sixteenthPoints; number++) {
/* loadu/storeu: the unaligned forms of the 512-bit load and store. */
85 aVal = _mm512_loadu_ps(aPtr);
86 bVal = _mm512_loadu_ps(bPtr);
88 cVal = _mm512_add_ps(aVal, bVal);
90 _mm512_storeu_ps(cPtr, cVal);
/* Convert the vector count into an element index for the scalar tail. */
97 number = sixteenthPoints * 16;
99 for (; number < num_points; number++) {
100 *cPtr++ = (*aPtr++) + (*bPtr++);
108 #include <immintrin.h>
/*
 * Parameter tail and body of the unaligned AVX variant. The signature line
 * is not visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_u_avx at header line 110.
 *
 * The main loop adds one 8-float __m256 vector per iteration using unaligned
 * loads and stores; the scalar loop handles the remaining num_points % 8
 * elements.
 *
 * NOTE(review): braces and the lines between listing lines 128 and 135
 * (presumably the pointer advances) are missing here -- confirm against the
 * complete header.
 */
111 const float* aVector,
112 const float* bVector,
113 unsigned int num_points)
115 unsigned int number = 0;
/* Number of full 8-float vectors that fit in num_points. */
116 const unsigned int eighthPoints = num_points / 8;
117 float* cPtr = cVector;
118 const float* aPtr = aVector;
119 const float* bPtr = bVector;
120 __m256 aVal, bVal, cVal;
121 for (; number < eighthPoints; number++) {
123 aVal = _mm256_loadu_ps(aPtr);
124 bVal = _mm256_loadu_ps(bPtr);
126 cVal = _mm256_add_ps(aVal, bVal);
128 _mm256_storeu_ps(cPtr, cVal);
/* Element index at which the scalar tail starts. */
135 number = eighthPoints * 8;
137 for (; number < num_points; number++) {
138 *cPtr++ = (*aPtr++) + (*bPtr++);
145 #include <xmmintrin.h>
/*
 * Parameter tail and body of the unaligned SSE variant. The signature line
 * is not visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_u_sse at header line 147.
 *
 * NOTE(review): the body of the 4-wide vector loop (listing lines 161-173)
 * is missing here. The cross-reference list mentions _mm_loadu_ps,
 * _mm_add_ps and _mm_storeu_ps, which presumably belong to it -- confirm
 * against the complete header.
 */
148 const float* aVector,
149 const float* bVector,
150 unsigned int num_points)
152 unsigned int number = 0;
/* Number of full 4-float groups that fit in num_points. */
153 const unsigned int quarterPoints = num_points / 4;
155 float* cPtr = cVector;
156 const float* aPtr = aVector;
157 const float* bPtr = bVector;
160 for (; number < quarterPoints; number++) {
/* Scalar tail: start at the first element past the last full group of 4. */
174 number = quarterPoints * 4;
175 for (; number < num_points; number++) {
176 *cPtr++ = (*aPtr++) + (*bPtr++);
182 #ifdef LV_HAVE_GENERIC
/*
 * Parameter tail and body of the portable scalar variant. The signature line
 * is not visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_generic at header line 184.
 *
 * Writes aVector[i] + bVector[i] to cVector[i] for each i in
 * [0, num_points), one element at a time.
 */
185 const float* aVector,
186 const float* bVector,
187 unsigned int num_points)
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
191 const float* bPtr = bVector;
192 unsigned int number = 0;
/* All three cursors advance by one element per iteration. */
194 for (number = 0; number < num_points; number++) {
195 *cPtr++ = (*aPtr++) + (*bPtr++);
202 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203 #define INCLUDED_volk_32f_x2_add_32f_a_H
205 #include <inttypes.h>
208 #ifdef LV_HAVE_AVX512F
209 #include <immintrin.h>
/*
 * Element-wise float addition, AVX-512F variant using the aligned load/store
 * intrinsics (_mm512_load_ps / _mm512_store_ps). Each main-loop iteration
 * adds one 16-float vector; the scalar loop handles the remaining
 * num_points % 16 elements.
 *
 * NOTE(review): the aligned intrinsics require suitably aligned pointers;
 * how callers guarantee that is not visible here. Braces and the lines
 * between listing lines 231 and 238 (presumably the pointer advances) are
 * also missing from this listing -- confirm against the complete header.
 */
211 static inline void volk_32f_x2_add_32f_a_avx512f(
float* cVector,
212 const float* aVector,
213 const float* bVector,
214 unsigned int num_points)
216 unsigned int number = 0;
/* Number of full 16-float vectors that fit in num_points. */
217 const unsigned int sixteenthPoints = num_points / 16;
219 float* cPtr = cVector;
220 const float* aPtr = aVector;
221 const float* bPtr = bVector;
223 __m512 aVal, bVal, cVal;
224 for (; number < sixteenthPoints; number++) {
226 aVal = _mm512_load_ps(aPtr);
227 bVal = _mm512_load_ps(bPtr);
229 cVal = _mm512_add_ps(aVal, bVal);
231 _mm512_store_ps(cPtr, cVal);
/* Element index at which the scalar tail starts. */
238 number = sixteenthPoints * 16;
240 for (; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);
249 #include <immintrin.h>
/*
 * Parameter tail and body of the aligned AVX variant. The signature line is
 * not visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_a_avx at header line 251.
 *
 * The main loop adds one 8-float __m256 vector per iteration using the
 * aligned load/store intrinsics; the scalar loop handles the remaining
 * num_points % 8 elements.
 *
 * NOTE(review): braces and the lines between listing lines 271 and 278
 * (presumably the pointer advances) are missing here -- confirm against the
 * complete header.
 */
252 const float* aVector,
253 const float* bVector,
254 unsigned int num_points)
256 unsigned int number = 0;
/* Number of full 8-float vectors that fit in num_points. */
257 const unsigned int eighthPoints = num_points / 8;
259 float* cPtr = cVector;
260 const float* aPtr = aVector;
261 const float* bPtr = bVector;
263 __m256 aVal, bVal, cVal;
264 for (; number < eighthPoints; number++) {
266 aVal = _mm256_load_ps(aPtr);
267 bVal = _mm256_load_ps(bPtr);
269 cVal = _mm256_add_ps(aVal, bVal);
271 _mm256_store_ps(cPtr, cVal);
/* Element index at which the scalar tail starts. */
278 number = eighthPoints * 8;
279 for (; number < num_points; number++) {
280 *cPtr++ = (*aPtr++) + (*bPtr++);
286 #include <xmmintrin.h>
/*
 * Parameter tail and body of the aligned SSE variant. The signature line is
 * not visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_a_sse at header line 288.
 *
 * NOTE(review): the body of the 4-wide vector loop (listing lines 302-313)
 * is missing here. The cross-reference list mentions _mm_load_ps,
 * _mm_add_ps and _mm_store_ps, which presumably belong to it -- confirm
 * against the complete header.
 */
289 const float* aVector,
290 const float* bVector,
291 unsigned int num_points)
293 unsigned int number = 0;
/* Number of full 4-float groups that fit in num_points. */
294 const unsigned int quarterPoints = num_points / 4;
296 float* cPtr = cVector;
297 const float* aPtr = aVector;
298 const float* bPtr = bVector;
301 for (; number < quarterPoints; number++) {
/* Scalar tail: start at the first element past the last full group of 4. */
314 number = quarterPoints * 4;
315 for (; number < num_points; number++) {
316 *cPtr++ = (*aPtr++) + (*bPtr++);
323 #include <arm_neon.h>
/*
 * Parameter tail and body of the NEON variant. The signature line is not
 * visible in this listing; the cross-reference list at the end of the
 * listing places volk_32f_x2_add_32f_u_neon at header line 325.
 *
 * The main loop adds one float32x4_t vector (4 floats) per iteration; the
 * scalar loop handles the remaining num_points % 4 elements.
 *
 * NOTE(review): listing lines 341-344 are not visible; the cross-reference
 * list mentions __VOLK_PREFETCH, which may belong there. Braces and the
 * lines between listing lines 347 and 354 (presumably the pointer advances)
 * are also missing -- confirm against the complete header.
 */
326 const float* aVector,
327 const float* bVector,
328 unsigned int num_points)
330 unsigned int number = 0;
/* Number of full 4-float vectors that fit in num_points. */
331 const unsigned int quarterPoints = num_points / 4;
333 float* cPtr = cVector;
334 const float* aPtr = aVector;
335 const float* bPtr = bVector;
336 float32x4_t aVal, bVal, cVal;
337 for (number = 0; number < quarterPoints; number++) {
339 aVal = vld1q_f32(aPtr);
340 bVal = vld1q_f32(bPtr);
345 cVal = vaddq_f32(aVal, bVal);
347 vst1q_f32(cPtr, cVal);
/* Element index at which the scalar tail starts. */
354 number = quarterPoints * 4;
355 for (; number < num_points; number++) {
356 *cPtr++ = (*aPtr++) + (*bPtr++);
362 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the definition lives outside this header. Takes the same
 * (cVector, aVector, bVector, num_points) parameter list as the other
 * variants in this file.
 * NOTE(review): presumably a hand-written assembly kernel, going by the
 * name -- confirm where it is defined.
 */
363 extern void volk_32f_x2_add_32f_a_neonasm(
float* cVector,
364 const float* aVector,
365 const float* bVector,
366 unsigned int num_points);
369 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the definition lives outside this header. Takes the same
 * (cVector, aVector, bVector, num_points) parameter list as the other
 * variants in this file.
 * NOTE(review): presumably a software-pipelined assembly kernel, going by
 * the name -- confirm where it is defined.
 */
370 extern void volk_32f_x2_add_32f_a_neonpipeline(
float* cVector,
371 const float* aVector,
372 const float* bVector,
373 unsigned int num_points);
376 #ifdef LV_HAVE_GENERIC
/*
 * Parameter tail and body of the scalar variant in the aligned section. The
 * signature line is not visible in this listing; the cross-reference list at
 * the end of the listing places volk_32f_x2_add_32f_a_generic at header
 * line 378.
 *
 * Writes aVector[i] + bVector[i] to cVector[i] for each i in
 * [0, num_points), one element at a time.
 */
379 const float* aVector,
380 const float* bVector,
381 unsigned int num_points)
383 float* cPtr = cVector;
384 const float* aPtr = aVector;
385 const float* bPtr = bVector;
386 unsigned int number = 0;
/* All three cursors advance by one element per iteration. */
388 for (number = 0; number < num_points; number++) {
389 *cPtr++ = (*aPtr++) + (*bPtr++);
/*
 * Prototype only: the definition lives outside this header. It is the
 * function that volk_32f_x2_add_32f_u_orc forwards to.
 * NOTE(review): presumably generated from an ORC program, going by the
 * name -- confirm where it is defined.
 */
397 extern void volk_32f_x2_add_32f_a_orc_impl(
float* cVector,
398 const float* aVector,
399 const float* bVector,
400 unsigned int num_points);
/*
 * Thin wrapper: forwards all four arguments unchanged to
 * volk_32f_x2_add_32f_a_orc_impl and does no work of its own.
 *
 * NOTE(review): this is named as an unaligned (_u_) entry point but calls an
 * implementation named _a_. Confirm that the implementation has no
 * alignment requirement on its pointers. The function braces are missing
 * from this listing.
 */
402 static inline void volk_32f_x2_add_32f_u_orc(
float* cVector,
403 const float* aVector,
404 const float* bVector,
405 unsigned int num_points)
407 volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:325
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:147
static void volk_32f_x2_add_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:378
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:251
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:110
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:288
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:184
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71