61 #ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
62 #define INCLUDED_volk_32fc_x2_add_32fc_u_H
65 #include <immintrin.h>
70 unsigned int num_points)
72 unsigned int number = 0;
73 const unsigned int quarterPoints = num_points / 4;
79 __m256 aVal, bVal, cVal;
80 for (; number < quarterPoints; number++) {
82 aVal = _mm256_loadu_ps((
float*)aPtr);
83 bVal = _mm256_loadu_ps((
float*)bPtr);
85 cVal = _mm256_add_ps(aVal, bVal);
87 _mm256_storeu_ps((
float*)cPtr,
95 number = quarterPoints * 4;
96 for (; number < num_points; number++) {
97 *cPtr++ = (*aPtr++) + (*bPtr++);
104 #include <immintrin.h>
109 unsigned int num_points)
111 unsigned int number = 0;
112 const unsigned int quarterPoints = num_points / 4;
118 __m256 aVal, bVal, cVal;
119 for (; number < quarterPoints; number++) {
121 aVal = _mm256_load_ps((
float*)aPtr);
122 bVal = _mm256_load_ps((
float*)bPtr);
124 cVal = _mm256_add_ps(aVal, bVal);
126 _mm256_store_ps((
float*)cPtr,
134 number = quarterPoints * 4;
135 for (; number < num_points; number++) {
136 *cPtr++ = (*aPtr++) + (*bPtr++);
143 #include <xmmintrin.h>
148 unsigned int num_points)
150 unsigned int number = 0;
151 const unsigned int halfPoints = num_points / 2;
158 for (; number < halfPoints; number++) {
172 number = halfPoints * 2;
173 for (; number < num_points; number++) {
174 *cPtr++ = (*aPtr++) + (*bPtr++);
180 #ifdef LV_HAVE_GENERIC
185 unsigned int num_points)
190 unsigned int number = 0;
192 for (number = 0; number < num_points; number++) {
193 *cPtr++ = (*aPtr++) + (*bPtr++);
200 #include <xmmintrin.h>
205 unsigned int num_points)
207 unsigned int number = 0;
208 const unsigned int halfPoints = num_points / 2;
215 for (; number < halfPoints; number++) {
228 number = halfPoints * 2;
229 for (; number < num_points; number++) {
230 *cPtr++ = (*aPtr++) + (*bPtr++);
237 #include <arm_neon.h>
242 unsigned int num_points)
244 unsigned int number = 0;
245 const unsigned int halfPoints = num_points / 2;
250 float32x4_t aVal, bVal, cVal;
251 for (number = 0; number < halfPoints; number++) {
253 aVal = vld1q_f32((
const float32_t*)(aPtr));
254 bVal = vld1q_f32((
const float32_t*)(bPtr));
259 cVal = vaddq_f32(aVal, bVal);
261 vst1q_f32((
float*)(cPtr), cVal);
268 number = halfPoints * 2;
269 for (; number < num_points; number++) {
270 *cPtr++ = (*aPtr++) + (*bPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:202
static void volk_32fc_x2_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:182
static void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:145
static void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:67
static void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:106
static void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_add_32fc.h:239
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74