51 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
57 #ifdef LV_HAVE_GENERIC
/* Fragment of volk_32fc_accumulator_s32fc_generic (the cross-reference index in
 * this listing places its definition at listing line 58). The head of the
 * signature, the declarations of returnValue and aPtr, and the braces are not
 * part of this extract. */
60 unsigned int num_points)
63 unsigned int number = 0;
/* Scalar accumulation: add each of the num_points samples read through aPtr
 * to returnValue, advancing aPtr by one sample per iteration. */
66 for (; number < num_points; number++) {
67 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
69 *result = returnValue;
74 #include <immintrin.h>
/* Fragment of volk_32fc_accumulator_s32fc_u_avx (definition at listing line 76
 * per the cross-reference index in this listing). The signature head, the
 * declarations of returnValue, aPtr and tempBuffer, and the braces are not
 * part of this extract. */
78 unsigned int num_points)
81 unsigned int number = 0;
/* Number of full 4-sample groups; each group fills one __m256 (8 floats). */
82 const unsigned int quarterPoints = num_points / 4;
/* Vector running sum, started at zero. */
87 __m256 accumulator = _mm256_setzero_ps();
88 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: unaligned load of 8 floats from aPtr, added lane-wise into
 * accumulator. NOTE(review): the aPtr advance (listing line 93) is not
 * visible in this extract; presumably aPtr moves forward 4 samples per
 * iteration - confirm against the full file. */
90 for (; number < quarterPoints; number++) {
91 aVal = _mm256_loadu_ps((
float*)aPtr);
92 accumulator = _mm256_add_ps(accumulator, aVal);
/* Spill the vector sum to tempBuffer. _mm256_store_ps is the aligned store,
 * so tempBuffer has to be 32-byte aligned; its declaration is not visible
 * here - TODO confirm it carries __VOLK_ATTR_ALIGNED(32). */
96 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: treat the 8 floats as 4 (real, imag) pairs and sum
 * them into returnValue. */
98 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
99 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
100 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
101 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: continue at the first sample index past the full 4-sample
 * groups and add the remaining (num_points % 4) samples one by one. */
103 number = quarterPoints * 4;
104 for (; number < num_points; number++) {
105 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
107 *result = returnValue;
112 #include <xmmintrin.h>
/* Fragment of volk_32fc_accumulator_s32fc_u_sse (definition at listing line
 * 114 per the cross-reference index in this listing). The signature head,
 * local declarations, the vector loop body and the braces are not part of
 * this extract. */
116 unsigned int num_points)
119 unsigned int number = 0;
/* Number of full 2-sample groups handled by the vector loop. */
120 const unsigned int halfPoints = num_points / 2;
/* Vector loop over the 2-sample groups. NOTE(review): its body (listing lines
 * 129-135) is missing from this extract, as is the store that fills
 * tempBuffer before the reduction below. */
128 for (; number < halfPoints; number++) {
/* Horizontal reduction: combine the 4 floats in tempBuffer as 2 (real, imag)
 * pairs into returnValue. */
136 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
137 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: continue at the first sample index past the full 2-sample
 * groups and add the remaining (num_points % 2) sample, if any. */
139 number = halfPoints * 2;
140 for (; number < num_points; number++) {
141 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
143 *result = returnValue;
148 #include <immintrin.h>
/* Fragment of volk_32fc_accumulator_s32fc_a_avx (definition at listing line
 * 150 per the cross-reference index in this listing). The signature head, the
 * declarations of returnValue, aPtr and tempBuffer, and the braces are not
 * part of this extract. */
152 unsigned int num_points)
155 unsigned int number = 0;
/* Number of full 4-sample groups; each group fills one __m256 (8 floats). */
156 const unsigned int quarterPoints = num_points / 4;
/* Vector running sum, started at zero. */
161 __m256 accumulator = _mm256_setzero_ps();
162 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: aligned load of 8 floats from aPtr, added lane-wise into
 * accumulator. _mm256_load_ps requires a 32-byte aligned address, so this
 * variant is only valid for suitably aligned input. NOTE(review): the aPtr
 * advance (listing line 167) is not visible in this extract; presumably
 * aPtr moves forward 4 samples per iteration - confirm against the full
 * file. */
164 for (; number < quarterPoints; number++) {
165 aVal = _mm256_load_ps((
float*)aPtr);
166 accumulator = _mm256_add_ps(accumulator, aVal);
/* Spill the vector sum to tempBuffer with the aligned store; tempBuffer's
 * declaration is not visible here - TODO confirm it is 32-byte aligned. */
170 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: treat the 8 floats as 4 (real, imag) pairs and sum
 * them into returnValue. */
172 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
173 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
174 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
175 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: continue at the first sample index past the full 4-sample
 * groups and add the remaining (num_points % 4) samples one by one. */
177 number = quarterPoints * 4;
178 for (; number < num_points; number++) {
179 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
181 *result = returnValue;
186 #include <xmmintrin.h>
/* Fragment of volk_32fc_accumulator_s32fc_a_sse (definition at listing line
 * 188 per the cross-reference index in this listing). The signature head,
 * local declarations, the vector loop body and the braces are not part of
 * this extract. */
190 unsigned int num_points)
193 unsigned int number = 0;
/* Number of full 2-sample groups handled by the vector loop. */
194 const unsigned int halfPoints = num_points / 2;
/* Vector loop over the 2-sample groups. NOTE(review): its body (listing lines
 * 203-209) is missing from this extract, as is the store that fills
 * tempBuffer before the reduction below. */
202 for (; number < halfPoints; number++) {
/* Horizontal reduction: combine the 4 floats in tempBuffer as 2 (real, imag)
 * pairs into returnValue. */
210 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
211 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: continue at the first sample index past the full 2-sample
 * groups and add the remaining (num_points % 2) sample, if any. */
213 number = halfPoints * 2;
214 for (; number < num_points; number++) {
215 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
217 *result = returnValue;
222 #include <arm_neon.h>
/* Fragment of volk_32fc_accumulator_s32fc_neon (definition at listing line 223
 * per the cross-reference index in this listing). The signature head, the
 * declarations of returnValue, aPtr, tempBuffer and in_vec, and the braces
 * are not part of this extract. */
225 unsigned int num_points)
228 unsigned int number = 0;
/* Number of full 8-sample groups; one loop iteration issues four 4-float
 * loads (2 complex samples each). */
230 unsigned int eighthPoints = num_points / 8;
/* Four separate vector running sums, one per load in the loop, all started
 * at zero. */
232 float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
233 float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
234 float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
235 float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
/* Vector loop: each load is added lane-wise into its own accumulator.
 * NOTE(review): the statements between the load/add pairs (listing lines
 * 241-242, 245-246, 249-250, 253) are not visible in this extract;
 * presumably aPtr is advanced by 2 samples after each load - confirm
 * against the full file, otherwise all four loads would read the same
 * address. */
238 for (; number < eighthPoints; number++) {
239 in_vec = vld1q_f32((
float*)aPtr);
240 out_vec0 = vaddq_f32(in_vec, out_vec0);
243 in_vec = vld1q_f32((
float*)aPtr);
244 out_vec1 = vaddq_f32(in_vec, out_vec1);
247 in_vec = vld1q_f32((
float*)aPtr);
248 out_vec2 = vaddq_f32(in_vec, out_vec2);
251 in_vec = vld1q_f32((
float*)aPtr);
252 out_vec3 = vaddq_f32(in_vec, out_vec3);
/* Horizontal reduction, one accumulator at a time: spill its 4 floats to
 * tempBuffer and add them to returnValue as 2 (real, imag) pairs. The
 * first pair assigns rather than adds, which initialises returnValue. */
255 vst1q_f32(tempBuffer, out_vec0);
256 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
257 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
259 vst1q_f32(tempBuffer, out_vec1);
260 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
261 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
263 vst1q_f32(tempBuffer, out_vec2);
264 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
265 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
267 vst1q_f32(tempBuffer, out_vec3);
268 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
269 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: continue at the first sample index past the full 8-sample
 * groups and add the remaining (num_points % 8) samples one by one. */
271 number = eighthPoints * 8;
272 for (; number < num_points; number++) {
273 returnValue += (*aPtr++);
/* Write the accumulated sum to the caller-supplied output location. */
275 *result = returnValue;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:58
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:188
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:114
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:150
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:223
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cmake(r, i)
Definition: volk_complex.h:77
float complex lv_32fc_t
Definition: volk_complex.h:74