50 #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51 #define INCLUDED_volk_32f_accumulator_s32f_a_H
57 #include <immintrin.h>
/*
 * volk_32f_accumulator_s32f_a_avx (continuation of the parameter list and the
 * body; the first signature line and the braces are not visible here).
 *
 * Adds up num_points floats read from inputBuffer and stores the total
 * through result. Full groups of eight floats are added with AVX; whatever
 * is left over is added one float at a time.
 *
 *   inputBuffer  values to sum; read with the aligned load _mm256_load_ps
 *                (NOTE(review): presumably callers must pass a 32-byte
 *                aligned pointer -- confirm)
 *   num_points   number of floats to sum
 */
60 const float* inputBuffer,
61 unsigned int num_points)
63 float returnValue = 0;
64 unsigned int number = 0;
/* Number of complete 8-float groups (integer division drops the remainder). */
65 const unsigned int eighthPoints = num_points / 8;
67 const float* aPtr = inputBuffer;
/* Eight running partial sums, one per vector lane, starting at zero. */
70 __m256 accumulator = _mm256_setzero_ps();
71 __m256 aVal = _mm256_setzero_ps();
/* Vector pass: each iteration loads eight floats and adds them lane-wise. */
73 for (; number < eighthPoints; number++) {
74 aVal = _mm256_load_ps(aPtr);
75 accumulator = _mm256_add_ps(accumulator, aVal);
/* NOTE(review): the rest of this loop body is not visible here; confirm aPtr
 * is advanced by 8 per iteration, since the tail loop below continues from
 * aPtr. */
/* Spill the eight lane sums to memory so they can be added as scalars.
 * NOTE(review): tempBuffer's declaration is not visible here; the aligned
 * store presumably needs it to be a 32-byte aligned float[8] -- confirm. */
79 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal sum; the first assignment replaces the initial 0. */
81 returnValue = tempBuffer[0];
82 returnValue += tempBuffer[1];
83 returnValue += tempBuffer[2];
84 returnValue += tempBuffer[3];
85 returnValue += tempBuffer[4];
86 returnValue += tempBuffer[5];
87 returnValue += tempBuffer[6];
88 returnValue += tempBuffer[7];
/* Scalar tail: resume counting after the last full group of eight. */
90 number = eighthPoints * 8;
91 for (; number < num_points; number++) {
92 returnValue += (*aPtr++);
/* Hand the total back through the output pointer. */
94 *result = returnValue;
100 #include <immintrin.h>
/*
 * volk_32f_accumulator_s32f_u_avx (continuation of the parameter list and the
 * body; the first signature line and the braces are not visible here).
 *
 * Adds up num_points floats read from inputBuffer and stores the total
 * through result. Same structure as the aligned AVX variant, except the
 * input is read with the unaligned load _mm256_loadu_ps.
 *
 *   inputBuffer  values to sum
 *   num_points   number of floats to sum
 */
103 const float* inputBuffer,
104 unsigned int num_points)
106 float returnValue = 0;
107 unsigned int number = 0;
/* Number of complete 8-float groups (integer division drops the remainder). */
108 const unsigned int eighthPoints = num_points / 8;
110 const float* aPtr = inputBuffer;
/* Eight running partial sums, one per vector lane, starting at zero. */
113 __m256 accumulator = _mm256_setzero_ps();
114 __m256 aVal = _mm256_setzero_ps();
/* Vector pass: each iteration loads eight floats and adds them lane-wise. */
116 for (; number < eighthPoints; number++) {
117 aVal = _mm256_loadu_ps(aPtr);
118 accumulator = _mm256_add_ps(accumulator, aVal);
/* NOTE(review): the rest of this loop body is not visible here; confirm aPtr
 * is advanced by 8 per iteration, since the tail loop below continues from
 * aPtr. */
/* The store is the aligned form even in this unaligned variant: it targets
 * the local tempBuffer, not the caller's data.
 * NOTE(review): tempBuffer's declaration is not visible here; presumably it
 * is a 32-byte aligned float[8] -- confirm. */
122 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal sum; the first assignment replaces the initial 0. */
124 returnValue = tempBuffer[0];
125 returnValue += tempBuffer[1];
126 returnValue += tempBuffer[2];
127 returnValue += tempBuffer[3];
128 returnValue += tempBuffer[4];
129 returnValue += tempBuffer[5];
130 returnValue += tempBuffer[6];
131 returnValue += tempBuffer[7];
/* Scalar tail: resume counting after the last full group of eight. */
133 number = eighthPoints * 8;
134 for (; number < num_points; number++) {
135 returnValue += (*aPtr++);
/* Hand the total back through the output pointer. */
137 *result = returnValue;
143 #include <xmmintrin.h>
/*
 * volk_32f_accumulator_s32f_a_sse (continuation of the parameter list and the
 * body; the first signature line, the braces, the accumulator setup and the
 * vector loop body are not visible here).
 *
 * Adds up num_points floats read from inputBuffer and stores the total
 * through result. Works on groups of four floats, then finishes the
 * remainder one float at a time.
 *
 *   inputBuffer  values to sum (NOTE(review): presumably must be 16-byte
 *                aligned for this "a" variant -- confirm against the hidden
 *                loop body)
 *   num_points   number of floats to sum
 */
146 const float* inputBuffer,
147 unsigned int num_points)
149 float returnValue = 0;
150 unsigned int number = 0;
/* Number of complete 4-float groups (integer division drops the remainder). */
151 const unsigned int quarterPoints = num_points / 4;
153 const float* aPtr = inputBuffer;
/* Vector pass over the full groups of four; its body is not visible here. */
159 for (; number < quarterPoints; number++) {
/* Add up the four entries of tempBuffer; the first assignment replaces the
 * initial 0. NOTE(review): tempBuffer is declared and filled on lines that
 * are not visible here; presumably it holds the four lane sums -- confirm. */
167 returnValue = tempBuffer[0];
168 returnValue += tempBuffer[1];
169 returnValue += tempBuffer[2];
170 returnValue += tempBuffer[3];
/* Scalar tail: resume counting after the last full group of four.
 * NOTE(review): continues from aPtr, so the hidden loop body must have
 * advanced it by 4 per iteration -- confirm. */
172 number = quarterPoints * 4;
173 for (; number < num_points; number++) {
174 returnValue += (*aPtr++);
/* Hand the total back through the output pointer. */
176 *result = returnValue;
182 #include <xmmintrin.h>
/*
 * volk_32f_accumulator_s32f_u_sse (continuation of the parameter list and the
 * body; the first signature line, the braces, the accumulator setup and the
 * vector loop body are not visible here).
 *
 * Adds up num_points floats read from inputBuffer and stores the total
 * through result. Works on groups of four floats, then finishes the
 * remainder one float at a time.
 *
 *   inputBuffer  values to sum
 *   num_points   number of floats to sum
 *
 * NOTE(review): confirm the hidden loop body reads the input with an
 * unaligned load (_mm_loadu_ps), as the "u" variant name suggests; the
 * cross-reference list that accompanies this listing names _mm_load_ps only.
 */
185 const float* inputBuffer,
186 unsigned int num_points)
188 float returnValue = 0;
189 unsigned int number = 0;
/* Number of complete 4-float groups (integer division drops the remainder). */
190 const unsigned int quarterPoints = num_points / 4;
192 const float* aPtr = inputBuffer;
/* Vector pass over the full groups of four; its body is not visible here. */
198 for (; number < quarterPoints; number++) {
/* Add up the four entries of tempBuffer; the first assignment replaces the
 * initial 0. NOTE(review): tempBuffer is declared and filled on lines that
 * are not visible here; presumably it holds the four lane sums -- confirm. */
206 returnValue = tempBuffer[0];
207 returnValue += tempBuffer[1];
208 returnValue += tempBuffer[2];
209 returnValue += tempBuffer[3];
/* Scalar tail: resume counting after the last full group of four.
 * NOTE(review): continues from aPtr, so the hidden loop body must have
 * advanced it by 4 per iteration -- confirm. */
211 number = quarterPoints * 4;
212 for (; number < num_points; number++) {
213 returnValue += (*aPtr++);
/* Hand the total back through the output pointer. */
215 *result = returnValue;
219 #ifdef LV_HAVE_GENERIC
/*
 * volk_32f_accumulator_s32f_generic (continuation of the parameter list and
 * the body; the first signature line and the braces are not visible here).
 *
 * Plain C version with no SIMD intrinsics: adds up num_points floats read
 * from inputBuffer, in order, and stores the total through result. With
 * num_points == 0 the loop does not run and 0 is stored.
 *
 *   inputBuffer  values to sum
 *   num_points   number of floats to sum
 */
221 const float* inputBuffer,
222 unsigned int num_points)
224 const float* aPtr = inputBuffer;
225 unsigned int number = 0;
226 float returnValue = 0;
/* Walk the input once, adding each element to the single-precision total. */
228 for (; number < num_points; number++) {
229 returnValue += (*aPtr++);
/* Hand the total back through the output pointer. */
231 *result = returnValue;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:59
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:184
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:220
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:102
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:145
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65