Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53 
54 #include <inttypes.h>
55 #include <volk/volk_common.h>
56 
#ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. When num_points is 0 the loop does not run and
 * *result receives 0 + 0i.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_generic(lv_32fc_t* result,
                                                       const lv_32fc_t* inputBuffer,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = inputBuffer;
    unsigned int number = 0;
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);

    /* Plain sequential accumulation, one sample per iteration. */
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_GENERIC */
72 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel using unaligned loads.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. Four complex samples (eight floats) are accumulated
 * per vector iteration; the num_points % 4 leftover samples are added one at
 * a time afterwards.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* inputBuffer,
                                                     unsigned int num_points)
{
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const lv_32fc_t* aPtr = inputBuffer;
    /* Destination of the aligned store below, hence the 32-byte alignment. */
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];

    __m256 accumulator = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        /* Keep the const qualifier: the buffer is only ever read. */
        const __m256 aVal = _mm256_loadu_ps((const float*)aPtr);
        accumulator = _mm256_add_ps(accumulator, aVal);
        aPtr += 4;
    }

    _mm256_store_ps(tempBuffer, accumulator);

    /* Fold the four (real, imag) lane pairs into a single complex sum. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
    returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
    returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);

    /* Samples that did not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_AVX */
110 
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE kernel using unaligned loads.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. Two complex samples (four floats) are accumulated per
 * vector iteration; when num_points is odd the last sample is added
 * separately afterwards.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* inputBuffer,
                                                     unsigned int num_points)
{
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    const lv_32fc_t* aPtr = inputBuffer;
    /* Destination of the aligned store below, hence the 16-byte alignment. */
    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];

    __m128 accumulator = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        /* Keep the const qualifier: the buffer is only ever read. */
        const __m128 aVal = _mm_loadu_ps((const float*)aPtr);
        accumulator = _mm_add_ps(accumulator, aVal);
        aPtr += 2;
    }

    _mm_store_ps(tempBuffer, accumulator);

    /* Fold the two (real, imag) lane pairs into a single complex sum. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Sample that did not fill a whole vector. */
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_SSE */
146 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel using aligned loads.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. Four complex samples (eight floats) are accumulated
 * per vector iteration; the num_points % 4 leftover samples are added one at
 * a time afterwards.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written. It is read
 *               with _mm256_load_ps, so it must satisfy that intrinsic's
 *               alignment requirement.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* inputBuffer,
                                                     unsigned int num_points)
{
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const lv_32fc_t* aPtr = inputBuffer;
    /* Destination of the aligned store below, hence the 32-byte alignment. */
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];

    __m256 accumulator = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        /* Keep the const qualifier: the buffer is only ever read. */
        const __m256 aVal = _mm256_load_ps((const float*)aPtr);
        accumulator = _mm256_add_ps(accumulator, aVal);
        aPtr += 4;
    }

    _mm256_store_ps(tempBuffer, accumulator);

    /* Fold the four (real, imag) lane pairs into a single complex sum. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
    returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
    returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);

    /* Samples that did not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_AVX */
184 
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE kernel using aligned loads.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. Two complex samples (four floats) are accumulated per
 * vector iteration; when num_points is odd the last sample is added
 * separately afterwards.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written. It is read
 *               with _mm_load_ps, so it must satisfy that intrinsic's
 *               alignment requirement.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* inputBuffer,
                                                     unsigned int num_points)
{
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    const lv_32fc_t* aPtr = inputBuffer;
    /* Destination of the aligned store below, hence the 16-byte alignment. */
    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];

    __m128 accumulator = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        /* Keep the const qualifier: the buffer is only ever read. */
        const __m128 aVal = _mm_load_ps((const float*)aPtr);
        accumulator = _mm_add_ps(accumulator, aVal);
        aPtr += 2;
    }

    _mm_store_ps(tempBuffer, accumulator);

    /* Fold the two (real, imag) lane pairs into a single complex sum. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Sample that did not fill a whole vector. */
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_SSE */
220 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*
 * NEON kernel.
 *
 * Adds up num_points complex float samples read from inputBuffer and writes
 * the total to *result. Each vector iteration consumes eight complex samples,
 * spread over four independent accumulators of two samples each; the
 * num_points % 8 leftover samples are added one at a time afterwards.
 *
 * result      - destination for the accumulated sum (one lv_32fc_t).
 * inputBuffer - samples to accumulate; only read, never written.
 * num_points  - number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    const lv_32fc_t* aPtr = inputBuffer;
    unsigned int number = 0;
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int eighthPoints = num_points / 8;
    float32x4_t in_vec;
    float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];

    for (; number < eighthPoints; number++) {
        /* The casts keep the const qualifier: the buffer is only ever read. */
        in_vec = vld1q_f32((const float*)aPtr);
        out_vec0 = vaddq_f32(in_vec, out_vec0);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec1 = vaddq_f32(in_vec, out_vec1);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec2 = vaddq_f32(in_vec, out_vec2);
        aPtr += 2;

        in_vec = vld1q_f32((const float*)aPtr);
        out_vec3 = vaddq_f32(in_vec, out_vec3);
        aPtr += 2;
    }

    /*
     * Spill each accumulator in turn and fold its two (real, imag) lane pairs
     * into the scalar sum.
     */
    vst1q_f32(tempBuffer, out_vec0);
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec1);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec2);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec3);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Samples that did not fill a whole eight-sample iteration. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_NEON */
278 
279 #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:58
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:188
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:114
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:150
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:223
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cmake(r, i)
Definition: volk_complex.h:77
float complex lv_32fc_t
Definition: volk_complex.h:74