Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
50 #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51 #define INCLUDED_volk_32f_accumulator_s32f_a_H
52 
53 #include <inttypes.h>
54 #include <volk/volk_common.h>
55 
56 #ifdef LV_HAVE_AVX
57 #include <immintrin.h>
58 
59 static inline void volk_32f_accumulator_s32f_a_avx(float* result,
60  const float* inputBuffer,
61  unsigned int num_points)
62 {
63  float returnValue = 0;
64  unsigned int number = 0;
65  const unsigned int eighthPoints = num_points / 8;
66 
67  const float* aPtr = inputBuffer;
68  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
69 
70  __m256 accumulator = _mm256_setzero_ps();
71  __m256 aVal = _mm256_setzero_ps();
72 
73  for (; number < eighthPoints; number++) {
74  aVal = _mm256_load_ps(aPtr);
75  accumulator = _mm256_add_ps(accumulator, aVal);
76  aPtr += 8;
77  }
78 
79  _mm256_store_ps(tempBuffer, accumulator);
80 
81  returnValue = tempBuffer[0];
82  returnValue += tempBuffer[1];
83  returnValue += tempBuffer[2];
84  returnValue += tempBuffer[3];
85  returnValue += tempBuffer[4];
86  returnValue += tempBuffer[5];
87  returnValue += tempBuffer[6];
88  returnValue += tempBuffer[7];
89 
90  number = eighthPoints * 8;
91  for (; number < num_points; number++) {
92  returnValue += (*aPtr++);
93  }
94  *result = returnValue;
95 }
96 #endif /* LV_HAVE_AVX */
97 
98 
99 #ifdef LV_HAVE_AVX
100 #include <immintrin.h>
101 
102 static inline void volk_32f_accumulator_s32f_u_avx(float* result,
103  const float* inputBuffer,
104  unsigned int num_points)
105 {
106  float returnValue = 0;
107  unsigned int number = 0;
108  const unsigned int eighthPoints = num_points / 8;
109 
110  const float* aPtr = inputBuffer;
111  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
112 
113  __m256 accumulator = _mm256_setzero_ps();
114  __m256 aVal = _mm256_setzero_ps();
115 
116  for (; number < eighthPoints; number++) {
117  aVal = _mm256_loadu_ps(aPtr);
118  accumulator = _mm256_add_ps(accumulator, aVal);
119  aPtr += 8;
120  }
121 
122  _mm256_store_ps(tempBuffer, accumulator);
123 
124  returnValue = tempBuffer[0];
125  returnValue += tempBuffer[1];
126  returnValue += tempBuffer[2];
127  returnValue += tempBuffer[3];
128  returnValue += tempBuffer[4];
129  returnValue += tempBuffer[5];
130  returnValue += tempBuffer[6];
131  returnValue += tempBuffer[7];
132 
133  number = eighthPoints * 8;
134  for (; number < num_points; number++) {
135  returnValue += (*aPtr++);
136  }
137  *result = returnValue;
138 }
139 #endif /* LV_HAVE_AVX */
140 
141 
142 #ifdef LV_HAVE_SSE
143 #include <xmmintrin.h>
144 
145 static inline void volk_32f_accumulator_s32f_a_sse(float* result,
146  const float* inputBuffer,
147  unsigned int num_points)
148 {
149  float returnValue = 0;
150  unsigned int number = 0;
151  const unsigned int quarterPoints = num_points / 4;
152 
153  const float* aPtr = inputBuffer;
154  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
155 
156  __m128 accumulator = _mm_setzero_ps();
157  __m128 aVal = _mm_setzero_ps();
158 
159  for (; number < quarterPoints; number++) {
160  aVal = _mm_load_ps(aPtr);
161  accumulator = _mm_add_ps(accumulator, aVal);
162  aPtr += 4;
163  }
164 
165  _mm_store_ps(tempBuffer, accumulator);
166 
167  returnValue = tempBuffer[0];
168  returnValue += tempBuffer[1];
169  returnValue += tempBuffer[2];
170  returnValue += tempBuffer[3];
171 
172  number = quarterPoints * 4;
173  for (; number < num_points; number++) {
174  returnValue += (*aPtr++);
175  }
176  *result = returnValue;
177 }
178 #endif /* LV_HAVE_SSE */
179 
180 
181 #ifdef LV_HAVE_SSE
182 #include <xmmintrin.h>
183 
184 static inline void volk_32f_accumulator_s32f_u_sse(float* result,
185  const float* inputBuffer,
186  unsigned int num_points)
187 {
188  float returnValue = 0;
189  unsigned int number = 0;
190  const unsigned int quarterPoints = num_points / 4;
191 
192  const float* aPtr = inputBuffer;
193  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
194 
195  __m128 accumulator = _mm_setzero_ps();
196  __m128 aVal = _mm_setzero_ps();
197 
198  for (; number < quarterPoints; number++) {
199  aVal = _mm_load_ps(aPtr);
200  accumulator = _mm_add_ps(accumulator, aVal);
201  aPtr += 4;
202  }
203 
204  _mm_store_ps(tempBuffer, accumulator);
205 
206  returnValue = tempBuffer[0];
207  returnValue += tempBuffer[1];
208  returnValue += tempBuffer[2];
209  returnValue += tempBuffer[3];
210 
211  number = quarterPoints * 4;
212  for (; number < num_points; number++) {
213  returnValue += (*aPtr++);
214  }
215  *result = returnValue;
216 }
217 #endif /* LV_HAVE_SSE */
218 
219 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Sums num_points floats from inputBuffer and stores the total in *result.
 *
 * Portable scalar variant: elements are added one at a time, in index order,
 * into a single float accumulator that starts at zero. With num_points == 0
 * the input is never read and *result is set to 0.
 *
 * \param result      Destination for the accumulated sum (a single float).
 * \param inputBuffer Values to be summed.
 * \param num_points  Number of floats to read from inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float total = 0;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        total += inputBuffer[i];
    }

    *result = total;
}
233 #endif /* LV_HAVE_GENERIC */
234 
235 #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:59
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:184
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:220
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:102
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:145
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65