Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_stddev_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
57 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62 
63 #ifdef LV_HAVE_SSE4_1
64 #include <smmintrin.h>
65 
66 static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
67  const float* inputBuffer,
68  const float mean,
69  unsigned int num_points)
70 {
71  float returnValue = 0;
72  if (num_points > 0) {
73  unsigned int number = 0;
74  const unsigned int sixteenthPoints = num_points / 16;
75 
76  const float* aPtr = inputBuffer;
77 
78  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
79 
80  __m128 squareAccumulator = _mm_setzero_ps();
81  __m128 aVal1, aVal2, aVal3, aVal4;
82  __m128 cVal1, cVal2, cVal3, cVal4;
83  for (; number < sixteenthPoints; number++) {
84  aVal1 = _mm_load_ps(aPtr);
85  aPtr += 4;
86  cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
87 
88  aVal2 = _mm_load_ps(aPtr);
89  aPtr += 4;
90  cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
91 
92  aVal3 = _mm_load_ps(aPtr);
93  aPtr += 4;
94  cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
95 
96  aVal4 = _mm_load_ps(aPtr);
97  aPtr += 4;
98  cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
99 
100  cVal1 = _mm_or_ps(cVal1, cVal2);
101  cVal3 = _mm_or_ps(cVal3, cVal4);
102  cVal1 = _mm_or_ps(cVal1, cVal3);
103 
104  squareAccumulator =
105  _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
106  }
107  _mm_store_ps(squareBuffer,
108  squareAccumulator); // Store the results back into the C container
109  returnValue = squareBuffer[0];
110  returnValue += squareBuffer[1];
111  returnValue += squareBuffer[2];
112  returnValue += squareBuffer[3];
113 
114  number = sixteenthPoints * 16;
115  for (; number < num_points; number++) {
116  returnValue += (*aPtr) * (*aPtr);
117  aPtr++;
118  }
119  returnValue /= num_points;
120  returnValue -= (mean * mean);
121  returnValue = sqrtf(returnValue);
122  }
123  *stddev = returnValue;
124 }
125 
126 #endif /* LV_HAVE_SSE4_1 */
127 
128 #ifdef LV_HAVE_SSE
129 #include <xmmintrin.h>
130 
131 static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
132  const float* inputBuffer,
133  const float mean,
134  unsigned int num_points)
135 {
136  float returnValue = 0;
137  if (num_points > 0) {
138  unsigned int number = 0;
139  const unsigned int quarterPoints = num_points / 4;
140 
141  const float* aPtr = inputBuffer;
142 
143  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
144 
145  __m128 squareAccumulator = _mm_setzero_ps();
146  __m128 aVal = _mm_setzero_ps();
147  for (; number < quarterPoints; number++) {
148  aVal = _mm_load_ps(aPtr); // aVal = x
149  aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
150  squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
151  aPtr += 4;
152  }
153  _mm_store_ps(squareBuffer,
154  squareAccumulator); // Store the results back into the C container
155  returnValue = squareBuffer[0];
156  returnValue += squareBuffer[1];
157  returnValue += squareBuffer[2];
158  returnValue += squareBuffer[3];
159 
160  number = quarterPoints * 4;
161  for (; number < num_points; number++) {
162  returnValue += (*aPtr) * (*aPtr);
163  aPtr++;
164  }
165  returnValue /= num_points;
166  returnValue -= (mean * mean);
167  returnValue = sqrtf(returnValue);
168  }
169  *stddev = returnValue;
170 }
171 #endif /* LV_HAVE_SSE */
172 
173 
174 #ifdef LV_HAVE_AVX
175 #include <immintrin.h>
176 
177 static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
178  const float* inputBuffer,
179  const float mean,
180  unsigned int num_points)
181 {
182  float stdDev = 0;
183  if (num_points > 0) {
184  unsigned int number = 0;
185  const unsigned int thirtySecondthPoints = num_points / 32;
186 
187  const float* aPtr = inputBuffer;
188  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
189 
190  __m256 squareAccumulator = _mm256_setzero_ps();
191  __m256 aVal1, aVal2, aVal3, aVal4;
192  __m256 cVal1, cVal2, cVal3, cVal4;
193  for (; number < thirtySecondthPoints; number++) {
194  aVal1 = _mm256_load_ps(aPtr);
195  aPtr += 8;
196  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
197 
198  aVal2 = _mm256_load_ps(aPtr);
199  aPtr += 8;
200  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
201 
202  aVal3 = _mm256_load_ps(aPtr);
203  aPtr += 8;
204  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
205 
206  aVal4 = _mm256_load_ps(aPtr);
207  aPtr += 8;
208  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
209 
210  cVal1 = _mm256_or_ps(cVal1, cVal2);
211  cVal3 = _mm256_or_ps(cVal3, cVal4);
212  cVal1 = _mm256_or_ps(cVal1, cVal3);
213 
214  squareAccumulator =
215  _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
216  }
217  _mm256_store_ps(squareBuffer,
218  squareAccumulator); // Store the results back into the C container
219  stdDev = squareBuffer[0];
220  stdDev += squareBuffer[1];
221  stdDev += squareBuffer[2];
222  stdDev += squareBuffer[3];
223  stdDev += squareBuffer[4];
224  stdDev += squareBuffer[5];
225  stdDev += squareBuffer[6];
226  stdDev += squareBuffer[7];
227 
228  number = thirtySecondthPoints * 32;
229  for (; number < num_points; number++) {
230  stdDev += (*aPtr) * (*aPtr);
231  aPtr++;
232  }
233  stdDev /= num_points;
234  stdDev -= (mean * mean);
235  stdDev = sqrtf(stdDev);
236  }
237  *stddev = stdDev;
238 }
239 #endif /* LV_HAVE_AVX */
240 
241 
242 #ifdef LV_HAVE_GENERIC
243 
244 static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
245  const float* inputBuffer,
246  const float mean,
247  unsigned int num_points)
248 {
249  float returnValue = 0;
250  if (num_points > 0) {
251  const float* aPtr = inputBuffer;
252  unsigned int number = 0;
253 
254  for (number = 0; number < num_points; number++) {
255  returnValue += (*aPtr) * (*aPtr);
256  aPtr++;
257  }
258 
259  returnValue /= num_points;
260  returnValue -= (mean * mean);
261  returnValue = sqrtf(returnValue);
262  }
263  *stddev = returnValue;
264 }
265 
266 #endif /* LV_HAVE_GENERIC */
267 
268 
269 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
270 
271 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
272 #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
273 
274 #include <inttypes.h>
275 #include <math.h>
276 #include <stdio.h>
277 #include <volk/volk_common.h>
278 
279 #ifdef LV_HAVE_AVX
280 #include <immintrin.h>
281 
282 static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
283  const float* inputBuffer,
284  const float mean,
285  unsigned int num_points)
286 {
287  float stdDev = 0;
288  if (num_points > 0) {
289  unsigned int number = 0;
290  const unsigned int thirtySecondthPoints = num_points / 32;
291 
292  const float* aPtr = inputBuffer;
293  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
294 
295  __m256 squareAccumulator = _mm256_setzero_ps();
296  __m256 aVal1, aVal2, aVal3, aVal4;
297  __m256 cVal1, cVal2, cVal3, cVal4;
298  for (; number < thirtySecondthPoints; number++) {
299  aVal1 = _mm256_loadu_ps(aPtr);
300  aPtr += 8;
301  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
302 
303  aVal2 = _mm256_loadu_ps(aPtr);
304  aPtr += 8;
305  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
306 
307  aVal3 = _mm256_loadu_ps(aPtr);
308  aPtr += 8;
309  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
310 
311  aVal4 = _mm256_loadu_ps(aPtr);
312  aPtr += 8;
313  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
314 
315  cVal1 = _mm256_or_ps(cVal1, cVal2);
316  cVal3 = _mm256_or_ps(cVal3, cVal4);
317  cVal1 = _mm256_or_ps(cVal1, cVal3);
318 
319  squareAccumulator =
320  _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
321  }
322  _mm256_storeu_ps(
323  squareBuffer,
324  squareAccumulator); // Store the results back into the C container
325  stdDev = squareBuffer[0];
326  stdDev += squareBuffer[1];
327  stdDev += squareBuffer[2];
328  stdDev += squareBuffer[3];
329  stdDev += squareBuffer[4];
330  stdDev += squareBuffer[5];
331  stdDev += squareBuffer[6];
332  stdDev += squareBuffer[7];
333 
334  number = thirtySecondthPoints * 32;
335  for (; number < num_points; number++) {
336  stdDev += (*aPtr) * (*aPtr);
337  aPtr++;
338  }
339  stdDev /= num_points;
340  stdDev -= (mean * mean);
341  stdDev = sqrtf(stdDev);
342  }
343  *stddev = stdDev;
344 }
345 #endif /* LV_HAVE_AVX */
346 
347 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition: sse2neon.h:7701
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:177
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:131
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:282
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:244
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65