Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
61 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
62 
63 #include <inttypes.h>
64 #include <math.h>
65 #include <volk/volk_common.h>
66 
67 // Youngs and Cramer's Algorithm for calculating std and mean
68 // Using the methods discussed here:
69 // https://doi.org/10.1145/3221269.3223036
70 #ifdef LV_HAVE_GENERIC
71 
72 static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
73  float* mean,
74  const float* inputBuffer,
75  unsigned int num_points)
76 {
77  const float* in_ptr = inputBuffer;
78  if (num_points == 0) {
79  return;
80  } else if (num_points == 1) {
81  *stddev = 0.f;
82  *mean = (*in_ptr);
83  return;
84  }
85 
86  float Sum[2];
87  float SquareSum[2] = { 0.f, 0.f };
88  Sum[0] = (*in_ptr++);
89  Sum[1] = (*in_ptr++);
90 
91  uint32_t half_points = num_points / 2;
92 
93  for (uint32_t number = 1; number < half_points; number++) {
94  float Val0 = (*in_ptr++);
95  float Val1 = (*in_ptr++);
96  float n = (float)number;
97  float n_plus_one = n + 1.f;
98  float r = 1.f / (n * n_plus_one);
99 
100  Sum[0] += Val0;
101  Sum[1] += Val1;
102 
103  SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
104  SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
105  }
106 
107  SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
108  Sum[0] += Sum[1];
109 
110  uint32_t points_done = half_points * 2;
111 
112  for (; points_done < num_points; points_done++) {
113  float Val = (*in_ptr++);
114  float n = (float)points_done;
115  float n_plus_one = n + 1.f;
116  float r = 1.f / (n * n_plus_one);
117  Sum[0] += Val;
118  SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
119  }
120  *stddev = sqrtf(SquareSum[0] / num_points);
121  *mean = Sum[0] / num_points;
122 }
123 #endif /* LV_HAVE_GENERIC */
124 
125 static inline float update_square_sum_1_val(const float SquareSum,
126  const float Sum,
127  const uint32_t len,
128  const float val)
129 {
130  // Updates a sum of squares calculated over len values with the value val
131  float n = (float)len;
132  float n_plus_one = n + 1.f;
133  return SquareSum +
134  1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
135 }
136 
137 static inline float add_square_sums(const float SquareSum0,
138  const float Sum0,
139  const float SquareSum1,
140  const float Sum1,
141  const uint32_t len)
142 {
143  // Add two sums of squares calculated over the same number of values, len
144  float n = (float)len;
145  return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
146 }
147 
148 static inline void accrue_result(float* PartialSquareSums,
149  float* PartialSums,
150  const uint32_t NumberOfPartitions,
151  const uint32_t PartitionLen)
152 {
153  // Add all partial sums and square sums into the first element of the arrays
154  uint32_t accumulators = NumberOfPartitions;
155  uint32_t stages = 0;
156  uint32_t offset = 1;
157  uint32_t partition_len = PartitionLen;
158 
159  while (accumulators >>= 1) {
160  stages++;
161  } // Integer log2
162  accumulators = NumberOfPartitions;
163 
164  for (uint32_t s = 0; s < stages; s++) {
165  accumulators /= 2;
166  uint32_t idx = 0;
167  for (uint32_t a = 0; a < accumulators; a++) {
168  PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
169  PartialSums[idx],
170  PartialSquareSums[idx + offset],
171  PartialSums[idx + offset],
172  partition_len);
173  PartialSums[idx] += PartialSums[idx + offset];
174  idx += 2 * offset;
175  }
176  offset *= 2;
177  partition_len *= 2;
178  }
179 }
180 
181 #ifdef LV_HAVE_NEON
182 #include <arm_neon.h>
184 
185 static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
186  float* mean,
187  const float* inputBuffer,
188  unsigned int num_points)
189 {
190  if (num_points < 8) {
191  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
192  return;
193  }
194 
195  const float* in_ptr = inputBuffer;
196 
197  __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
198  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
199 
200  const uint32_t eigth_points = num_points / 8;
201 
202  float32x4_t Sum0, Sum1;
203 
204  Sum0 = vld1q_f32((const float32_t*)in_ptr);
205  in_ptr += 4;
206  __VOLK_PREFETCH(in_ptr + 4);
207 
208  Sum1 = vld1q_f32((const float32_t*)in_ptr);
209  in_ptr += 4;
210  __VOLK_PREFETCH(in_ptr + 4);
211 
212  float32x4_t SquareSum0 = { 0.f };
213  float32x4_t SquareSum1 = { 0.f };
214 
215  float32x4_t Values0, Values1;
216  float32x4_t Aux0, Aux1;
217  float32x4_t Reciprocal;
218 
219  for (uint32_t number = 1; number < eigth_points; number++) {
220  Values0 = vld1q_f32(in_ptr);
221  in_ptr += 4;
222  __VOLK_PREFETCH(in_ptr + 4);
223 
224  Values1 = vld1q_f32(in_ptr);
225  in_ptr += 4;
226  __VOLK_PREFETCH(in_ptr + 4);
227 
228  float n = (float)number;
229  float n_plus_one = n + 1.f;
230  Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
231 
232  Sum0 = vaddq_f32(Sum0, Values0);
233  Aux0 = vdupq_n_f32(n_plus_one);
234  SquareSum0 =
235  _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
236 
237  Sum1 = vaddq_f32(Sum1, Values1);
238  Aux1 = vdupq_n_f32(n_plus_one);
239  SquareSum1 =
240  _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
241  }
242 
243  vst1q_f32(&SumLocal[0], Sum0);
244  vst1q_f32(&SumLocal[4], Sum1);
245  vst1q_f32(&SquareSumLocal[0], SquareSum0);
246  vst1q_f32(&SquareSumLocal[4], SquareSum1);
247 
248  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
249 
250  uint32_t points_done = eigth_points * 8;
251 
252  for (; points_done < num_points; points_done++) {
253  float val = (*in_ptr++);
254  SumLocal[0] += val;
255  SquareSumLocal[0] =
256  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
257  }
258 
259  *stddev = sqrtf(SquareSumLocal[0] / num_points);
260  *mean = SumLocal[0] / num_points;
261 }
262 #endif /* LV_HAVE_NEON */
263 
264 #ifdef LV_HAVE_SSE
266 #include <xmmintrin.h>
267 
268 static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
269  float* mean,
270  const float* inputBuffer,
271  unsigned int num_points)
272 {
273  if (num_points < 8) {
274  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
275  return;
276  }
277 
278  const float* in_ptr = inputBuffer;
279 
280  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
281  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
282 
283 
284  const uint32_t eigth_points = num_points / 8;
285 
286  __m128 Sum0 = _mm_loadu_ps(in_ptr);
287  in_ptr += 4;
288  __m128 Sum1 = _mm_loadu_ps(in_ptr);
289  in_ptr += 4;
290  __m128 SquareSum0 = _mm_setzero_ps();
291  __m128 SquareSum1 = _mm_setzero_ps();
292  __m128 Values0, Values1;
293  __m128 Aux0, Aux1;
294  __m128 Reciprocal;
295 
296  for (uint32_t number = 1; number < eigth_points; number++) {
297  Values0 = _mm_loadu_ps(in_ptr);
298  in_ptr += 4;
299  __VOLK_PREFETCH(in_ptr + 4);
300 
301  Values1 = _mm_loadu_ps(in_ptr);
302  in_ptr += 4;
303  __VOLK_PREFETCH(in_ptr + 4);
304 
305  float n = (float)number;
306  float n_plus_one = n + 1.f;
307  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
308 
309  Sum0 = _mm_add_ps(Sum0, Values0);
310  Aux0 = _mm_set_ps1(n_plus_one);
311  SquareSum0 =
312  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
313 
314  Sum1 = _mm_add_ps(Sum1, Values1);
315  Aux1 = _mm_set_ps1(n_plus_one);
316  SquareSum1 =
317  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
318  }
319 
320  _mm_store_ps(&SumLocal[0], Sum0);
321  _mm_store_ps(&SumLocal[4], Sum1);
322  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
323  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
324 
325  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
326 
327  uint32_t points_done = eigth_points * 8;
328 
329  for (; points_done < num_points; points_done++) {
330  float val = (*in_ptr++);
331  SumLocal[0] += val;
332  SquareSumLocal[0] =
333  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
334  }
335 
336  *stddev = sqrtf(SquareSumLocal[0] / num_points);
337  *mean = SumLocal[0] / num_points;
338 }
339 #endif /* LV_HAVE_SSE */
340 
341 #ifdef LV_HAVE_AVX
342 #include <immintrin.h>
344 
345 static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
346  float* mean,
347  const float* inputBuffer,
348  unsigned int num_points)
349 {
350  if (num_points < 16) {
351  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
352  return;
353  }
354 
355  const float* in_ptr = inputBuffer;
356 
357  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
358  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
359 
360  const unsigned int sixteenth_points = num_points / 16;
361 
362  __m256 Sum0 = _mm256_loadu_ps(in_ptr);
363  in_ptr += 8;
364  __m256 Sum1 = _mm256_loadu_ps(in_ptr);
365  in_ptr += 8;
366 
367  __m256 SquareSum0 = _mm256_setzero_ps();
368  __m256 SquareSum1 = _mm256_setzero_ps();
369  __m256 Values0, Values1;
370  __m256 Aux0, Aux1;
371  __m256 Reciprocal;
372 
373  for (uint32_t number = 1; number < sixteenth_points; number++) {
374  Values0 = _mm256_loadu_ps(in_ptr);
375  in_ptr += 8;
376  __VOLK_PREFETCH(in_ptr + 8);
377 
378  Values1 = _mm256_loadu_ps(in_ptr);
379  in_ptr += 8;
380  __VOLK_PREFETCH(in_ptr + 8);
381 
382  float n = (float)number;
383  float n_plus_one = n + 1.f;
384 
385  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
386 
387  Sum0 = _mm256_add_ps(Sum0, Values0);
388  Aux0 = _mm256_set1_ps(n_plus_one);
389  SquareSum0 =
390  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
391 
392  Sum1 = _mm256_add_ps(Sum1, Values1);
393  Aux1 = _mm256_set1_ps(n_plus_one);
394  SquareSum1 =
395  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
396  }
397 
398  _mm256_store_ps(&SumLocal[0], Sum0);
399  _mm256_store_ps(&SumLocal[8], Sum1);
400  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
401  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
402 
403  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
404 
405  uint32_t points_done = sixteenth_points * 16;
406 
407  for (; points_done < num_points; points_done++) {
408  float val = (*in_ptr++);
409  SumLocal[0] += val;
410  SquareSumLocal[0] =
411  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
412  }
413 
414  *stddev = sqrtf(SquareSumLocal[0] / num_points);
415  *mean = SumLocal[0] / num_points;
416 }
417 #endif /* LV_HAVE_AVX */
418 
419 #ifdef LV_HAVE_SSE
420 #include <xmmintrin.h>
421 
422 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
423  float* mean,
424  const float* inputBuffer,
425  unsigned int num_points)
426 {
427  if (num_points < 8) {
428  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
429  return;
430  }
431 
432  const float* in_ptr = inputBuffer;
433 
434  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
435  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
436 
437 
438  const uint32_t eigth_points = num_points / 8;
439 
440  __m128 Sum0 = _mm_load_ps(in_ptr);
441  in_ptr += 4;
442  __m128 Sum1 = _mm_load_ps(in_ptr);
443  in_ptr += 4;
444  __m128 SquareSum0 = _mm_setzero_ps();
445  __m128 SquareSum1 = _mm_setzero_ps();
446  __m128 Values0, Values1;
447  __m128 Aux0, Aux1;
448  __m128 Reciprocal;
449 
450  for (uint32_t number = 1; number < eigth_points; number++) {
451  Values0 = _mm_load_ps(in_ptr);
452  in_ptr += 4;
453  __VOLK_PREFETCH(in_ptr + 4);
454 
455  Values1 = _mm_load_ps(in_ptr);
456  in_ptr += 4;
457  __VOLK_PREFETCH(in_ptr + 4);
458 
459  float n = (float)number;
460  float n_plus_one = n + 1.f;
461  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
462 
463  Sum0 = _mm_add_ps(Sum0, Values0);
464  Aux0 = _mm_set_ps1(n_plus_one);
465  SquareSum0 =
466  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
467 
468  Sum1 = _mm_add_ps(Sum1, Values1);
469  Aux1 = _mm_set_ps1(n_plus_one);
470  SquareSum1 =
471  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
472  }
473 
474  _mm_store_ps(&SumLocal[0], Sum0);
475  _mm_store_ps(&SumLocal[4], Sum1);
476  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
477  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
478 
479  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
480 
481  uint32_t points_done = eigth_points * 8;
482 
483  for (; points_done < num_points; points_done++) {
484  float val = (*in_ptr++);
485  SumLocal[0] += val;
486  SquareSumLocal[0] =
487  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
488  }
489 
490  *stddev = sqrtf(SquareSumLocal[0] / num_points);
491  *mean = SumLocal[0] / num_points;
492 }
493 #endif /* LV_HAVE_SSE */
494 
495 #ifdef LV_HAVE_AVX
496 #include <immintrin.h>
497 
498 static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
499  float* mean,
500  const float* inputBuffer,
501  unsigned int num_points)
502 {
503  if (num_points < 16) {
504  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
505  return;
506  }
507 
508  const float* in_ptr = inputBuffer;
509 
510  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
511  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
512 
513  const unsigned int sixteenth_points = num_points / 16;
514 
515  __m256 Sum0 = _mm256_load_ps(in_ptr);
516  in_ptr += 8;
517  __m256 Sum1 = _mm256_load_ps(in_ptr);
518  in_ptr += 8;
519 
520  __m256 SquareSum0 = _mm256_setzero_ps();
521  __m256 SquareSum1 = _mm256_setzero_ps();
522  __m256 Values0, Values1;
523  __m256 Aux0, Aux1;
524  __m256 Reciprocal;
525 
526  for (uint32_t number = 1; number < sixteenth_points; number++) {
527  Values0 = _mm256_load_ps(in_ptr);
528  in_ptr += 8;
529  __VOLK_PREFETCH(in_ptr + 8);
530 
531  Values1 = _mm256_load_ps(in_ptr);
532  in_ptr += 8;
533  __VOLK_PREFETCH(in_ptr + 8);
534 
535  float n = (float)number;
536  float n_plus_one = n + 1.f;
537 
538  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
539 
540  Sum0 = _mm256_add_ps(Sum0, Values0);
541  Aux0 = _mm256_set1_ps(n_plus_one);
542  SquareSum0 =
543  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
544 
545  Sum1 = _mm256_add_ps(Sum1, Values1);
546  Aux1 = _mm256_set1_ps(n_plus_one);
547  SquareSum1 =
548  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
549  }
550 
551  _mm256_store_ps(&SumLocal[0], Sum0);
552  _mm256_store_ps(&SumLocal[8], Sum1);
553  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
554  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
555 
556  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
557 
558  uint32_t points_done = sixteenth_points * 16;
559 
560  for (; points_done < num_points; points_done++) {
561  float val = (*in_ptr++);
562  SumLocal[0] += val;
563  SquareSumLocal[0] =
564  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
565  }
566 
567  *stddev = sqrtf(SquareSumLocal[0] / num_points);
568  *mean = SumLocal[0] / num_points;
569 }
570 #endif /* LV_HAVE_AVX */
571 
572 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
val
Definition: volk_arch_defs.py:57
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:268
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:137
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:148
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:345
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:72
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:498
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:185
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:422
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:125
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:185
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:267
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:49