60 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
61 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
70 #ifdef LV_HAVE_GENERIC
74 const float* inputBuffer,
75 unsigned int num_points)
77 const float* in_ptr = inputBuffer;
78 if (num_points == 0) {
80 }
else if (num_points == 1) {
87 float SquareSum[2] = { 0.f, 0.f };
91 uint32_t half_points = num_points / 2;
93 for (uint32_t number = 1; number < half_points; number++) {
94 float Val0 = (*in_ptr++);
95 float Val1 = (*in_ptr++);
96 float n = (float)number;
97 float n_plus_one = n + 1.f;
98 float r = 1.f / (n * n_plus_one);
103 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
104 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
107 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
110 uint32_t points_done = half_points * 2;
112 for (; points_done < num_points; points_done++) {
113 float Val = (*in_ptr++);
114 float n = (float)points_done;
115 float n_plus_one = n + 1.f;
116 float r = 1.f / (n * n_plus_one);
118 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
120 *stddev = sqrtf(SquareSum[0] / num_points);
121 *mean = Sum[0] / num_points;
131 float n = (float)len;
132 float n_plus_one = n + 1.f;
134 1.f / (n * n_plus_one) * (n_plus_one *
val - Sum) * (n_plus_one *
val - Sum);
139 const float SquareSum1,
144 float n = (float)len;
145 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
150 const uint32_t NumberOfPartitions,
151 const uint32_t PartitionLen)
154 uint32_t accumulators = NumberOfPartitions;
157 uint32_t partition_len = PartitionLen;
159 while (accumulators >>= 1) {
162 accumulators = NumberOfPartitions;
164 for (uint32_t s = 0; s < stages; s++) {
167 for (uint32_t a = 0; a < accumulators; a++) {
170 PartialSquareSums[idx + offset],
171 PartialSums[idx + offset],
173 PartialSums[idx] += PartialSums[idx + offset];
182 #include <arm_neon.h>
187 const float* inputBuffer,
188 unsigned int num_points)
190 if (num_points < 8) {
195 const float* in_ptr = inputBuffer;
200 const uint32_t eigth_points = num_points / 8;
202 float32x4_t Sum0, Sum1;
204 Sum0 = vld1q_f32((
const float32_t*)in_ptr);
208 Sum1 = vld1q_f32((
const float32_t*)in_ptr);
212 float32x4_t SquareSum0 = { 0.f };
213 float32x4_t SquareSum1 = { 0.f };
215 float32x4_t Values0, Values1;
216 float32x4_t Aux0, Aux1;
217 float32x4_t Reciprocal;
219 for (uint32_t number = 1; number < eigth_points; number++) {
220 Values0 = vld1q_f32(in_ptr);
224 Values1 = vld1q_f32(in_ptr);
228 float n = (float)number;
229 float n_plus_one = n + 1.f;
230 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
232 Sum0 = vaddq_f32(Sum0, Values0);
233 Aux0 = vdupq_n_f32(n_plus_one);
237 Sum1 = vaddq_f32(Sum1, Values1);
238 Aux1 = vdupq_n_f32(n_plus_one);
243 vst1q_f32(&SumLocal[0], Sum0);
244 vst1q_f32(&SumLocal[4], Sum1);
245 vst1q_f32(&SquareSumLocal[0], SquareSum0);
246 vst1q_f32(&SquareSumLocal[4], SquareSum1);
250 uint32_t points_done = eigth_points * 8;
252 for (; points_done < num_points; points_done++) {
253 float val = (*in_ptr++);
259 *stddev = sqrtf(SquareSumLocal[0] / num_points);
260 *mean = SumLocal[0] / num_points;
266 #include <xmmintrin.h>
270 const float* inputBuffer,
271 unsigned int num_points)
273 if (num_points < 8) {
278 const float* in_ptr = inputBuffer;
284 const uint32_t eigth_points = num_points / 8;
296 for (uint32_t number = 1; number < eigth_points; number++) {
305 float n = (float)number;
306 float n_plus_one = n + 1.f;
327 uint32_t points_done = eigth_points * 8;
329 for (; points_done < num_points; points_done++) {
330 float val = (*in_ptr++);
336 *stddev = sqrtf(SquareSumLocal[0] / num_points);
337 *mean = SumLocal[0] / num_points;
342 #include <immintrin.h>
347 const float* inputBuffer,
348 unsigned int num_points)
350 if (num_points < 16) {
355 const float* in_ptr = inputBuffer;
360 const unsigned int sixteenth_points = num_points / 16;
362 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
364 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
367 __m256 SquareSum0 = _mm256_setzero_ps();
368 __m256 SquareSum1 = _mm256_setzero_ps();
369 __m256 Values0, Values1;
373 for (uint32_t number = 1; number < sixteenth_points; number++) {
374 Values0 = _mm256_loadu_ps(in_ptr);
378 Values1 = _mm256_loadu_ps(in_ptr);
382 float n = (float)number;
383 float n_plus_one = n + 1.f;
385 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
387 Sum0 = _mm256_add_ps(Sum0, Values0);
388 Aux0 = _mm256_set1_ps(n_plus_one);
392 Sum1 = _mm256_add_ps(Sum1, Values1);
393 Aux1 = _mm256_set1_ps(n_plus_one);
398 _mm256_store_ps(&SumLocal[0], Sum0);
399 _mm256_store_ps(&SumLocal[8], Sum1);
400 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
401 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
403 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
405 uint32_t points_done = sixteenth_points * 16;
407 for (; points_done < num_points; points_done++) {
408 float val = (*in_ptr++);
414 *stddev = sqrtf(SquareSumLocal[0] / num_points);
415 *mean = SumLocal[0] / num_points;
420 #include <xmmintrin.h>
424 const float* inputBuffer,
425 unsigned int num_points)
427 if (num_points < 8) {
432 const float* in_ptr = inputBuffer;
438 const uint32_t eigth_points = num_points / 8;
450 for (uint32_t number = 1; number < eigth_points; number++) {
459 float n = (float)number;
460 float n_plus_one = n + 1.f;
481 uint32_t points_done = eigth_points * 8;
483 for (; points_done < num_points; points_done++) {
484 float val = (*in_ptr++);
490 *stddev = sqrtf(SquareSumLocal[0] / num_points);
491 *mean = SumLocal[0] / num_points;
496 #include <immintrin.h>
500 const float* inputBuffer,
501 unsigned int num_points)
503 if (num_points < 16) {
508 const float* in_ptr = inputBuffer;
513 const unsigned int sixteenth_points = num_points / 16;
515 __m256 Sum0 = _mm256_load_ps(in_ptr);
517 __m256 Sum1 = _mm256_load_ps(in_ptr);
520 __m256 SquareSum0 = _mm256_setzero_ps();
521 __m256 SquareSum1 = _mm256_setzero_ps();
522 __m256 Values0, Values1;
526 for (uint32_t number = 1; number < sixteenth_points; number++) {
527 Values0 = _mm256_load_ps(in_ptr);
531 Values1 = _mm256_load_ps(in_ptr);
535 float n = (float)number;
536 float n_plus_one = n + 1.f;
538 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
540 Sum0 = _mm256_add_ps(Sum0, Values0);
541 Aux0 = _mm256_set1_ps(n_plus_one);
545 Sum1 = _mm256_add_ps(Sum1, Values1);
546 Aux1 = _mm256_set1_ps(n_plus_one);
551 _mm256_store_ps(&SumLocal[0], Sum0);
552 _mm256_store_ps(&SumLocal[8], Sum1);
553 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
554 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
556 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
558 uint32_t points_done = sixteenth_points * 16;
560 for (; points_done < num_points; points_done++) {
561 float val = (*in_ptr++);
567 *stddev = sqrtf(SquareSumLocal[0] / num_points);
568 *mean = SumLocal[0] / num_points;
val
Definition: volk_arch_defs.py:57
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:268
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:137
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:148
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:345
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:72
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:498
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:185
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:422
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:125
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:185
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:267
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:49