55 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
64 #include <smmintrin.h>
66 static inline void volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
67 const float* inputBuffer,
69 unsigned int num_points)
71 float returnValue = 0;
73 unsigned int number = 0;
74 const unsigned int sixteenthPoints = num_points / 16;
76 const float* aPtr = inputBuffer;
81 __m128 aVal1, aVal2, aVal3, aVal4;
82 __m128 cVal1, cVal2, cVal3, cVal4;
83 for (; number < sixteenthPoints; number++) {
109 returnValue = squareBuffer[0];
110 returnValue += squareBuffer[1];
111 returnValue += squareBuffer[2];
112 returnValue += squareBuffer[3];
114 number = sixteenthPoints * 16;
115 for (; number < num_points; number++) {
116 returnValue += (*aPtr) * (*aPtr);
119 returnValue /= num_points;
120 returnValue -= (mean * mean);
121 returnValue = sqrtf(returnValue);
123 *stddev = returnValue;
129 #include <xmmintrin.h>
132 const float* inputBuffer,
134 unsigned int num_points)
136 float returnValue = 0;
137 if (num_points > 0) {
138 unsigned int number = 0;
139 const unsigned int quarterPoints = num_points / 4;
141 const float* aPtr = inputBuffer;
147 for (; number < quarterPoints; number++) {
150 squareAccumulator =
_mm_add_ps(squareAccumulator, aVal);
155 returnValue = squareBuffer[0];
156 returnValue += squareBuffer[1];
157 returnValue += squareBuffer[2];
158 returnValue += squareBuffer[3];
160 number = quarterPoints * 4;
161 for (; number < num_points; number++) {
162 returnValue += (*aPtr) * (*aPtr);
165 returnValue /= num_points;
166 returnValue -= (mean * mean);
167 returnValue = sqrtf(returnValue);
169 *stddev = returnValue;
175 #include <immintrin.h>
178 const float* inputBuffer,
180 unsigned int num_points)
183 if (num_points > 0) {
184 unsigned int number = 0;
185 const unsigned int thirtySecondthPoints = num_points / 32;
187 const float* aPtr = inputBuffer;
190 __m256 squareAccumulator = _mm256_setzero_ps();
191 __m256 aVal1, aVal2, aVal3, aVal4;
192 __m256 cVal1, cVal2, cVal3, cVal4;
193 for (; number < thirtySecondthPoints; number++) {
194 aVal1 = _mm256_load_ps(aPtr);
196 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
198 aVal2 = _mm256_load_ps(aPtr);
200 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
202 aVal3 = _mm256_load_ps(aPtr);
204 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
206 aVal4 = _mm256_load_ps(aPtr);
208 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
210 cVal1 = _mm256_or_ps(cVal1, cVal2);
211 cVal3 = _mm256_or_ps(cVal3, cVal4);
212 cVal1 = _mm256_or_ps(cVal1, cVal3);
215 _mm256_add_ps(squareAccumulator, cVal1);
217 _mm256_store_ps(squareBuffer,
219 stdDev = squareBuffer[0];
220 stdDev += squareBuffer[1];
221 stdDev += squareBuffer[2];
222 stdDev += squareBuffer[3];
223 stdDev += squareBuffer[4];
224 stdDev += squareBuffer[5];
225 stdDev += squareBuffer[6];
226 stdDev += squareBuffer[7];
228 number = thirtySecondthPoints * 32;
229 for (; number < num_points; number++) {
230 stdDev += (*aPtr) * (*aPtr);
233 stdDev /= num_points;
234 stdDev -= (mean * mean);
235 stdDev = sqrtf(stdDev);
242 #ifdef LV_HAVE_GENERIC
245 const float* inputBuffer,
247 unsigned int num_points)
249 float returnValue = 0;
250 if (num_points > 0) {
251 const float* aPtr = inputBuffer;
252 unsigned int number = 0;
254 for (number = 0; number < num_points; number++) {
255 returnValue += (*aPtr) * (*aPtr);
259 returnValue /= num_points;
260 returnValue -= (mean * mean);
261 returnValue = sqrtf(returnValue);
263 *stddev = returnValue;
271 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
272 #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
274 #include <inttypes.h>
280 #include <immintrin.h>
283 const float* inputBuffer,
285 unsigned int num_points)
288 if (num_points > 0) {
289 unsigned int number = 0;
290 const unsigned int thirtySecondthPoints = num_points / 32;
292 const float* aPtr = inputBuffer;
295 __m256 squareAccumulator = _mm256_setzero_ps();
296 __m256 aVal1, aVal2, aVal3, aVal4;
297 __m256 cVal1, cVal2, cVal3, cVal4;
298 for (; number < thirtySecondthPoints; number++) {
299 aVal1 = _mm256_loadu_ps(aPtr);
301 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
303 aVal2 = _mm256_loadu_ps(aPtr);
305 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
307 aVal3 = _mm256_loadu_ps(aPtr);
309 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
311 aVal4 = _mm256_loadu_ps(aPtr);
313 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
315 cVal1 = _mm256_or_ps(cVal1, cVal2);
316 cVal3 = _mm256_or_ps(cVal3, cVal4);
317 cVal1 = _mm256_or_ps(cVal1, cVal3);
320 _mm256_add_ps(squareAccumulator, cVal1);
325 stdDev = squareBuffer[0];
326 stdDev += squareBuffer[1];
327 stdDev += squareBuffer[2];
328 stdDev += squareBuffer[3];
329 stdDev += squareBuffer[4];
330 stdDev += squareBuffer[5];
331 stdDev += squareBuffer[6];
332 stdDev += squareBuffer[7];
334 number = thirtySecondthPoints * 32;
335 for (; number < num_points; number++) {
336 stdDev += (*aPtr) * (*aPtr);
339 stdDev /= num_points;
340 stdDev -= (mean * mean);
341 stdDev = sqrtf(stdDev);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition: sse2neon.h:7701
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:177
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:131
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:282
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:244
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65