52 #ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53 #define INCLUDED_volk_32f_invsqrt_32f_a_H
63 const float threehalfs = 1.5F;
71 u.i = 0x5f3759df - (u.i >> 1);
72 u.f = u.f * (threehalfs - (x2 * u.f * u.f));
80 #include <immintrin.h>
85 unsigned int number = 0;
86 const unsigned int eighthPoints = num_points / 8;
88 float* cPtr = cVector;
89 const float* aPtr = aVector;
91 for (; number < eighthPoints; number++) {
92 aVal = _mm256_load_ps(aPtr);
93 cVal = _mm256_rsqrt_ps(aVal);
94 _mm256_store_ps(cPtr, cVal);
99 number = eighthPoints * 8;
100 for (; number < num_points; number++)
107 #include <xmmintrin.h>
112 unsigned int number = 0;
113 const unsigned int quarterPoints = num_points / 4;
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
119 for (; number < quarterPoints; number++) {
131 number = quarterPoints * 4;
132 for (; number < num_points; number++) {
140 #include <arm_neon.h>
146 const unsigned int quarter_points = num_points / 4;
148 float* cPtr = cVector;
149 const float* aPtr = aVector;
150 float32x4_t a_val, c_val;
151 for (number = 0; number < quarter_points; ++number) {
152 a_val = vld1q_f32(aPtr);
153 c_val = vrsqrteq_f32(a_val);
154 vst1q_f32(cPtr, c_val);
159 for (number = quarter_points * 4; number < num_points; number++)
165 #ifdef LV_HAVE_GENERIC
168 const float* aVector,
169 unsigned int num_points)
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
174 for (number = 0; number < num_points; number++) {
181 #include <immintrin.h>
186 unsigned int number = 0;
187 const unsigned int eighthPoints = num_points / 8;
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_loadu_ps(aPtr);
194 cVal = _mm256_rsqrt_ps(aVal);
195 _mm256_storeu_ps(cPtr, cVal);
200 number = eighthPoints * 8;
201 for (; number < num_points; number++)
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition: sse2neon.h:2359
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_invsqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:143
static void volk_32f_invsqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:83
static void volk_32f_invsqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:167
static void volk_32f_invsqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:110
static void volk_32f_invsqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:184
static float Q_rsqrt(float number)
Definition: volk_32f_invsqrt_32f.h:60
for i
Definition: volk_config_fixed.tmpl.h:13