52 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53 #define INCLUDED_volk_32f_sqrt_32f_a_H
60 #include <xmmintrin.h>
/*
 * Fragment of the aligned SSE square-root kernel. The listing's
 * cross-reference index names it volk_32f_sqrt_32f_a_sse(float *cVector,
 * const float *aVector, unsigned int num_points).
 * The signature, the body of the vector loop and the closing braces are not
 * present in this excerpt.
 * NOTE(review): the index lists _mm_load_ps / _mm_sqrt_ps / _mm_store_ps,
 * which are presumably what the missing loop body uses -- confirm against
 * the full header.
 */
65 unsigned int number = 0;
/* Number of complete groups of four floats in the input. */
66 const unsigned int quarterPoints = num_points / 4;
/* Working cursors: cPtr walks the output buffer, aPtr the input buffer. */
68 float* cPtr = cVector;
69 const float* aPtr = aVector;
/* Vector loop, one iteration per group of four (body missing from this excerpt). */
72 for (; number < quarterPoints; number++) {
/* Restart the index at the first element not covered by a full group of four. */
83 number = quarterPoints * 4;
/*
 * Scalar loop over the remaining indices up to num_points: one sqrtf per
 * element, read through aPtr and written through cPtr.
 */
84 for (; number < num_points; number++) {
85 *cPtr++ = sqrtf(*aPtr++);
92 #include <immintrin.h>
/*
 * Fragment of the aligned AVX square-root kernel. The listing's
 * cross-reference index names it volk_32f_sqrt_32f_a_avx(float *cVector,
 * const float *aVector, unsigned int num_points).
 * The signature, the declarations of aVal/cVal, the per-iteration pointer
 * advance and the closing braces are not present in this excerpt.
 */
97 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
98 const unsigned int eighthPoints = num_points / 8;
/* Working cursors: cPtr walks the output buffer, aPtr the input buffer. */
100 float* cPtr = cVector;
101 const float* aPtr = aVector;
/* Vector loop: one 256-bit register (eight floats) per iteration. */
104 for (; number < eighthPoints; number++) {
/* Aligned load: _mm256_load_ps requires aPtr to be 32-byte aligned. */
105 aVal = _mm256_load_ps(aPtr);
/* Square root of all eight lanes at once. */
107 cVal = _mm256_sqrt_ps(aVal);
/* Aligned store: _mm256_store_ps requires cPtr to be 32-byte aligned. */
109 _mm256_store_ps(cPtr, cVal);
/*
 * NOTE(review): the lines that advance aPtr/cPtr (presumably by 8) are
 * missing from this excerpt -- confirm against the full header.
 */
/* Restart the index at the first element not covered by a full group of eight. */
115 number = eighthPoints * 8;
/* Scalar loop over the remaining indices up to num_points, one sqrtf per element. */
116 for (; number < num_points; number++) {
117 *cPtr++ = sqrtf(*aPtr++);
125 #include <arm_neon.h>
/*
 * Fragment of the NEON square-root kernel. The listing's cross-reference
 * index names it volk_32f_sqrt_32f_neon(float *cVector,
 * const float *aVector, unsigned int num_points).
 * The signature, the per-iteration pointer advance and the closing braces
 * are not present in this excerpt.
 */
/* Working cursors: cPtr walks the output buffer, aPtr the input buffer. */
130 float* cPtr = cVector;
131 const float* aPtr = aVector;
/* The initial value is redundant: both loops below assign number themselves. */
132 unsigned int number = 0;
/* Number of complete groups of four floats in the input. */
133 unsigned int quarter_points = num_points / 4;
134 float32x4_t in_vec, out_vec;
/* Vector loop: four floats per iteration. */
136 for (number = 0; number < quarter_points; number++) {
137 in_vec = vld1q_f32(aPtr);
/*
 * sqrt(x) is formed as 1 / (1 / sqrt(x)) from two *estimate* intrinsics:
 * vrsqrteq_f32 (reciprocal square-root estimate) followed by vrecpeq_f32
 * (reciprocal estimate), with no refinement step.
 * NOTE(review): the vector lanes are therefore only approximate, while the
 * scalar tail below uses sqrtf, so results within one call differ in
 * accuracy. Consider Newton-Raphson refinement (vrsqrtsq_f32 / vrecpsq_f32)
 * or vsqrtq_f32 where available.
 */
139 out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
140 vst1q_f32(cPtr, out_vec);
/*
 * NOTE(review): the lines that advance aPtr/cPtr (presumably by 4) are
 * missing from this excerpt -- confirm against the full header.
 */
/* Scalar loop over the remaining indices up to num_points, one sqrtf per element. */
145 for (number = quarter_points * 4; number < num_points; number++) {
146 *cPtr++ = sqrtf(*aPtr++);
/*
 * Fragment of the portable C square-root kernel, compiled only when
 * LV_HAVE_GENERIC is defined. The listing's cross-reference index names it
 * volk_32f_sqrt_32f_generic(float *cVector, const float *aVector,
 * unsigned int num_points).
 * The signature, the closing braces and the matching #endif are not present
 * in this excerpt.
 */
153 #ifdef LV_HAVE_GENERIC
/* Working cursors: cPtr walks the output buffer, aPtr the input buffer. */
158 float* cPtr = cVector;
159 const float* aPtr = aVector;
/* The initial value is redundant: the loop below assigns number itself. */
160 unsigned int number = 0;
/* Write sqrtf of each of the num_points input elements to the output, in order. */
162 for (number = 0; number < num_points; number++) {
163 *cPtr++ = sqrtf(*aPtr++);
/*
 * Declaration of the externally defined ORC implementation. The parameters
 * are unnamed here; the wrapper below passes (cVector, aVector, num_points)
 * in that order.
 */
172 extern void volk_32f_sqrt_32f_a_orc_impl(
float*,
const float*,
unsigned int);
/*
 * Fragment of the ORC wrapper: it forwards its three arguments unchanged to
 * volk_32f_sqrt_32f_a_orc_impl. The return type, the braces and any
 * surrounding #ifdef are not present in this excerpt.
 * NOTE(review): the wrapper is named "_u_orc" while the implementation it
 * calls is named "_a_orc_impl" -- confirm this naming is intentional.
 */
175 volk_32f_sqrt_32f_u_orc(
float* cVector,
const float* aVector,
unsigned int num_points)
177 volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
184 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
185 #define INCLUDED_volk_32f_sqrt_32f_u_H
187 #include <inttypes.h>
191 #include <immintrin.h>
/*
 * Fragment of the unaligned AVX square-root kernel. The listing's
 * cross-reference index names it volk_32f_sqrt_32f_u_avx(float *cVector,
 * const float *aVector, unsigned int num_points).
 * The signature, the declarations of aVal/cVal, the per-iteration pointer
 * advance and the closing braces are not present in this excerpt.
 */
196 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
197 const unsigned int eighthPoints = num_points / 8;
/* Working cursors: cPtr walks the output buffer, aPtr the input buffer. */
199 float* cPtr = cVector;
200 const float* aPtr = aVector;
/* Vector loop: one 256-bit register (eight floats) per iteration. */
203 for (; number < eighthPoints; number++) {
/* Unaligned load: _mm256_loadu_ps places no alignment requirement on aPtr. */
204 aVal = _mm256_loadu_ps(aPtr);
/* Square root of all eight lanes at once. */
206 cVal = _mm256_sqrt_ps(aVal);
/* Unaligned store: _mm256_storeu_ps places no alignment requirement on cPtr. */
208 _mm256_storeu_ps(cPtr, cVal);
/*
 * NOTE(review): the lines that advance aPtr/cPtr (presumably by 8) are
 * missing from this excerpt -- confirm against the full header.
 */
/* Restart the index at the first element not covered by a full group of eight. */
214 number = eighthPoints * 8;
/* Scalar loop over the remaining indices up to num_points, one sqrtf per element. */
215 for (; number < num_points; number++) {
216 *cPtr++ = sqrtf(*aPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:128
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:95
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:63
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:194
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:156