Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_sqrt_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53 #define INCLUDED_volk_32f_sqrt_32f_a_H
54 
55 #include <inttypes.h>
56 #include <math.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_SSE
60 #include <xmmintrin.h>
61 
62 static inline void
63 volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64 {
65  unsigned int number = 0;
66  const unsigned int quarterPoints = num_points / 4;
67 
68  float* cPtr = cVector;
69  const float* aPtr = aVector;
70 
71  __m128 aVal, cVal;
72  for (; number < quarterPoints; number++) {
73  aVal = _mm_load_ps(aPtr);
74 
75  cVal = _mm_sqrt_ps(aVal);
76 
77  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
78 
79  aPtr += 4;
80  cPtr += 4;
81  }
82 
83  number = quarterPoints * 4;
84  for (; number < num_points; number++) {
85  *cPtr++ = sqrtf(*aPtr++);
86  }
87 }
88 
89 #endif /* LV_HAVE_SSE */
90 
91 #ifdef LV_HAVE_AVX
92 #include <immintrin.h>
93 
94 static inline void
95 volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
96 {
97  unsigned int number = 0;
98  const unsigned int eighthPoints = num_points / 8;
99 
100  float* cPtr = cVector;
101  const float* aPtr = aVector;
102 
103  __m256 aVal, cVal;
104  for (; number < eighthPoints; number++) {
105  aVal = _mm256_load_ps(aPtr);
106 
107  cVal = _mm256_sqrt_ps(aVal);
108 
109  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
110 
111  aPtr += 8;
112  cPtr += 8;
113  }
114 
115  number = eighthPoints * 8;
116  for (; number < num_points; number++) {
117  *cPtr++ = sqrtf(*aPtr++);
118  }
119 }
120 
121 #endif /* LV_HAVE_AVX */
122 
123 
124 #ifdef LV_HAVE_NEON
125 #include <arm_neon.h>
126 
/*!
 * \brief Element-wise square root of a float vector using NEON.
 *
 * Four floats are processed per iteration; the remaining (num_points % 4)
 * elements are computed one at a time with sqrtf().
 *
 * On AArch64 the vector path uses vsqrtq_f32, so vector lanes and the scalar
 * tail both produce a true square root. On 32-bit ARM, which has no vector
 * square-root instruction, the result is approximated as the reciprocal
 * estimate of the reciprocal-square-root estimate; that path has noticeably
 * lower precision than sqrtf().
 *
 * \param cVector    Output buffer (no alignment requirement is imposed here).
 * \param aVector    Input buffer (no alignment requirement is imposed here).
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);
#if defined(__aarch64__) || defined(_M_ARM64)
        /* ARMv8 (A64) provides a real vector square root. */
        const float32x4_t out_vec = vsqrtq_f32(in_vec);
#else
        /* ARMv7 fallback: 1 / (1 / sqrt(x)) built from two hardware estimates.
         * This form also maps an input of 0 to 0 (rsqrte(0) = inf, recpe(inf) = 0). */
        const float32x4_t out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
#endif
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the leftover 0..3 samples. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
149 
150 #endif /* LV_HAVE_NEON */
151 
152 
153 #ifdef LV_HAVE_GENERIC
154 
155 static inline void
156 volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157 {
158  float* cPtr = cVector;
159  const float* aPtr = aVector;
160  unsigned int number = 0;
161 
162  for (number = 0; number < num_points; number++) {
163  *cPtr++ = sqrtf(*aPtr++);
164  }
165 }
166 
167 #endif /* LV_HAVE_GENERIC */
168 
169 
170 #ifdef LV_HAVE_ORC
171 
/*!
 * \brief Prototype of the ORC implementation; its definition lives outside this header.
 */
extern void volk_32f_sqrt_32f_a_orc_impl(float* cVector,
                                         const float* aVector,
                                         unsigned int num_points);

/*!
 * \brief ORC entry point: forwards all three arguments, unchanged, to
 *        volk_32f_sqrt_32f_a_orc_impl().
 *
 * \param cVector    Output buffer passed through to the ORC routine.
 * \param aVector    Input buffer passed through to the ORC routine.
 * \param num_points Element count passed through to the ORC routine.
 */
static inline void
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
{
    volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
}
179 
180 #endif /* LV_HAVE_ORC */
181 
182 #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
183 
184 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
185 #define INCLUDED_volk_32f_sqrt_32f_u_H
186 
187 #include <inttypes.h>
188 #include <math.h>
189 #include <stdio.h>
190 #ifdef LV_HAVE_AVX
191 #include <immintrin.h>
192 
193 static inline void
194 volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
195 {
196  unsigned int number = 0;
197  const unsigned int eighthPoints = num_points / 8;
198 
199  float* cPtr = cVector;
200  const float* aPtr = aVector;
201 
202  __m256 aVal, cVal;
203  for (; number < eighthPoints; number++) {
204  aVal = _mm256_loadu_ps(aPtr);
205 
206  cVal = _mm256_sqrt_ps(aVal);
207 
208  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
209 
210  aPtr += 8;
211  cPtr += 8;
212  }
213 
214  number = eighthPoints * 8;
215  for (; number < num_points; number++) {
216  *cPtr++ = sqrtf(*aPtr++);
217  }
218 }
219 
220 #endif /* LV_HAVE_AVX */
221 #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:128
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:95
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:63
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:194
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:156