58 #ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
59 #define INCLUDED_volk_32f_x2_divide_32f_a_H
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
/*
 * Aligned AVX-512F kernel: writes aVector[i] / bVector[i] into cVector[i]
 * for num_points elements, 16 floats per vector iteration plus a scalar tail.
 *
 * NOTE(review): this listing skips source lines. The aVector/bVector
 * parameters used below, the braces, and any per-iteration pointer advances
 * are not visible here; the comments describe only the statements shown.
 */
67 static inline void volk_32f_x2_divide_32f_a_avx512f(
float* cVector,
70 unsigned int num_points)
72 unsigned int number = 0;
/* Count of complete 16-float groups processed by the vector loop. */
73 const unsigned int sixteenthPoints = num_points / 16;
/* Working cursors; the caller's pointers are left untouched. */
75 float* cPtr = cVector;
76 const float* aPtr = aVector;
77 const float* bPtr = bVector;
79 __m512 aVal, bVal, cVal;
80 for (; number < sixteenthPoints; number++) {
/* _mm512_load_ps is the aligned load: addresses must be 64-byte aligned. */
81 aVal = _mm512_load_ps(aPtr);
82 bVal = _mm512_load_ps(bPtr);
/* Lane-wise quotient a / b for all 16 lanes. */
84 cVal = _mm512_div_ps(aVal, bVal);
/* Aligned store, same 64-byte alignment requirement on cPtr. */
86 _mm512_store_ps(cPtr, cVal);
/*
 * Scalar tail: handles the num_points % 16 elements left after the vector
 * loop. It continues from the current cursors, so it presumably relies on
 * them having been advanced by 16 per iteration above (not visible here).
 */
93 number = sixteenthPoints * 16;
94 for (; number < num_points; number++) {
95 *cPtr++ = (*aPtr++) / (*bPtr++);
102 #include <immintrin.h>
/*
 * Aligned AVX kernel: element-wise cVector[i] = aVector[i] / bVector[i],
 * 8 floats per vector iteration plus a scalar tail.
 *
 * NOTE(review): the line carrying the function name and the cVector
 * parameter, the braces, and any pointer advances inside the loop are not
 * visible in this listing; only the statements shown are described.
 */
105 const float* aVector,
106 const float* bVector,
107 unsigned int num_points)
109 unsigned int number = 0;
/* Count of complete 8-float groups processed by the vector loop. */
110 const unsigned int eighthPoints = num_points / 8;
/* Working cursors over the output and the two inputs. */
112 float* cPtr = cVector;
113 const float* aPtr = aVector;
114 const float* bPtr = bVector;
116 __m256 aVal, bVal, cVal;
117 for (; number < eighthPoints; number++) {
/* _mm256_load_ps is the aligned load: addresses must be 32-byte aligned. */
118 aVal = _mm256_load_ps(aPtr);
119 bVal = _mm256_load_ps(bPtr);
/* Lane-wise quotient a / b for all 8 lanes. */
121 cVal = _mm256_div_ps(aVal, bVal);
/* Aligned store, same 32-byte alignment requirement on cPtr. */
123 _mm256_store_ps(cPtr, cVal);
/*
 * Scalar tail for the num_points % 8 leftover elements. It continues from
 * the current cursors, so it presumably relies on them having been advanced
 * by 8 per vector iteration (those statements are not visible here).
 */
130 number = eighthPoints * 8;
131 for (; number < num_points; number++) {
132 *cPtr++ = (*aPtr++) / (*bPtr++);
139 #include <xmmintrin.h>
/*
 * Aligned SSE kernel: element-wise cVector[i] = aVector[i] / bVector[i],
 * with a vector loop over groups of 4 floats followed by a scalar tail.
 *
 * NOTE(review): the line carrying the function name and the cVector
 * parameter, the braces, the vector declarations and the entire body of the
 * vector loop (listing lines 155-165) are not visible here.
 */
142 const float* aVector,
143 const float* bVector,
144 unsigned int num_points)
146 unsigned int number = 0;
/* Count of complete 4-float groups processed by the vector loop. */
147 const unsigned int quarterPoints = num_points / 4;
/* Working cursors over the output and the two inputs. */
149 float* cPtr = cVector;
150 const float* aPtr = aVector;
151 const float* bPtr = bVector;
/* Vector loop over 4-float groups; its body is elided from this listing. */
154 for (; number < quarterPoints; number++) {
/*
 * Scalar tail for the num_points % 4 leftover elements. It continues from
 * the current cursors, so it presumably relies on the elided loop body
 * having advanced them by 4 per iteration - TODO confirm against the source.
 */
167 number = quarterPoints * 4;
168 for (; number < num_points; number++) {
169 *cPtr++ = (*aPtr++) / (*bPtr++);
176 #include <arm_neon.h>
/*
 * NEON kernel: element-wise cVector[i] = aVector[i] / bVector[i], 16 floats
 * per vector iteration plus a scalar tail.
 *
 * The vector path does not divide directly: it builds 1/b from a reciprocal
 * estimate refined by two Newton-Raphson steps and multiplies by a, so its
 * results may differ slightly from the true quotient computed by the tail.
 *
 * NOTE(review): the line carrying the function name and the cVector
 * parameter, the braces, and any pointer advances inside the loop are not
 * visible in this listing; only the statements shown are described.
 */
179 const float* aVector,
180 const float* bVector,
181 unsigned int num_points)
/* Working cursors over the output and the two inputs. */
183 float* cPtr = cVector;
184 const float* aPtr = aVector;
185 const float* bPtr = bVector;
/* Each float32x4x4_t holds four 4-lane vectors, i.e. 16 floats. */
187 float32x4x4_t aVal, bVal, bInv, cVal;
/* NOTE(review): despite its name, eighthPoints counts groups of 16 floats. */
189 const unsigned int eighthPoints = num_points / 16;
190 unsigned int number = 0;
191 for (; number < eighthPoints; number++) {
/*
 * vld4q_f32 reads 16 consecutive floats and de-interleaves them with
 * stride 4 into val[0..3]. a and b are loaded the same way and the result
 * is written back with vst4q_f32, so element order is preserved.
 */
192 aVal = vld4q_f32(aPtr);
194 bVal = vld4q_f32(bPtr);
/*
 * For each of the four sub-vectors: start from the hardware reciprocal
 * estimate x ~= 1/b, then refine twice with x = x * (2 - x*b), where
 * vrecpsq_f32(x, b) supplies the (2 - x*b) factor; finally c = a * x.
 */
200 bInv.val[0] = vrecpeq_f32(bVal.val[0]);
201 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
202 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
203 cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
205 bInv.val[1] = vrecpeq_f32(bVal.val[1]);
206 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
207 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
208 cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
210 bInv.val[2] = vrecpeq_f32(bVal.val[2]);
211 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
212 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
213 cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
215 bInv.val[3] = vrecpeq_f32(bVal.val[3]);
216 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
217 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
218 cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
/* Re-interleave the four result vectors and store 16 floats at cPtr. */
220 vst4q_f32(cPtr, cVal);
/*
 * Scalar tail for the num_points % 16 leftover elements, using true
 * division. It continues from the current cursors, so it presumably relies
 * on them having been advanced by 16 per vector iteration (not visible here).
 */
224 for (number = eighthPoints * 16; number < num_points; number++) {
225 *cPtr++ = (*aPtr++) / (*bPtr++);
232 #ifdef LV_HAVE_GENERIC
/**
 * Portable scalar kernel: divides two float vectors element by element.
 *
 * Computes cVector[i] = aVector[i] / bVector[i] for every i in
 * [0, num_points). Nothing is read or written when num_points is 0.
 * No check is made for zero divisors.
 *
 * @param cVector    Output buffer with room for num_points floats.
 * @param aVector    Dividends, num_points floats.
 * @param bVector    Divisors, num_points floats.
 * @param num_points Number of elements to process.
 */
static inline void volk_32f_x2_divide_32f_generic(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Walk local cursors so the caller-visible pointers stay untouched. */
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
/**
 * Division kernel implemented outside this file; only its prototype is
 * declared here so the wrapper below can call it.
 */
extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points);

/**
 * ORC entry point: forwards all arguments unchanged to
 * volk_32f_x2_divide_32f_a_orc_impl().
 *
 * @param cVector    Output buffer, passed through as-is.
 * @param aVector    Dividends, passed through as-is.
 * @param bVector    Divisors, passed through as-is.
 * @param num_points Number of elements, passed through as-is.
 */
static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
271 #ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
272 #define INCLUDED_volk_32f_x2_divide_32f_u_H
274 #include <inttypes.h>
277 #ifdef LV_HAVE_AVX512F
278 #include <immintrin.h>
/*
 * Unaligned AVX-512F kernel: writes aVector[i] / bVector[i] into cVector[i]
 * for num_points elements, 16 floats per vector iteration plus a scalar tail.
 *
 * NOTE(review): this listing skips source lines. The braces and any
 * per-iteration pointer advances are not visible here; the comments describe
 * only the statements shown.
 */
280 static inline void volk_32f_x2_divide_32f_u_avx512f(
float* cVector,
281 const float* aVector,
282 const float* bVector,
283 unsigned int num_points)
285 unsigned int number = 0;
/* Count of complete 16-float groups processed by the vector loop. */
286 const unsigned int sixteenthPoints = num_points / 16;
/* Working cursors; the caller's pointers are left untouched. */
288 float* cPtr = cVector;
289 const float* aPtr = aVector;
290 const float* bPtr = bVector;
292 __m512 aVal, bVal, cVal;
293 for (; number < sixteenthPoints; number++) {
/* _mm512_loadu_ps is the unaligned load: no alignment requirement. */
294 aVal = _mm512_loadu_ps(aPtr);
295 bVal = _mm512_loadu_ps(bPtr);
/* Lane-wise quotient a / b for all 16 lanes. */
297 cVal = _mm512_div_ps(aVal, bVal);
/* Unaligned store of the 16 results. */
299 _mm512_storeu_ps(cPtr, cVal);
/*
 * Scalar tail: handles the num_points % 16 elements left after the vector
 * loop. It continues from the current cursors, so it presumably relies on
 * them having been advanced by 16 per iteration above (not visible here).
 */
306 number = sixteenthPoints * 16;
307 for (; number < num_points; number++) {
308 *cPtr++ = (*aPtr++) / (*bPtr++);
315 #include <immintrin.h>
/*
 * Unaligned AVX kernel: element-wise cVector[i] = aVector[i] / bVector[i],
 * 8 floats per vector iteration plus a scalar tail.
 *
 * NOTE(review): the line carrying the function name and the cVector
 * parameter, the braces, and any pointer advances inside the loop are not
 * visible in this listing; only the statements shown are described.
 */
318 const float* aVector,
319 const float* bVector,
320 unsigned int num_points)
322 unsigned int number = 0;
/* Count of complete 8-float groups processed by the vector loop. */
323 const unsigned int eighthPoints = num_points / 8;
/* Working cursors over the output and the two inputs. */
325 float* cPtr = cVector;
326 const float* aPtr = aVector;
327 const float* bPtr = bVector;
329 __m256 aVal, bVal, cVal;
330 for (; number < eighthPoints; number++) {
/* _mm256_loadu_ps is the unaligned load: no alignment requirement. */
331 aVal = _mm256_loadu_ps(aPtr);
332 bVal = _mm256_loadu_ps(bPtr);
/* Lane-wise quotient a / b for all 8 lanes. */
334 cVal = _mm256_div_ps(aVal, bVal);
/* Unaligned store of the 8 results. */
336 _mm256_storeu_ps(cPtr, cVal);
/*
 * Scalar tail for the num_points % 8 leftover elements. It continues from
 * the current cursors, so it presumably relies on them having been advanced
 * by 8 per vector iteration (those statements are not visible here).
 */
343 number = eighthPoints * 8;
344 for (; number < num_points; number++) {
345 *cPtr++ = (*aPtr++) / (*bPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_divide_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:178
static void volk_32f_x2_divide_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:317
static void volk_32f_x2_divide_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:141
static void volk_32f_x2_divide_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:234
static void volk_32f_x2_divide_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:104
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71