59 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
60 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
67 #ifdef LV_HAVE_GENERIC
72 unsigned int num_points)
78 for (
unsigned int number = 0; number < num_points; number++) {
79 *cPtr++ = (*aPtr++) / (*bPtr++);
86 #include <pmmintrin.h>
92 unsigned int num_points)
100 unsigned int number = 0;
101 const unsigned int quarterPoints = num_points / 4;
103 __m128 num01, num23, den01, den23, norm, result;
108 for (; number < quarterPoints; number++) {
134 for (; number < num_points; number++) {
145 #include <immintrin.h>
151 unsigned int num_points)
159 unsigned int number = 0;
160 const unsigned int quarterPoints = num_points / 4;
162 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
167 for (; number < quarterPoints; number++) {
168 num = _mm256_loadu_ps(
170 denum = _mm256_loadu_ps(
173 sq = _mm256_mul_ps(denum, denum);
174 mag_sq_un = _mm256_hadd_ps(
176 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
179 div = _mm256_div_ps(mul_conj, mag_sq);
181 _mm256_storeu_ps((
float*)c, div);
188 number = quarterPoints * 4;
190 for (; number < num_points; number++) {
191 *c++ = (*a++) / (*b++);
200 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
201 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
204 #include <inttypes.h>
209 #include <pmmintrin.h>
215 unsigned int num_points)
223 unsigned int number = 0;
224 const unsigned int quarterPoints = num_points / 4;
226 __m128 num01, num23, den01, den23, norm, result;
231 for (; number < quarterPoints; number++) {
258 for (; number < num_points; number++) {
268 #include <immintrin.h>
274 unsigned int num_points)
290 const unsigned int eigthPoints = num_points / 8;
292 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
294 for (
unsigned int number = 0; number < eigthPoints; number++) {
296 num01 = _mm256_load_ps((
float*)a);
297 denum01 = _mm256_load_ps((
float*)b);
303 num23 = _mm256_load_ps((
float*)a);
304 denum23 = _mm256_load_ps((
float*)b);
309 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
310 _mm256_mul_ps(denum23, denum23));
312 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
313 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
315 result0 = _mm256_div_ps(num01, denum01);
316 result1 = _mm256_div_ps(num23, denum23);
318 _mm256_store_ps((
float*)c, result0);
320 _mm256_store_ps((
float*)c, result1);
330 #include <arm_neon.h>
335 unsigned int num_points)
341 float32x4x2_t aVal, bVal, cVal;
342 float32x4_t bAbs, bAbsInv;
344 const unsigned int quarterPoints = num_points / 4;
345 unsigned int number = 0;
346 for (; number < quarterPoints; number++) {
347 aVal = vld2q_f32((
const float*)(aPtr));
348 bVal = vld2q_f32((
const float*)(bPtr));
354 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
355 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
357 bAbsInv = vrecpeq_f32(bAbs);
358 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
359 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
361 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
362 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
363 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
365 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
366 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
367 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
369 vst2q_f32((
float*)(cPtr), cVal);
373 for (number = quarterPoints * 4; number < num_points; number++) {
374 *cPtr++ = (*aPtr++) / (*bPtr++);
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2920
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:212
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:69
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:332
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:148
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:271
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:89
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31