68 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
69 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
/* Reload interval constants: a base value of 512 plus derived half (256) and
 * quarter (128) values.
 * NOTE(review): presumably the number of loop iterations between phase
 * renormalizations in the kernels below; the loops that use these constants
 * are not part of this listing -- confirm against the full header. */
76 #define ROTATOR_RELOAD 512
77 #define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78 #define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
/* Portable C rotator kernel, compiled only when LV_HAVE_GENERIC is defined.
 * NOTE(review): this listing omits lines (the embedded numbering jumps
 * 81 -> 87 -> 93 -> 100), so the head of the signature, the braces and the
 * loop headers are not shown here. */
81 #ifdef LV_HAVE_GENERIC
87 unsigned int num_points)
/* Rotate one sample: write input * current phase to the output, advance both
 * pointers, then advance the phase by multiplying it with phase_inc. */
93 *outVector++ = *inVector++ * (*phase);
94 (*phase) *= phase_inc;
/* Same rotate-and-advance step a second time; presumably the body of a
 * remainder loop -- TODO confirm against the full source. */
100 *outVector++ = *inVector++ * (*phase);
101 (*phase) *= phase_inc;
/* NEON rotator kernel: processes four complex samples per vector step and
 * finishes the remaining samples with scalar code.
 * NOTE(review): this listing omits many lines (embedded numbering jumps, e.g.
 * 113 -> 120 -> 124); the signature head, the declaration of outputVectorPtr,
 * `incr`, `inv_mag`, the main loop headers and the complex multiplies that
 * fill output_vec are not visible here. */
113 #include <arm_neon.h>
120 unsigned int num_points)
124 const lv_32fc_t* inputVectorPtr = inVector;
/* Four phase lanes, all seeded with the caller's current phase. */
126 lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
127 float32x4x2_t input_vec;
128 float32x4x2_t output_vec;
130 unsigned int i = 0, j = 0;
/* Four-iteration setup loop; its body is not shown. Presumably it staggers
 * the four phase lanes and builds `incr` -- TODO confirm. */
133 for (
i = 0;
i < 4; ++
i) {
/* Broadcast `incr` to all four lanes. vld2q_f32 is a de-interleaving load, so
 * for interleaved complex floats val[0] holds the real parts and val[1] the
 * imaginary parts. */
139 const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
140 const float32x4x2_t incr_vec = vld2q_f32((
float*)incrPtr);
141 float32x4x2_t phase_vec = vld2q_f32((
float*)phasePtr);
/* First vector step visible in the listing: load four input samples, store
 * four output samples, advance the output pointer by four samples. */
145 input_vec = vld2q_f32((
float*)inputVectorPtr);
153 vst2q_f32((
float*)outputVectorPtr, output_vec);
155 outputVectorPtr += 4;
/* Scale real and imaginary parts of every phase lane by inv_mag.
 * NOTE(review): inv_mag is computed on omitted lines; presumably the inverse
 * magnitude of each phase, making this a renormalization -- confirm. */
163 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
164 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
/* Second visible vector step, same load / store / advance pattern. */
168 input_vec = vld2q_f32((
float*)inputVectorPtr);
176 vst2q_f32((
float*)outputVectorPtr, output_vec);
178 outputVectorPtr += 4;
/* Same inv_mag scaling of the phase lanes as above. */
188 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
189 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
/* Spill the vector phases back to phasePtr (re-interleaving store) so that
 * lane 0 can drive the scalar tail. */
192 vst2q_f32((
float*)phasePtr, phase_vec);
/* Scalar tail for the num_points % 4 leftover samples: rotate by phasePtr[0]
 * and advance that phase by phase_inc after each sample. */
195 for (
i = 0;
i < num_points % 4;
i++) {
196 *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
197 phasePtr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
201 (*phase) = phasePtr[0];
/* SSE4.1 rotator kernel, `_a_` variant, compiled only when LV_HAVE_SSE4_1 is
 * defined. It keeps two phase lanes and handles an odd trailing sample with
 * scalar code.
 * NOTE(review): the `_a_` presumably marks the aligned-buffer variant, but the
 * loads/stores are not visible: this listing omits most of the body (embedded
 * numbering jumps 228 -> 299) as well as the middle of the signature, the
 * declarations of cPtr/aPtr/incr and the main loops. */
207 #ifdef LV_HAVE_SSE4_1
208 #include <smmintrin.h>
210 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
lv_32fc_t* outVector,
214 unsigned int num_points)
/* Two phase lanes, both seeded with the caller's current phase. */
219 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
221 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
223 for (
i = 0;
i < 2; ++
i) {
224 phase_Ptr[
i] *= incr;
228 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Odd num_points: rotate the one leftover sample with lane 0 and advance that
 * phase by phase_inc. */
299 if (num_points & 1) {
300 *cPtr++ = *aPtr++ * phase_Ptr[0];
301 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
304 (*phase) = phase_Ptr[0];
/* SSE4.1 rotator kernel, `_u_` variant, compiled only when LV_HAVE_SSE4_1 is
 * defined. It keeps two phase lanes and handles an odd trailing sample with
 * scalar code.
 * NOTE(review): the `_u_` presumably marks the unaligned-buffer variant, but
 * the loads/stores are not visible: this listing omits most of the body
 * (embedded numbering jumps 334 -> 405) as well as the middle of the
 * signature, the declarations of cPtr/aPtr/incr and the main loops. */
310 #ifdef LV_HAVE_SSE4_1
311 #include <smmintrin.h>
313 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
lv_32fc_t* outVector,
317 unsigned int num_points)
/* Two phase lanes, both seeded with the caller's current phase. */
322 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
324 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
326 for (
i = 0;
i < 2; ++
i) {
327 phase_Ptr[
i] *= incr;
334 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Odd num_points: rotate the one leftover sample with lane 0 and advance that
 * phase by phase_inc. */
405 if (num_points & 1) {
406 *cPtr++ = *aPtr++ * phase_Ptr[0];
407 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
410 (*phase) = phase_Ptr[0];
/* AVX rotator kernel, aligned variant: sample data is moved with
 * _mm256_load_ps / _mm256_store_ps, which require aPtr and cPtr to be 32-byte
 * aligned. Four complex samples fit in one 256-bit vector.
 * NOTE(review): this listing omits many lines (embedded numbering jumps, e.g.
 * 417 -> 424 -> 429); the signature head, the declarations of cPtr/aPtr/incr,
 * the loop headers, the computation of `z` and any remainder handling are not
 * visible here. */
417 #include <immintrin.h>
424 unsigned int num_points)
/* Four phase lanes, all seeded with the caller's current phase. */
429 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
431 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
433 for (
i = 0;
i < 4; ++
i) {
434 phase_Ptr[
i] *= incr;
438 __m256 aVal, phase_Val, z;
/* The phase array is loaded with the unaligned intrinsic, so no alignment of
 * phase_Ptr is required at this point. */
440 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Per-vector phase increment built from the components of `incr`; the
 * remaining arguments of this call are on omitted lines. */
442 const __m256 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* First visible vector step: aligned load of four input samples, aligned
 * store of the four results held in `z`. */
454 aVal = _mm256_load_ps((
float*)aPtr);
459 _mm256_store_ps((
float*)cPtr, z);
/* Second visible vector step, same aligned load / store pattern. */
468 aVal = _mm256_load_ps((
float*)aPtr);
473 _mm256_store_ps((
float*)cPtr, z);
/* Spill the vector phases and hand lane 0 back to the caller. */
482 _mm256_storeu_ps((
float*)phase_Ptr, phase_Val);
483 (*phase) = phase_Ptr[0];
/* AVX rotator kernel, unaligned variant: all sample data is moved with
 * _mm256_loadu_ps / _mm256_storeu_ps, so aPtr and cPtr need no particular
 * alignment. Four complex samples fit in one 256-bit vector.
 * NOTE(review): this listing omits many lines (embedded numbering jumps, e.g.
 * 491 -> 498 -> 503); the signature head, the declarations of cPtr/aPtr/incr,
 * the loop headers, the computation of `z` and any remainder handling are not
 * visible here. */
491 #include <immintrin.h>
498 unsigned int num_points)
/* Four phase lanes, all seeded with the caller's current phase. */
503 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
505 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
507 for (
i = 0;
i < 4; ++
i) {
508 phase_Ptr[
i] *= incr;
512 __m256 aVal, phase_Val, z;
514 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Per-vector phase increment built from the components of `incr`; the
 * remaining arguments of this call are on omitted lines. */
516 const __m256 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* First visible vector step: unaligned load of four input samples, unaligned
 * store of the four results held in `z`. */
528 aVal = _mm256_loadu_ps((
float*)aPtr);
533 _mm256_storeu_ps((
float*)cPtr, z);
/* Second visible vector step, same unaligned load / store pattern. */
542 aVal = _mm256_loadu_ps((
float*)aPtr);
547 _mm256_storeu_ps((
float*)cPtr, z);
/* Spill the vector phases and hand lane 0 back to the caller. */
556 _mm256_storeu_ps((
float*)phase_Ptr, phase_Val);
557 (*phase) = phase_Ptr[0];
/* AVX + FMA rotator kernel, aligned variant, compiled only when both
 * LV_HAVE_AVX and LV_HAVE_FMA evaluate non-zero. Sample data is moved with
 * _mm256_load_ps / _mm256_store_ps (32-byte alignment required); four complex
 * samples are rotated per vector step and the num_points % 4 leftovers are
 * handled with scalar code.
 * NOTE(review): this listing omits lines (embedded numbering jumps, e.g.
 * 566 -> 570 -> 576 and 605 -> 610); the middle of the signature, the
 * declarations of cPtr/aPtr/incr, the loop headers and the assignments to
 * tmp1 / tmp1p are not visible here. */
563 #if LV_HAVE_AVX && LV_HAVE_FMA
564 #include <immintrin.h>
566 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
lv_32fc_t* outVector,
570 unsigned int num_points)
/* Four phase lanes, all seeded with the caller's current phase.
 * NOTE(review): phase_Ptr is read below with the aligned _mm256_load_ps and
 * written with _mm256_store_ps, so it must be 32-byte aligned; no alignment
 * attribute is visible on this line -- presumably it sits on the omitted
 * line 575; confirm. */
576 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
578 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
580 for (
i = 0;
i < 4; ++
i) {
581 phase_Ptr[
i] *= incr;
585 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
587 phase_Val = _mm256_load_ps((
float*)phase_Ptr);
/* Per-vector phase increment built from the components of `incr`; the
 * remaining arguments of this call are on omitted lines. */
588 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* ---- First visible vector step ---- */
600 aVal = _mm256_load_ps((
float*)aPtr);
/* moveldup duplicates the even (real) elements, movehdup the odd (imaginary)
 * ones: yl/yh come from the phases, ylp/yhp from the increment. */
602 yl = _mm256_moveldup_ps(phase_Val);
603 yh = _mm256_movehdup_ps(phase_Val);
604 ylp = _mm256_moveldup_ps(inc_Val);
605 yhp = _mm256_movehdup_ps(inc_Val);
/* Shuffle 0xB1 swaps the real and imaginary float inside every complex pair;
 * the swapped values are multiplied by the imaginary parts. */
610 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
611 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
612 tmp2 = _mm256_mul_ps(aVal, yh);
613 tmp2p = _mm256_mul_ps(phase_Val, yhp);
/* fmaddsub computes a*b - c in even elements and a*b + c in odd elements.
 * Assuming tmp1 / tmp1p hold the unswapped input / phase (set on omitted
 * lines -- confirm), z is the complex product input * phase and phase_Val
 * becomes phase * increment. */
615 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
616 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
618 _mm256_store_ps((
float*)cPtr, z);
/* Renormalize the phases: square every float, add re^2 + im^2 per complex
 * value (hadd), shuffle 0xD8 so each squared magnitude sits under both the
 * real and the imaginary slot, take the square root and divide. */
623 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
624 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
625 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
626 tmp2 = _mm256_sqrt_ps(tmp1);
627 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* ---- Second visible vector step: same multiply / store sequence ---- */
630 aVal = _mm256_load_ps((
float*)aPtr);
632 yl = _mm256_moveldup_ps(phase_Val);
633 yh = _mm256_movehdup_ps(phase_Val);
634 ylp = _mm256_moveldup_ps(inc_Val);
635 yhp = _mm256_movehdup_ps(inc_Val);
640 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
641 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
642 tmp2 = _mm256_mul_ps(aVal, yh);
643 tmp2p = _mm256_mul_ps(phase_Val, yhp);
645 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
646 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
648 _mm256_store_ps((
float*)cPtr, z);
/* Same magnitude renormalization of the phases as above. */
654 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
655 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
656 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
657 tmp2 = _mm256_sqrt_ps(tmp1);
658 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* Spill the vector phases (aligned store) so lane 0 can drive the scalar
 * tail. */
661 _mm256_store_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail for the num_points % 4 leftover samples: rotate by phase_Ptr[0]
 * and advance that phase by phase_inc after each sample. */
662 for (
i = 0;
i < num_points % 4; ++
i) {
663 *cPtr++ = *aPtr++ * phase_Ptr[0];
664 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
667 (*phase) = phase_Ptr[0];
/* AVX + FMA rotator kernel, unaligned variant, compiled only when both
 * LV_HAVE_AVX and LV_HAVE_FMA evaluate non-zero. All memory traffic uses
 * _mm256_loadu_ps / _mm256_storeu_ps, so no buffer alignment is required;
 * four complex samples are rotated per vector step and the num_points % 4
 * leftovers are handled with scalar code.
 * NOTE(review): this listing omits lines (embedded numbering jumps, e.g.
 * 675 -> 679 -> 684 and 713 -> 718); the middle of the signature, the
 * declarations of cPtr/aPtr/incr, the loop headers and the assignments to
 * tmp1 / tmp1p are not visible here. */
672 #if LV_HAVE_AVX && LV_HAVE_FMA
673 #include <immintrin.h>
675 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
lv_32fc_t* outVector,
679 unsigned int num_points)
/* Four phase lanes, all seeded with the caller's current phase. */
684 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
686 unsigned int i, j = 0;
/* Multiply each lane by `incr`. Presumably `incr` is itself updated on an
 * omitted line of this loop so the lanes get successive phases -- confirm. */
688 for (
i = 0;
i < 4; ++
i) {
689 phase_Ptr[
i] *= incr;
693 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
695 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Per-vector phase increment built from the components of `incr`; the
 * remaining arguments of this call are on omitted lines. */
696 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* ---- First visible vector step ---- */
708 aVal = _mm256_loadu_ps((
float*)aPtr);
/* moveldup duplicates the even (real) elements, movehdup the odd (imaginary)
 * ones: yl/yh come from the phases, ylp/yhp from the increment. */
710 yl = _mm256_moveldup_ps(phase_Val);
711 yh = _mm256_movehdup_ps(phase_Val);
712 ylp = _mm256_moveldup_ps(inc_Val);
713 yhp = _mm256_movehdup_ps(inc_Val);
/* Shuffle 0xB1 swaps the real and imaginary float inside every complex pair;
 * the swapped values are multiplied by the imaginary parts. */
718 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
719 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
720 tmp2 = _mm256_mul_ps(aVal, yh);
721 tmp2p = _mm256_mul_ps(phase_Val, yhp);
/* fmaddsub computes a*b - c in even elements and a*b + c in odd elements.
 * Assuming tmp1 / tmp1p hold the unswapped input / phase (set on omitted
 * lines -- confirm), z is the complex product input * phase and phase_Val
 * becomes phase * increment. */
723 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
724 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
726 _mm256_storeu_ps((
float*)cPtr, z);
/* Renormalize the phases: square every float, add re^2 + im^2 per complex
 * value (hadd), shuffle 0xD8 so each squared magnitude sits under both the
 * real and the imaginary slot, take the square root and divide. */
731 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
732 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
733 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
734 tmp2 = _mm256_sqrt_ps(tmp1);
735 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* ---- Second visible vector step: same multiply / store sequence ---- */
738 aVal = _mm256_loadu_ps((
float*)aPtr);
740 yl = _mm256_moveldup_ps(phase_Val);
741 yh = _mm256_movehdup_ps(phase_Val);
742 ylp = _mm256_moveldup_ps(inc_Val);
743 yhp = _mm256_movehdup_ps(inc_Val);
748 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
749 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
750 tmp2 = _mm256_mul_ps(aVal, yh);
751 tmp2p = _mm256_mul_ps(phase_Val, yhp);
753 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
754 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
756 _mm256_storeu_ps((
float*)cPtr, z);
/* Same magnitude renormalization of the phases as above. */
762 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
763 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
764 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
765 tmp2 = _mm256_sqrt_ps(tmp1);
766 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* Spill the vector phases so lane 0 can drive the scalar tail. */
769 _mm256_storeu_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail for the num_points % 4 leftover samples: rotate by phase_Ptr[0]
 * and advance that phase by phase_inc after each sample. */
770 for (
i = 0;
i < num_points % 4; ++
i) {
771 *cPtr++ = *aPtr++ * phase_Ptr[0];
772 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
775 (*phase) = phase_Ptr[0];
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
Definition: sse2neon.h:6611
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6496
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
Definition: sse2neon.h:6627
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t phase_inc, lv_32fc_t *phase, unsigned int num_points)
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:116
static void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t phase_inc, lv_32fc_t *phase, unsigned int num_points)
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:494
#define ROTATOR_RELOAD_4
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:78
#define ROTATOR_RELOAD_2
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:77
#define ROTATOR_RELOAD
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:76
static void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t phase_inc, lv_32fc_t *phase, unsigned int num_points)
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:83
static void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t *outVector, const lv_32fc_t *inVector, const lv_32fc_t phase_inc, lv_32fc_t *phase, unsigned int num_points)
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:420
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:19
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:51
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:83
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:105
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73