15 #ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17 #include <immintrin.h>
21 __m256 yl, yh, tmp1, tmp2;
22 yl = _mm256_moveldup_ps(y);
23 yh = _mm256_movehdup_ps(y);
24 tmp1 = _mm256_mul_ps(x, yl);
25 x = _mm256_shuffle_ps(x, x, 0xB1);
26 tmp2 = _mm256_mul_ps(x, yh);
29 return _mm256_addsub_ps(tmp1, tmp2);
34 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
35 return _mm256_xor_ps(x, conjugator);
40 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
41 const __m256 dreal = _mm256_moveldup_ps(y);
42 const __m256 dimag = _mm256_movehdup_ps(y);
44 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
45 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
46 const __m256 multreal = _mm256_mul_ps(x, dreal);
47 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
48 return _mm256_add_ps(multreal, multimag);
53 __m256 tmp1 = _mm256_mul_ps(
val,
val);
54 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
55 tmp1 = _mm256_shuffle_ps(tmp1, tmp1,
_MM_SHUFFLE(3, 1, 2, 0));
56 tmp1 = _mm256_sqrt_ps(tmp1);
57 return _mm256_div_ps(
val, tmp1);
62 __m256 complex1, complex2;
63 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
64 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
65 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
66 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
67 return _mm256_hadd_ps(complex1, complex2);
76 const __m256 symbols1,
86 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
87 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
89 return _mm256_mul_ps(norms, scalar);
94 __m256 sign_mask_dummy = _mm256_setzero_ps();
148 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
149 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
150 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
151 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
156 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
157 const __m256 abs_mask =
158 _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
165 _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
167 _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
168 return _mm256_or_ps(dst, sign);
180 llr0 = _mm256_xor_ps(llr0, sign_mask);
181 __m256 dst = _mm256_add_ps(llr0, llr1);
186 __m256 sq_acc, __m256 acc, __m256
val, __m256 rec, __m256 aux)
188 aux = _mm256_mul_ps(aux,
val);
189 aux = _mm256_sub_ps(aux, acc);
190 aux = _mm256_mul_ps(aux, aux);
191 aux = _mm256_mul_ps(aux, rec);
192 return _mm256_add_ps(sq_acc, aux);
val
Definition: volk_arch_defs.py:57
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3391
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition: sse2neon.h:5239
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition: sse2neon.h:5293
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:60
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:70
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:145
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:185
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:19
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:154
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:32
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:51
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:75
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:92
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:171