15 #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
18 #include <immintrin.h>
24 const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
56 __m256i sign_bits = _mm256_setzero_si256();
60 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
61 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
62 sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
64 return _mm256_castsi256_ps(sign_bits);
77 llr0 = _mm256_xor_ps(llr0, sign_mask);
78 __m256 dst = _mm256_add_ps(llr0, llr1);
83 const __m256 cplxValue1)
85 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
86 const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0);
87 const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1);
88 const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
89 return _mm256_permutevar8x32_ps(complex_result, idx);
93 const __m256 symbols1,
103 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
104 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
106 return _mm256_mul_ps(norms, scalar);
129 __m256i* max_indices,
130 __m256i* current_indices,
131 __m256i indices_increment)
133 in0 = _mm256_mul_ps(in0, in0);
134 in1 = _mm256_mul_ps(in1, in1);
154 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
165 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
168 *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
179 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
180 _mm256_castsi256_ps(*current_indices),
184 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
191 __m256i* max_indices,
192 __m256i* current_indices,
193 __m256i indices_increment)
195 in0 = _mm256_mul_ps(in0, in0);
196 in1 = _mm256_mul_ps(in1, in1);
198 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
199 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
211 *max_values = _mm256_max_ps(abs_squared, *max_values);
214 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
215 _mm256_castsi256_ps(*current_indices),
218 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
241 __m256i* min_indices,
242 __m256i* current_indices,
243 __m256i indices_increment)
245 in0 = _mm256_mul_ps(in0, in0);
246 in1 = _mm256_mul_ps(in1, in1);
266 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
277 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
280 *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
291 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
292 _mm256_castsi256_ps(*current_indices),
296 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
303 __m256i* min_indices,
304 __m256i* current_indices,
305 __m256i indices_increment)
307 in0 = _mm256_mul_ps(in0, in0);
308 in1 = _mm256_mul_ps(in1, in1);
310 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
311 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
323 *min_values = _mm256_min_ps(abs_squared, *min_values);
326 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
327 _mm256_castsi256_ps(*current_indices),
330 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3391
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition: sse2neon.h:5239
int64x2_t __m128i
Definition: sse2neon.h:244
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:92
static __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
Definition: volk_avx2_intrinsics.h:20
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:188
static __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1)
Definition: volk_avx2_intrinsics.h:82
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:238
static __m256 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx2_intrinsics.h:68
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:126
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:300
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:145