Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
volk_avx_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX intrinsics that are built from other
 * intrinsics. They should be used in VOLK kernels to avoid copy-pasted code.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>

static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);        // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);        // Load yh with ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);       // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);       // tmp2 = ai*ci,ar*ci,bi*di,br*di

    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(tmp1, tmp2);
}
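
/*
 * Usage sketch (hypothetical helper, for illustration only): multiply four
 * interleaved complex float pairs a[k] * b[k]. The helper name and the
 * unaligned load/store pattern are assumptions made for this example.
 */
static inline void
volk_avx_example_complexmul(const float* a, const float* b, float* out)
{
    // a, b and out each hold 4 complex floats laid out as re,im,re,im,...
    __m256 va = _mm256_loadu_ps(a);
    __m256 vb = _mm256_loadu_ps(b);
    _mm256_storeu_ps(out, _mm256_complexmul_ps(va, vb));
}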

static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // conjugate x by flipping its imaginary sign bits
}

static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 nswap = _mm256_permute_ps(x, 0xb1);
    const __m256 dreal = _mm256_moveldup_ps(y);
    const __m256 dimag = _mm256_movehdup_ps(y);

    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
    const __m256 multreal = _mm256_mul_ps(x, dreal);
    const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
    return _mm256_add_ps(multreal, multimag);
}
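
/*
 * Per complex lane the function above evaluates to
 *   (ar*cr + ai*ci) + j*(ai*cr - ar*ci),
 * i.e. x multiplied by the conjugate of y.
 */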

static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val);
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1);
}
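
/*
 * The function above treats each adjacent (re, im) pair of 'val' as one
 * complex float and scales it to unit magnitude, i.e. z -> z / |z|.
 */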

static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
}
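
/*
 * Usage sketch (hypothetical helper, for illustration only): compute |z|^2 for
 * 8 consecutive interleaved complex floats, 16 floats in and 8 floats out, with
 * the results in input order.
 */
static inline void volk_avx_example_magnitudesquared(const float* in, float* out)
{
    __m256 c0 = _mm256_loadu_ps(in);     // complex values 0..3
    __m256 c1 = _mm256_loadu_ps(in + 8); // complex values 4..7
    _mm256_storeu_ps(out, _mm256_magnitudesquared_ps(c0, c1));
}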

static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}

static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
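
/*
 * Usage sketch (hypothetical helper, for illustration only): scaled squared
 * Euclidean distance |y - x|^2 * scale for 8 complex symbol/point pairs.
 */
static inline void volk_avx_example_scaled_norm_dist(const float* symbols,
                                                     const float* points,
                                                     float scale,
                                                     float* out)
{
    const __m256 vscale = _mm256_set1_ps(scale);
    const __m256 s0 = _mm256_loadu_ps(symbols);     // symbols 0..3
    const __m256 s1 = _mm256_loadu_ps(symbols + 8); // symbols 4..7
    const __m256 p0 = _mm256_loadu_ps(points);
    const __m256 p1 = _mm256_loadu_ps(points + 8);
    _mm256_storeu_ps(out, _mm256_scaled_norm_dist_ps(s0, s1, p0, p1, vscale));
}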

static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    __m256 sign_mask_dummy = _mm256_setzero_ps();
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m128i shuffle_mask0 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
    const __m128i shuffle_mask1 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);

    __m256 sign_mask =
        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
    // This is the desired function call, though it seems to be missing in GCC.
    // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
    // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
    //                        _mm_castsi128_ps(sign_bits0));
}
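
/*
 * The function above expands the lowest 8 bytes of 'fbits' into a float sign
 * mask: result lane i has only its sign bit set when byte i of 'fbits' is
 * positive and is zero otherwise, so XOR-ing a vector of LLRs with the mask
 * flips exactly those lanes' signs.
 */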

static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // deinterleave values
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
}
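
/*
 * Viewing src0:src1 as 16 interleaved floats e0..e15, the function above
 * writes the even-indexed elements e0,e2,...,e14 to *llr0 and the odd-indexed
 * elements e1,e3,...,e15 to *llr1.
 */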

static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
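
/*
 * For each deinterleaved LLR pair (a, b) the function above returns the
 * min-sum approximation of the polar f-function:
 *   f(a, b) = sign(a) * sign(b) * min(|a|, |b|).
 */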

static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
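
/*
 * This is the matching g-step: after deinterleaving, each llr0 lane has its
 * sign flipped where the corresponding 'fbits' byte is set, and the pair is
 * summed, i.e. dst = (+/-)llr0 + llr1.
 */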

static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}
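
/*
 * The function above returns sq_acc + rec * (aux * val - acc)^2, the update
 * step of a numerically stable (Youngs-Cramer style) running sum of squared
 * deviations; interpreting the arguments, 'acc' is the running sum of samples,
 * 'aux' the current sample count broadcast as float and 'rec' the matching
 * reciprocal weight.
 */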

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */