Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold AVX2 intrinsics of intrinsics, i.e. small
 * helper functions built from raw intrinsics. They should be used in VOLK
 * kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h"
#include <immintrin.h>

static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    /* 0xff clears a byte; indices 0x00-0x07 move flag byte j into the most
     * significant byte of float j (flags 0-3 via the lower 128 bit lane,
     * flags 4-7 via the upper lane). */
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    /* flag byte > 0 -> 0xff, then keep only the sign bit 0x80 */
    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    /* broadcast the 8 flag bytes into both 128 bit lanes, then spread them */
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result: flip the sign of llr0 where a flag is set, then add
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

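/*
 * Usage sketch (illustrative only, not a VOLK kernel): combine pairs of
 * interleaved LLRs under the frozen-bit flags. The names 'llrs', 'fbits',
 * 'out' and 'num_points' are assumptions for this example; only the low
 * 8 bytes of the flag vector are consumed per iteration.
 *
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     __m256 src0 = _mm256_loadu_ps(llrs + 16 * i);
 *     __m256 src1 = _mm256_loadu_ps(llrs + 16 * i + 8);
 *     __m128i fb = _mm_loadl_epi64((const __m128i*)(fbits + 8 * i));
 *     __m256 dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fb);
 *     _mm256_storeu_ps(out + 8 * i, dst);
 * }
 */
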
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    // hadd works per 128 bit lane; the permute restores input order
    return _mm256_permutevar8x32_ps(complex_result, idx);
}

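/*
 * Usage sketch (illustrative only): compute |z|^2 for 8 complex floats per
 * iteration from an interleaved re/im buffer. 'in', 'out' and 'num_points'
 * are assumptions for this example; result j corresponds to input 8*i + j.
 *
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     __m256 c0 = _mm256_loadu_ps(in + 16 * i);     // complex values 0..3
 *     __m256 c1 = _mm256_loadu_ps(in + 16 * i + 8); // complex values 4..7
 *     _mm256_storeu_ps(out + 8 * i, _mm256_magnitudesquared_ps_avx2(c0, c1));
 * }
 */
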
static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}

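/*
 * Scalar reference for a single element (illustrative only; 'y', 'x' and
 * 'snr_lin' are assumed names for one symbol, one point and the scalar):
 *
 * float dr = real(y) - real(x);
 * float di = imag(y) - imag(x);
 * float result = (dr * dr + di * di) * snr_lin;
 */
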
/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) determines
     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an "ordered" comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

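/*
 * Driver sketch (illustrative only, not a VOLK kernel): the hadd ordering
 * dictates the initial index vector; 'src' and 'num_points' are assumptions
 * for this example. A final reduction over the 8 lanes is done in scalar
 * code.
 *
 * __m256 max_values = _mm256_setzero_ps();
 * __m256i max_indices = _mm256_setzero_si256();
 * __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 * const __m256i indices_increment = _mm256_set1_epi32(8);
 * const float* in = (const float*)src; // re/im interleaved
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     __m256 in0 = _mm256_loadu_ps(in + 16 * i);     // complex values 0..3
 *     __m256 in1 = _mm256_loadu_ps(in + 16 * i + 8); // complex values 4..7
 *     vector_32fc_index_max_variant0(in0, in1, &max_values, &max_indices,
 *                                    &current_indices, indices_increment);
 * }
 * // then pick the largest of the 8 lanes of max_values and its index lane
 */
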
/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead
     * of blendvps is faster on the Intel CPUs this was tested on.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the "ordered" comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

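/*
 * Note (illustrative): variant0 and variant1 are interchangeable; the
 * driver sketch above works unchanged with vector_32fc_index_max_variant1.
 */
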
/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8]; // all eight elements initialized to FLT_MAX
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) determines
     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an "ordered" comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

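/*
 * Driver sketch (illustrative only): identical to the max driver above,
 * except that the running values start at FLT_MAX (from <float.h>) and the
 * _min_ variants are called:
 *
 * __m256 min_values = _mm256_set1_ps(FLT_MAX);
 * __m256i min_indices = _mm256_setzero_si256();
 * __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 * const __m256i indices_increment = _mm256_set1_epi32(8);
 */
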
/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead
     * of blendvps is faster on the Intel CPUs this was tested on.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the "ordered" comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */