volk_32f_8u_polarbutterfly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#include <math.h>
#include <stdbool.h> // bool in the generic kernel
#include <string.h>  // memcpy in the AVX kernels
#include <volk/volk_8u_x2_encodeframepolar_8u.h>

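/*
 * Min-sum approximation of the polar decoder's f-function (check-node
 * update): f(la, lb) = sign(la) * sign(lb) * min(|la|, |lb|).
 * For example, llr_odd(-2.5f, 1.0f) returns -1.0f.
 */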
static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}

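/*
 * Run llr_odd butterflies for one row across all stages from depth - 1 down
 * to min_stage. 'llrs' is laid out stage-major, frame_size floats per stage;
 * the number of LLRs produced per stage halves with each pass.
 */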
static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage;

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2;
        }

        --loop_stage;
        stage_size >>= 1;
    }
}

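/*
 * Polar decoder's g-function (variable-node update): returns lb + la when
 * the feedback bit f is 0, and lb - la otherwise.
 * For example, llr_even(2.0f, 3.0f, 1) returns 1.0f.
 */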
static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}

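/*
 * Gather every second bit of u (indices 1, 3, 5, ...) into u_even.
 */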
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    u++;
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}

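/*
 * XOR adjacent bit pairs: u_xor[k] = u[2k] ^ u[2k + 1].
 */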
static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}

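/*
 * Return the number of consecutive stages that can be processed for this
 * row, which equals the number of trailing zero bits in 'row', capped at
 * frame_exp - 1 because the last stage holds the received channel values.
 * For example, row = 8 (binary 1000) yields 3 when frame_exp > 4.
 */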
static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
        if (!(row % stage_size < half_stage_size)) {
            break;
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}

#ifdef LV_HAVE_GENERIC

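/*
 * Compute the LLR for bit 'row' of the successive-cancellation decoder.
 * 'llrs' holds (frame_exp + 1) stages of frame_size floats each, with the
 * received channel LLRs in the last stage. Rows in the lower half of a
 * butterfly need one g-function update with a feedback bit from 'u'; rows
 * in the upper half first recurse into the next stage (re-deriving the
 * partial-sum bits via odd_xor_even_values / even_u_values) and then apply
 * the f-function.
 */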
static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // This is a natural bit order implementation.
    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

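/*
 * AVX variant with unaligned loads/stores. Odd rows need only a single
 * g-function update; small butterflies (max_stage_depth < 3) fall back to
 * the generic kernel. Otherwise some decoded bits are re-encoded with
 * volk_8u_x2_encodeframepolar_8u to obtain the feedback bits, and the
 * f-/g-stages are processed eight LLRs at a time.
 */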
static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // For odd rows, a single g-function update suffices.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // The vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // Not needed for row zero, i.e. the first bit to be decoded.
        // First do the bit combination for all stages:
        // effectively encode some decoded bits again.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            // 16-byte load; only the low 8 f-bits are consumed per iteration.
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // Zero all YMM registers to avoid AVX-SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar path.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

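/*
 * AVX2 variant: structurally identical to the AVX kernel above, but uses
 * the AVX2 helper _mm256_polar_fsign_add_llrs_avx2 for the fused
 * sign-select/add step.
 */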
static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // For odd rows, a single g-function update suffices.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // The vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // Not needed for row zero, i.e. the first bit to be decoded.
        // First do the bit combination for all stages:
        // effectively encode some decoded bits again.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            // 16-byte load; only the low 8 f-bits are consumed per iteration.
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // Zero all YMM registers to avoid AVX-SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar path.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX2 */

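/*
 * Usage sketch (illustrative only; the values below are made up). The
 * buffer sizes follow from the code above: 'llrs' needs (frame_exp + 1)
 * stages of frame_size floats, and 'u' needs scratch space past the decoded
 * bits, since the vector kernels write up to u + 2 * frame_size +
 * frame_size / 2.
 *
 *   const int frame_exp = 5;
 *   const int frame_size = 1 << frame_exp;  // 32
 *   float llrs[(5 + 1) * 32] = { 0.0f };    // last stage: channel LLRs
 *   unsigned char u[3 * 32] = { 0 };        // decoded bits + scratch
 *   // ... fill llrs + frame_exp * frame_size with received LLRs ...
 *   volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, 0, 0);
 *   // llrs[0] now holds the decision LLR for the first bit u[0].
 */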
#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */