Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8u_conv_k7_r2puppet_8u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
11 #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
12 
#include <string.h>
#include <volk/volk.h>
#include <volk/volk_8u_x4_conv_k7_r2_8u.h>
16 
/*
 * Pointer to decision storage, addressable either byte-wise or word-wise.
 * The decisions themselves form a bit vector.
 */
typedef union {
    unsigned char* t; /* byte-granular view */
    unsigned int* w;  /* word-granular view */
} p_decision_t;
22 
/*
 * Look up the parity of x in a caller-supplied table.
 *
 * x      - value whose bits are folded down to a single byte
 * Partab - table with 256 entries; the result is the entry selected by the
 *          folded byte (when Partab[i] holds the parity of byte i, the
 *          result is the parity of all bits of x)
 *
 * Returns Partab[folded byte].
 */
static inline int parity(int x, unsigned char* Partab)
{
    /* Fold the upper bytes onto the low byte with XOR */
    x ^= (x >> 16);
    x ^= (x >> 8);
    /* The folds leave the upper bits of x in place; keep only the low byte
     * so the index can never run past the 256-entry table. */
    return Partab[x & 0xff];
}
29 
/*
 * Walk backwards through stored decision bits (64 states, constraint
 * length 7) and write one decoded bit per output byte.
 *
 * data      - output buffer; receives nbits bytes, each 0 or 1
 * nbits     - number of bits to chain back
 * endstate  - state to start the chainback from (reduced modulo 64)
 * tailsize  - number of leading decision records to skip in `decisions`
 * decisions - decision records, 8 bytes (one bit per state) per step; at
 *             least tailsize + nbits records are read from it
 *
 * Returns the state reached after the first K-1 (= 6) chainback steps. When
 * nbits is smaller than K-1 those steps are not run and the reduced starting
 * state is returned instead (previously the result was indeterminate).
 */
static inline int chainback_viterbi(unsigned char* data,
                                    unsigned int nbits,
                                    unsigned int endstate,
                                    unsigned int tailsize,
                                    unsigned char* decisions)
{
    const int d_ADDSHIFT = 0;
    const int d_numstates = (1 << 6);
    const int d_decision_t_size = d_numstates / 8;
    const unsigned int d_k = 7;
    const int d_framebits = nbits;
    /* Offset applied to the output index before wrapping at the frame size */
    const int dif = tailsize - (d_k - 1);
    /* Look past tail */
    unsigned char* d = decisions + tailsize * d_decision_t_size;
    unsigned int word;
    int k;

    endstate = (endstate % d_numstates) << d_ADDSHIFT;

    /* Defined result even if the first loop below never executes */
    int retval = endstate;

    /* First K-1 steps: identical to the remaining steps, but the running
     * state is also recorded as the return value. */
    while (nbits-- > d_framebits - (d_k - 1)) {
        /* Fetch the 32-bit decision word holding this state's bit. memcpy
         * reads the same native word a direct unsigned int load would, but
         * without requiring the byte buffer to be int-aligned. */
        memcpy(&word,
               d + nbits * d_decision_t_size +
                   sizeof(word) * ((endstate >> d_ADDSHIFT) / 32),
               sizeof(word));
        k = (word >> ((endstate >> d_ADDSHIFT) % 32)) & 1;

        /* Shift the decided bit into the top of the state register */
        endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
        data[(nbits + dif) % d_framebits] = k;

        retval = endstate;
    }
    /* Undo the decrement performed by the failing loop test above */
    nbits += 1;

    /* Remaining steps down to bit 0 */
    while (nbits-- != 0) {
        memcpy(&word,
               d + nbits * d_decision_t_size +
                   sizeof(word) * ((endstate >> d_ADDSHIFT) / 32),
               sizeof(word));
        k = (word >> ((endstate >> d_ADDSHIFT) % 32)) & 1;

        endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
        data[(nbits + dif) % d_framebits] = k;
    }

    return retval >> d_ADDSHIFT;
}
90 
91 
92 #if LV_HAVE_SSE3
93 
94 #include <emmintrin.h>
95 #include <mmintrin.h>
96 #include <pmmintrin.h>
97 #include <stdio.h>
98 #include <xmmintrin.h>
99 
/*
 * Puppet wrapper around volk_8u_x4_conv_k7_r2_8u_spiral: sets up the work
 * buffers and lookup tables, runs the kernel over `syms`, then chains back
 * through the decisions to produce decoded bits in `dec`.
 *
 * syms      - input symbols, forwarded to the kernel unchanged
 * dec       - output; receives framebits / 2 - 6 bytes, each 0 or 1
 * framebits - frame size; frames with framebits / 2 < 6 are ignored because
 *             the unsigned expression framebits / 2 - excess would wrap
 *
 * Work buffers live in function-local statics and are reused across calls.
 * The decision buffer is reallocated when a later call needs more room than
 * an earlier one. If an allocation fails the call returns without decoding.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
                                                      unsigned char* dec,
                                                      unsigned int framebits)
{
    static int once = 1;
    static unsigned char* D;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;
    static unsigned char Partab[256];
    /* Largest framebits the current D allocation was sized for */
    static unsigned int D_framebits = 0;

    int d_numstates = (1 << 6);
    int rate = 2;
    int d_polys[2] = { 79, 109 };

    /* Guard against unsigned wrap-around of framebits / 2 - excess */
    if (framebits / 2 < excess) {
        return;
    }

    if (once) {
        int state, i;
        int cnt, ti;

        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        Branchtab =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (X == NULL || Branchtab == NULL) {
            /* Release whichever half succeeded so the next call retries cleanly */
            if (X != NULL) {
                volk_free(X);
            }
            if (Branchtab != NULL) {
                volk_free(Branchtab);
            }
            X = NULL;
            Branchtab = NULL;
            return;
        }
        /* Y is the second half of the X allocation */
        Y = X + d_numstates;

        /* Initialize parity lookup table */
        for (i = 0; i < 256; i++) {
            cnt = 0;
            ti = i;
            while (ti) {
                if (ti & 1)
                    cnt++;
                ti >>= 1;
            }
            Partab[i] = cnt & 1;
        }
        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* (Re)allocate the decision buffer when this frame needs more room than
     * any earlier call; sizing it only once would overflow on larger frames. */
    if (D == NULL || framebits > D_framebits) {
        if (D != NULL) {
            volk_free(D);
        }
        D = (unsigned char*)volk_malloc((size_t)(d_numstates / 8) *
                                            ((size_t)framebits + 6),
                                        volk_get_alignment());
        if (D == NULL) {
            D_framebits = 0;
            return;
        }
        D_framebits = framebits;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, (size_t)(d_numstates / 8) * ((size_t)framebits + 6));

    volk_8u_x4_conv_k7_r2_8u_spiral(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the chainback from the state with the smallest value in X */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
174 
175 #endif /*LV_HAVE_SSE3*/
176 
177 
178 #if LV_HAVE_NEON
179 
180 #include "volk/sse2neon.h"
181 
/*
 * Puppet wrapper around volk_8u_x4_conv_k7_r2_8u_neonspiral: sets up the
 * work buffers and lookup tables, runs the kernel over `syms`, then chains
 * back through the decisions to produce decoded bits in `dec`.
 *
 * syms      - input symbols, forwarded to the kernel unchanged
 * dec       - output; receives framebits / 2 - 6 bytes, each 0 or 1
 * framebits - frame size; frames with framebits / 2 < 6 are ignored because
 *             the unsigned expression framebits / 2 - excess would wrap
 *
 * Work buffers live in function-local statics and are reused across calls.
 * The decision buffer is reallocated when a later call needs more room than
 * an earlier one. If an allocation fails the call returns without decoding.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms,
                                                          unsigned char* dec,
                                                          unsigned int framebits)
{
    static int once = 1;
    static unsigned char* D;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;
    static unsigned char Partab[256];
    /* Largest framebits the current D allocation was sized for */
    static unsigned int D_framebits = 0;

    int d_numstates = (1 << 6);
    int rate = 2;
    int d_polys[2] = { 79, 109 };

    /* Guard against unsigned wrap-around of framebits / 2 - excess */
    if (framebits / 2 < excess) {
        return;
    }

    if (once) {
        int state, i;
        int cnt, ti;

        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        Branchtab =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (X == NULL || Branchtab == NULL) {
            /* Release whichever half succeeded so the next call retries cleanly */
            if (X != NULL) {
                volk_free(X);
            }
            if (Branchtab != NULL) {
                volk_free(Branchtab);
            }
            X = NULL;
            Branchtab = NULL;
            return;
        }
        /* Y is the second half of the X allocation */
        Y = X + d_numstates;

        /* Initialize parity lookup table */
        for (i = 0; i < 256; i++) {
            cnt = 0;
            ti = i;
            while (ti) {
                if (ti & 1)
                    cnt++;
                ti >>= 1;
            }
            Partab[i] = cnt & 1;
        }
        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* (Re)allocate the decision buffer when this frame needs more room than
     * any earlier call; sizing it only once would overflow on larger frames. */
    if (D == NULL || framebits > D_framebits) {
        if (D != NULL) {
            volk_free(D);
        }
        D = (unsigned char*)volk_malloc((size_t)(d_numstates / 8) *
                                            ((size_t)framebits + 6),
                                        volk_get_alignment());
        if (D == NULL) {
            D_framebits = 0;
            return;
        }
        D_framebits = framebits;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, (size_t)(d_numstates / 8) * ((size_t)framebits + 6));

    volk_8u_x4_conv_k7_r2_8u_neonspiral(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the chainback from the state with the smallest value in X */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
256 
257 #endif /*LV_HAVE_NEON*/
258 
259 
260 //#if LV_HAVE_AVX2
261 //
262 //#include <immintrin.h>
263 //#include <stdio.h>
264 //
265 // static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
266 // unsigned char* dec,
267 // unsigned int framebits)
268 //{
269 //
270 //
271 // static int once = 1;
272 // int d_numstates = (1 << 6);
273 // int rate = 2;
274 // static unsigned char* D;
275 // static unsigned char* Y;
276 // static unsigned char* X;
277 // static unsigned int excess = 6;
278 // static unsigned char* Branchtab;
279 // static unsigned char Partab[256];
280 //
281 // int d_polys[2] = { 79, 109 };
282 //
283 //
284 // if (once) {
285 //
286 // X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
287 // Y = X + d_numstates;
288 // Branchtab =
289 // (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
290 // D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
291 // volk_get_alignment());
292 // int state, i;
293 // int cnt, ti;
294 //
295 // /* Initialize parity lookup table */
296 // for (i = 0; i < 256; i++) {
297 // cnt = 0;
298 // ti = i;
299 // while (ti) {
300 // if (ti & 1)
301 // cnt++;
302 // ti >>= 1;
303 // }
304 // Partab[i] = cnt & 1;
305 // }
306 // /* Initialize the branch table */
307 // for (state = 0; state < d_numstates / 2; state++) {
308 // for (i = 0; i < rate; i++) {
309 // Branchtab[i * d_numstates / 2 + state] =
310 // parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
311 // }
312 // }
313 //
314 // once = 0;
315 // }
316 //
317 // // unbias the old_metrics
318 // memset(X, 31, d_numstates);
319 //
320 // // initialize decisions
321 // memset(D, 0, (d_numstates / 8) * (framebits + 6));
322 //
323 // volk_8u_x4_conv_k7_r2_8u_avx2(
324 // Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
325 //
326 // unsigned int min = X[0];
327 // int i = 0, state = 0;
328 // for (i = 0; i < (d_numstates); ++i) {
329 // if (X[i] < min) {
330 // min = X[i];
331 // state = i;
332 // }
333 // }
334 //
335 // chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
336 //
337 // return;
338 //}
339 //
340 //#endif /*LV_HAVE_AVX2*/
341 
342 
343 #if LV_HAVE_GENERIC
344 
345 
/*
 * Puppet wrapper around volk_8u_x4_conv_k7_r2_8u_generic: sets up the work
 * buffers and lookup tables, runs the kernel over `syms`, then chains back
 * through the decisions to produce decoded bits in `dec`.
 *
 * syms      - input symbols, forwarded to the kernel unchanged
 * dec       - output; receives framebits / 2 - 6 bytes, each 0 or 1
 * framebits - frame size; frames with framebits / 2 < 6 are ignored because
 *             the unsigned expression framebits / 2 - excess would wrap
 *
 * Work buffers live in function-local statics and are reused across calls.
 * The decision buffer is reallocated when a later call needs more room than
 * an earlier one. If an allocation fails the call returns without decoding.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits)
{
    static int once = 1;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned char* D;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;
    static unsigned char Partab[256];
    /* Largest framebits the current D allocation was sized for */
    static unsigned int D_framebits = 0;

    int d_numstates = (1 << 6);
    int rate = 2;
    int d_polys[2] = { 79, 109 };

    /* Guard against unsigned wrap-around of framebits / 2 - excess */
    if (framebits / 2 < excess) {
        return;
    }

    if (once) {
        int state, i;
        int cnt, ti;

        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        Branchtab =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (X == NULL || Branchtab == NULL) {
            /* Release whichever half succeeded so the next call retries cleanly */
            if (X != NULL) {
                volk_free(X);
            }
            if (Branchtab != NULL) {
                volk_free(Branchtab);
            }
            X = NULL;
            Branchtab = NULL;
            return;
        }
        /* Y is the second half of the X allocation */
        Y = X + d_numstates;

        /* Initialize parity lookup table */
        for (i = 0; i < 256; i++) {
            cnt = 0;
            ti = i;
            while (ti) {
                if (ti & 1)
                    cnt++;
                ti >>= 1;
            }
            Partab[i] = cnt & 1;
        }
        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* (Re)allocate the decision buffer when this frame needs more room than
     * any earlier call; sizing it only once would overflow on larger frames. */
    if (D == NULL || framebits > D_framebits) {
        if (D != NULL) {
            volk_free(D);
        }
        D = (unsigned char*)volk_malloc((size_t)(d_numstates / 8) *
                                            ((size_t)framebits + 6),
                                        volk_get_alignment());
        if (D == NULL) {
            D_framebits = 0;
            return;
        }
        D_framebits = framebits;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, (size_t)(d_numstates / 8) * ((size_t)framebits + 6));

    volk_8u_x4_conv_k7_r2_8u_generic(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the chainback from the state with the smallest value in X */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
421 
422 #endif /* LV_HAVE_GENERIC */
423 
424 #endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/
data
Definition: plot_best_vs_generic.py:23
Definition: volk_8u_conv_k7_r2puppet_8u.h:17
unsigned int * w
Definition: volk_8u_conv_k7_r2puppet_8u.h:20
unsigned char * t
Definition: volk_8u_conv_k7_r2puppet_8u.h:19
size_t volk_get_alignment(void)
Get the machine alignment in bytes.
Definition: volk.tmpl.c:90
static void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char *syms, unsigned char *dec, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:182
static void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char *syms, unsigned char *dec, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:346
static void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char *syms, unsigned char *dec, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:100
static int chainback_viterbi(unsigned char *data, unsigned int nbits, unsigned int endstate, unsigned int tailsize, unsigned char *decisions)
Definition: volk_8u_conv_k7_r2puppet_8u.h:30
static int parity(int x, unsigned char *Partab)
Definition: volk_8u_conv_k7_r2puppet_8u.h:23
static void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:330
static void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:626
static void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:920
for i
Definition: volk_config_fixed.tmpl.h:13
__VOLK_DECL_BEGIN VOLK_API void * volk_malloc(size_t size, size_t alignment)
Allocate size bytes of data aligned to alignment.
Definition: volk_malloc.c:38