Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32u_reverse_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
30 #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
/*
 * A 32-bit word viewed as 32 individual one-bit fields, b00 .. b31.
 *
 * The fields are declared `unsigned int` on purpose: whether a plain `int`
 * bit-field is signed is implementation-defined, and a signed one-bit field
 * can only represent 0 and -1 (never 1). With an explicit unsigned type each
 * field holds exactly 0 or 1 on every compiler.
 *
 * Which end of the word b00 maps to is implementation-defined; the reversal
 * kernel using this struct only relies on b00..b31 being laid out in one
 * monotonic order.
 */
struct dword_split {
    unsigned int b00 : 1;
    unsigned int b01 : 1;
    unsigned int b02 : 1;
    unsigned int b03 : 1;
    unsigned int b04 : 1;
    unsigned int b05 : 1;
    unsigned int b06 : 1;
    unsigned int b07 : 1;
    unsigned int b08 : 1;
    unsigned int b09 : 1;
    unsigned int b10 : 1;
    unsigned int b11 : 1;
    unsigned int b12 : 1;
    unsigned int b13 : 1;
    unsigned int b14 : 1;
    unsigned int b15 : 1;
    unsigned int b16 : 1;
    unsigned int b17 : 1;
    unsigned int b18 : 1;
    unsigned int b19 : 1;
    unsigned int b20 : 1;
    unsigned int b21 : 1;
    unsigned int b22 : 1;
    unsigned int b23 : 1;
    unsigned int b24 : 1;
    unsigned int b25 : 1;
    unsigned int b26 : 1;
    unsigned int b27 : 1;
    unsigned int b28 : 1;
    unsigned int b29 : 1;
    unsigned int b30 : 1;
    unsigned int b31 : 1;
};
/* A single byte viewed as eight one-bit uint8_t fields, b00 .. b07. */
struct char_split {
    uint8_t b00 : 1, b01 : 1, b02 : 1, b03 : 1;
    uint8_t b04 : 1, b05 : 1, b06 : 1, b07 : 1;
};
75 
76 // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
77 // http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
/*
 * Lookup table: entry i is the byte i with its eight bits in reverse order.
 *
 * The 256 entries are produced by three nested helper macros. Each level
 * fixes two more bits of the index and adds the mirrored weight of those
 * bits to the value (index bit 0 -> 0x80, bit 1 -> 0x40, ... bit 7 -> 0x01).
 * The helpers are undefined again right after the table.
 */
#define VOLK_BITREV_R2(n) (n), (n) + 2 * 64, (n) + 1 * 64, (n) + 3 * 64
#define VOLK_BITREV_R4(n)                                                \
    VOLK_BITREV_R2(n), VOLK_BITREV_R2((n) + 2 * 16),                     \
        VOLK_BITREV_R2((n) + 1 * 16), VOLK_BITREV_R2((n) + 3 * 16)
#define VOLK_BITREV_R6(n)                                                \
    VOLK_BITREV_R4(n), VOLK_BITREV_R4((n) + 2 * 4),                      \
        VOLK_BITREV_R4((n) + 1 * 4), VOLK_BITREV_R4((n) + 3 * 4)
static const unsigned char BitReverseTable256[] = {
    VOLK_BITREV_R6(0), VOLK_BITREV_R6(2), VOLK_BITREV_R6(1), VOLK_BITREV_R6(3)
};
#undef VOLK_BITREV_R6
#undef VOLK_BITREV_R4
#undef VOLK_BITREV_R2
99 #ifdef LV_HAVE_GENERIC
100 static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
101  const uint32_t* in,
102  unsigned int num_points)
103 {
104  const struct dword_split* in_ptr = (const struct dword_split*)in;
105  struct dword_split* out_ptr = (struct dword_split*)out;
106  unsigned int number = 0;
107  for (; number < num_points; ++number) {
108  out_ptr->b00 = in_ptr->b31;
109  out_ptr->b01 = in_ptr->b30;
110  out_ptr->b02 = in_ptr->b29;
111  out_ptr->b03 = in_ptr->b28;
112  out_ptr->b04 = in_ptr->b27;
113  out_ptr->b05 = in_ptr->b26;
114  out_ptr->b06 = in_ptr->b25;
115  out_ptr->b07 = in_ptr->b24;
116  out_ptr->b08 = in_ptr->b23;
117  out_ptr->b09 = in_ptr->b22;
118  out_ptr->b10 = in_ptr->b21;
119  out_ptr->b11 = in_ptr->b20;
120  out_ptr->b12 = in_ptr->b19;
121  out_ptr->b13 = in_ptr->b18;
122  out_ptr->b14 = in_ptr->b17;
123  out_ptr->b15 = in_ptr->b16;
124  out_ptr->b16 = in_ptr->b15;
125  out_ptr->b17 = in_ptr->b14;
126  out_ptr->b18 = in_ptr->b13;
127  out_ptr->b19 = in_ptr->b12;
128  out_ptr->b20 = in_ptr->b11;
129  out_ptr->b21 = in_ptr->b10;
130  out_ptr->b22 = in_ptr->b09;
131  out_ptr->b23 = in_ptr->b08;
132  out_ptr->b24 = in_ptr->b07;
133  out_ptr->b25 = in_ptr->b06;
134  out_ptr->b26 = in_ptr->b05;
135  out_ptr->b27 = in_ptr->b04;
136  out_ptr->b28 = in_ptr->b03;
137  out_ptr->b29 = in_ptr->b02;
138  out_ptr->b30 = in_ptr->b01;
139  out_ptr->b31 = in_ptr->b00;
140  ++in_ptr;
141  ++out_ptr;
142  }
143 }
144 #endif /* LV_HAVE_GENERIC */
145 
146 #ifdef LV_HAVE_GENERIC
147 static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
148  const uint32_t* in,
149  unsigned int num_points)
150 {
151  const uint32_t* in_ptr = in;
152  uint32_t* out_ptr = out;
153  unsigned int number = 0;
154  for (; number < num_points; ++number) {
155  const struct char_split* in8 = (const struct char_split*)in_ptr;
156  struct char_split* out8 = (struct char_split*)out_ptr;
157 
158  out8[3].b00 = in8[0].b07;
159  out8[3].b01 = in8[0].b06;
160  out8[3].b02 = in8[0].b05;
161  out8[3].b03 = in8[0].b04;
162  out8[3].b04 = in8[0].b03;
163  out8[3].b05 = in8[0].b02;
164  out8[3].b06 = in8[0].b01;
165  out8[3].b07 = in8[0].b00;
166 
167  out8[2].b00 = in8[1].b07;
168  out8[2].b01 = in8[1].b06;
169  out8[2].b02 = in8[1].b05;
170  out8[2].b03 = in8[1].b04;
171  out8[2].b04 = in8[1].b03;
172  out8[2].b05 = in8[1].b02;
173  out8[2].b06 = in8[1].b01;
174  out8[2].b07 = in8[1].b00;
175 
176  out8[1].b00 = in8[2].b07;
177  out8[1].b01 = in8[2].b06;
178  out8[1].b02 = in8[2].b05;
179  out8[1].b03 = in8[2].b04;
180  out8[1].b04 = in8[2].b03;
181  out8[1].b05 = in8[2].b02;
182  out8[1].b06 = in8[2].b01;
183  out8[1].b07 = in8[2].b00;
184 
185  out8[0].b00 = in8[3].b07;
186  out8[0].b01 = in8[3].b06;
187  out8[0].b02 = in8[3].b05;
188  out8[0].b03 = in8[3].b04;
189  out8[0].b04 = in8[3].b03;
190  out8[0].b05 = in8[3].b02;
191  out8[0].b06 = in8[3].b01;
192  out8[0].b07 = in8[3].b00;
193  ++in_ptr;
194  ++out_ptr;
195  }
196 }
197 #endif /* LV_HAVE_GENERIC */
198 
199 // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
200 // http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
201 #ifdef LV_HAVE_GENERIC
202 static inline void
203 volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
204 {
205  const uint32_t* in_ptr = in;
206  uint32_t* out_ptr = out;
207  unsigned int number = 0;
208  for (; number < num_points; ++number) {
209  *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
210  (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
211  (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
212  (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
213  ++in_ptr;
214  ++out_ptr;
215  }
216 }
217 #endif /* LV_HAVE_GENERIC */
218 
219 // Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
220 // domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
221 #ifdef LV_HAVE_GENERIC
/*
 * Reverse the bit order of each 32-bit word using the 64-bit
 * multiply / mask / multiply byte-reversal trick.
 *
 * For every word, input byte k is bit-reversed and stored at output byte
 * 3 - k, which together reverses all 32 bits.
 *
 * out        - destination buffer, num_points words
 * in         - source buffer, num_points words
 * num_points - number of 32-bit words to process
 *
 * All four input bytes are loaded before the first store so that the result
 * is also correct when out and in refer to the same buffer (otherwise bytes
 * 2 and 3 would be read after they had been overwritten).
 */
static inline void
volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint8_t* in8 = (const uint8_t*)in_ptr;
        uint8_t* out8 = (uint8_t*)out_ptr;
        /* snapshot of the input word, byte by byte */
        const uint8_t b0 = in8[0];
        const uint8_t b1 = in8[1];
        const uint8_t b2 = in8[2];
        const uint8_t b3 = in8[3];

        /* the low 8 bits of the shifted product are the reversed byte */
        out8[3] =
            (uint8_t)(((b0 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
        out8[2] =
            (uint8_t)(((b1 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
        out8[1] =
            (uint8_t)(((b2 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
        out8[0] =
            (uint8_t)(((b3 * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
        ++in_ptr;
        ++out_ptr;
    }
}
241 #endif /* LV_HAVE_GENERIC */
242 
243 #ifdef LV_HAVE_GENERIC
/*
 * Current gr-pager implementation.
 *
 * Reverse the bit order of each 32-bit word using the multiply / mask /
 * modulo-1023 byte-reversal trick.
 *
 * For every word, input byte k is bit-reversed and stored at output byte
 * 3 - k, which together reverses all 32 bits.
 *
 * out        - destination buffer, num_points words
 * in         - source buffer, num_points words
 * num_points - number of 32-bit words to process
 *
 * All four input bytes are loaded before the first store so that the result
 * is also correct when out and in refer to the same buffer (otherwise bytes
 * 2 and 3 would be read after they had been overwritten).
 */
static inline void
volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint8_t* in8 = (const uint8_t*)in_ptr;
        uint8_t* out8 = (uint8_t*)out_ptr;
        /* snapshot of the input word, byte by byte */
        const uint8_t b0 = in8[0];
        const uint8_t b1 = in8[1];
        const uint8_t b2 = in8[2];
        const uint8_t b3 = in8[3];

        out8[3] = (uint8_t)((b0 * 0x0202020202ULL & 0x010884422010ULL) % 1023);
        out8[2] = (uint8_t)((b1 * 0x0202020202ULL & 0x010884422010ULL) % 1023);
        out8[1] = (uint8_t)((b2 * 0x0202020202ULL & 0x010884422010ULL) % 1023);
        out8[0] = (uint8_t)((b3 * 0x0202020202ULL & 0x010884422010ULL) % 1023);
        ++in_ptr;
        ++out_ptr;
    }
}
264 #endif /* LV_HAVE_GENERIC */
265 
266 // After lengthy thought and quite a bit of whiteboarding:
267 #ifdef LV_HAVE_GENERIC
/*
 * Reverse the bit order of each 32-bit word with five swap stages, going
 * from the coarsest granularity to the finest: 16-bit halves, bytes,
 * nibbles, bit pairs, single bits.
 *
 * Each stage below the first uses one mask that selects the lower member
 * of every adjacent pair of groups; the masked groups move up, the others
 * move down, and the two halves are OR-ed together.
 *
 * out        - destination buffer, num_points words
 * in         - source buffer, num_points words
 * num_points - number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
                                                                 const uint32_t* in,
                                                                 unsigned int num_points)
{
    const uint32_t low_bytes = 0x00FF00FF;   /* 0xFF | 0xFF << 16 */
    const uint32_t low_nibbles = 0x0F0F0F0F; /* 0xF in every byte */
    const uint32_t low_pairs = 0x33333333;   /* 0b0011 in every nibble */
    const uint32_t low_bits = 0x55555555;    /* 0b01 in every bit pair */
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        uint32_t word = in[idx];

        /* halves: upper 16 bits go down, lower 16 bits go up */
        word = (word >> 16) | (word << 16);
        /* bytes within each half */
        word = ((word >> 8) & low_bytes) | ((word & low_bytes) << 8);
        /* nibbles within each byte */
        word = ((word >> 4) & low_nibbles) | ((word & low_nibbles) << 4);
        /* bit pairs within each nibble */
        word = ((word >> 2) & low_pairs) | ((word & low_pairs) << 2);
        /* neighbouring odd/even bits */
        word = ((word >> 1) & low_bits) | ((word & low_bits) << 1);

        out[idx] = word;
    }
}
307 #endif /* LV_HAVE_GENERIC */
308 #ifdef LV_HAVE_GENERIC
/*
 * Reverse the bit order of each 32-bit word with five swap stages, going
 * from the finest granularity to the coarsest: single bits, bit pairs,
 * nibbles, bytes, 16-bit halves. These are the same stages as the top_down
 * variant, applied in the opposite order.
 *
 * out        - destination buffer, num_points words
 * in         - source buffer, num_points words
 * num_points - number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
                                                                  const uint32_t* in,
                                                                  unsigned int num_points)
{
    const uint32_t low_bits = 0x55555555;    /* 0b01 in every bit pair */
    const uint32_t low_pairs = 0x33333333;   /* 0b0011 in every nibble */
    const uint32_t low_nibbles = 0x0F0F0F0F; /* 0xF in every byte */
    const uint32_t low_bytes = 0x00FF00FF;   /* 0xFF | 0xFF << 16 */
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        uint32_t word = in[idx];

        /* neighbouring odd/even bits */
        word = ((word >> 1) & low_bits) | ((word & low_bits) << 1);
        /* bit pairs within each nibble */
        word = ((word >> 2) & low_pairs) | ((word & low_pairs) << 2);
        /* nibbles within each byte */
        word = ((word >> 4) & low_nibbles) | ((word & low_nibbles) << 4);
        /* bytes within each half */
        word = ((word >> 8) & low_bytes) | ((word & low_bytes) << 8);
        /* halves */
        word = (word >> 16) | (word << 16);

        out[idx] = word;
    }
}
331 #endif /* LV_HAVE_GENERIC */
332 
333 #ifdef LV_HAVE_NEONV8
334 #include <arm_neon.h>
335 
336 static inline void
337 volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
338 {
339  const uint32_t* in_ptr = in;
340  uint32_t* out_ptr = out;
341 
342  const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
343 
344  const unsigned int quarterPoints = num_points / 4;
345  unsigned int number = 0;
346  for (; number < quarterPoints; ++number) {
347  __VOLK_PREFETCH(in_ptr + 4);
348  uint32x4_t x = vld1q_u32(in_ptr);
349  uint32x4_t z =
350  vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
351  vst1q_u32(out_ptr, z);
352  in_ptr += 4;
353  out_ptr += 4;
354  }
355  number = quarterPoints * 4;
356  for (; number < num_points; ++number) {
357  *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
358  (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
359  (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
360  (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
361  ++in_ptr;
362  ++out_ptr;
363  }
364 }
365 
366 #else
367 #ifdef LV_HAVE_NEON
368 #include <arm_neon.h>
369 
370 #define DO_RBIT \
371  __VOLK_ASM("rbit %[result], %[value]" \
372  : [result] "=r"(*out_ptr) \
373  : [value] "r"(*in_ptr) \
374  :); \
375  in_ptr++; \
376  out_ptr++;
377 
378 static inline void
379 volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
380 {
381 
382  const uint32_t* in_ptr = in;
383  uint32_t* out_ptr = out;
384  const unsigned int eighthPoints = num_points / 8;
385  unsigned int number = 0;
386  for (; number < eighthPoints; ++number) {
387  __VOLK_PREFETCH(in_ptr + 8);
388  DO_RBIT;
389  DO_RBIT;
390  DO_RBIT;
391  DO_RBIT;
392  DO_RBIT;
393  DO_RBIT;
394  DO_RBIT;
395  DO_RBIT;
396  }
397  number = eighthPoints * 8;
398  for (; number < num_points; ++number) {
399  DO_RBIT;
400  }
401 }
402 #undef DO_RBIT
403 #endif /* LV_HAVE_NEON */
404 #endif /* LV_HAVE_NEONV8 */
405 
406 
407 #endif /* INCLUDED_VOLK_32u_REVERSE_32u_U_H */
Definition: volk_32u_reverse_32u.h:65
uint8_t b02
Definition: volk_32u_reverse_32u.h:68
uint8_t b01
Definition: volk_32u_reverse_32u.h:67
uint8_t b05
Definition: volk_32u_reverse_32u.h:71
uint8_t b07
Definition: volk_32u_reverse_32u.h:73
uint8_t b04
Definition: volk_32u_reverse_32u.h:70
uint8_t b00
Definition: volk_32u_reverse_32u.h:66
uint8_t b06
Definition: volk_32u_reverse_32u.h:72
uint8_t b03
Definition: volk_32u_reverse_32u.h:69
Definition: volk_32u_reverse_32u.h:31
int b10
Definition: volk_32u_reverse_32u.h:42
int b02
Definition: volk_32u_reverse_32u.h:34
int b07
Definition: volk_32u_reverse_32u.h:39
int b16
Definition: volk_32u_reverse_32u.h:48
int b24
Definition: volk_32u_reverse_32u.h:56
int b06
Definition: volk_32u_reverse_32u.h:38
int b09
Definition: volk_32u_reverse_32u.h:41
int b28
Definition: volk_32u_reverse_32u.h:60
int b03
Definition: volk_32u_reverse_32u.h:35
int b11
Definition: volk_32u_reverse_32u.h:43
int b31
Definition: volk_32u_reverse_32u.h:63
int b23
Definition: volk_32u_reverse_32u.h:55
int b29
Definition: volk_32u_reverse_32u.h:61
int b25
Definition: volk_32u_reverse_32u.h:57
int b14
Definition: volk_32u_reverse_32u.h:46
int b15
Definition: volk_32u_reverse_32u.h:47
int b08
Definition: volk_32u_reverse_32u.h:40
int b21
Definition: volk_32u_reverse_32u.h:53
int b27
Definition: volk_32u_reverse_32u.h:59
int b19
Definition: volk_32u_reverse_32u.h:51
int b22
Definition: volk_32u_reverse_32u.h:54
int b30
Definition: volk_32u_reverse_32u.h:62
int b04
Definition: volk_32u_reverse_32u.h:36
int b18
Definition: volk_32u_reverse_32u.h:50
int b17
Definition: volk_32u_reverse_32u.h:49
int b12
Definition: volk_32u_reverse_32u.h:44
int b05
Definition: volk_32u_reverse_32u.h:37
int b00
Definition: volk_32u_reverse_32u.h:32
int b01
Definition: volk_32u_reverse_32u.h:33
int b26
Definition: volk_32u_reverse_32u.h:58
int b13
Definition: volk_32u_reverse_32u.h:45
int b20
Definition: volk_32u_reverse_32u.h:52
static void volk_32u_reverse_32u_1972magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:246
static void volk_32u_reverse_32u_dword_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:100
static void volk_32u_reverse_32u_2001magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:223
static void volk_32u_reverse_32u_lut(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:203
#define DO_RBIT
Definition: volk_32u_reverse_32u.h:370
static const unsigned char BitReverseTable256[]
Definition: volk_32u_reverse_32u.h:78
static void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:309
static void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:268
static void volk_32u_reverse_32u_arm(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:379
static void volk_32u_reverse_32u_byte_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:147
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71