Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
12  */
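/*
 * This kernel runs one full polar-code encoding pass over a frame of
 * frame_size unpacked bits (one bit per byte). 'temp' holds the input
 * bits and is reused as scratch; the encoded frame ends up in 'frame'.
 * frame_size must be a power of two.
 *
 * A minimal usage sketch (buffer setup below is illustrative only and
 * not part of this file; it assumes #include <volk/volk.h>):
 *
 *   unsigned int frame_size = 256;
 *   size_t alignment = volk_get_alignment();
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);
 *   // fill 'temp' with the frame to encode, one bit per byte ...
 *   volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
 *   // 'frame' now holds the encoded frame; 'temp' has been overwritten.
 *   volk_free(frame);
 *   volk_free(temp);
 */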
13 
14 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
15 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
16 #include <string.h>
17 
18 static inline unsigned int log2_of_power_of_2(unsigned int val)
19 {
20  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
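 // Note: this only returns log2(val) when 'val' is a power of two
 // (e.g. 256 -> 8), which holds for valid polar code frame sizes.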
21  static const unsigned int b[] = {
22  0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
23  };
24 
25  unsigned int res = (val & b[0]) != 0;
26  res |= ((val & b[4]) != 0) << 4;
27  res |= ((val & b[3]) != 0) << 3;
28  res |= ((val & b[2]) != 0) << 2;
29  res |= ((val & b[1]) != 0) << 1;
30  return res;
31 }
32 
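/*
 * One encoding stage: within each branch, input pairs (u0, u1) are read
 * from 'temp_ptr'; the first half of the branch in 'frame_ptr' receives
 * u0 ^ u1 and the second half receives u1. E.g. a branch of length 2
 * with temp = {1, 1} yields frame = {0, 1}.
 */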
33 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
34  const unsigned char* temp_ptr,
35  const unsigned int num_branches,
36  const unsigned int frame_half)
37 {
38  unsigned int branch, bit;
39  for (branch = 0; branch < num_branches; ++branch) {
40  for (bit = 0; bit < frame_half; ++bit) {
41  *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
42  *(frame_ptr + frame_half) = *(temp_ptr + 1);
43  ++frame_ptr;
44  temp_ptr += 2;
45  }
46  frame_ptr += frame_half;
47  }
48 }
49 
50 #ifdef LV_HAVE_GENERIC
51 
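/*
 * Reference implementation: performs log2(frame_size) stages. After each
 * stage the partially encoded frame is copied back into 'temp' as input
 * for the next stage, while the number of branches doubles and the
 * half-branch length halves.
 */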
52 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
53  unsigned char* temp,
54  unsigned int frame_size)
55 {
56  unsigned int stage = log2_of_power_of_2(frame_size);
57  unsigned int frame_half = frame_size >> 1;
58  unsigned int num_branches = 1;
59 
60  while (stage) {
61  // encode stage
62  encodepolar_single_stage(frame, temp, num_branches, frame_half);
63  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
64 
65  // update all the parameters.
66  num_branches = num_branches << 1;
67  frame_half = frame_half >> 1;
68  --stage;
69  }
70 }
71 #endif /* LV_HAVE_GENERIC */
72 
73 #ifdef LV_HAVE_SSSE3
74 #include <tmmintrin.h>
75 
76 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
77  unsigned char* temp,
78  unsigned int frame_size)
79 {
80  const unsigned int po2 = log2_of_power_of_2(frame_size);
81 
82  unsigned int stage = po2;
83  unsigned char* frame_ptr = frame;
84  unsigned char* temp_ptr = temp;
85 
86  unsigned int frame_half = frame_size >> 1;
87  unsigned int num_branches = 1;
88  unsigned int branch;
89  unsigned int bit;
90 
91  // prepare constants
92  const __m128i mask_stage1 = _mm_set_epi8(0x0,
93  0xFF,
94  0x0,
95  0xFF,
96  0x0,
97  0xFF,
98  0x0,
99  0xFF,
100  0x0,
101  0xFF,
102  0x0,
103  0xFF,
104  0x0,
105  0xFF,
106  0x0,
107  0xFF);
108 
109  // get some SIMD registers to play with.
110  __m128i r_frame0, r_temp0, shifted;
111 
112  {
113  __m128i r_frame1, r_temp1;
114  const __m128i shuffle_separate =
115  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
116 
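 // Stages with branches of 32 or more elements: 32 input bytes are
 // processed per inner iteration. The shift/mask/xor turns each byte
 // pair (u0, u1) into (u0 ^ u1, u1); shuffle_separate then sorts the
 // XOR results into the first half of the branch and the u1 copies
 // into the second half.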
117  while (stage > 4) {
118  frame_ptr = frame;
119  temp_ptr = temp;
120 
121  // For stage = 5 a branch has 32 elements, so higher stages are even larger.
122  for (branch = 0; branch < num_branches; ++branch) {
123  for (bit = 0; bit < frame_half; bit += 16) {
124  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
125  temp_ptr += 16;
126  r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
127  temp_ptr += 16;
128 
129  shifted = _mm_srli_si128(r_temp0, 1);
130  shifted = _mm_and_si128(shifted, mask_stage1);
131  r_temp0 = _mm_xor_si128(shifted, r_temp0);
132  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
133 
134  shifted = _mm_srli_si128(r_temp1, 1);
135  shifted = _mm_and_si128(shifted, mask_stage1);
136  r_temp1 = _mm_xor_si128(shifted, r_temp1);
137  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
138 
139  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
140  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
141 
142  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
143  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
144  frame_ptr += 16;
145  }
146 
147  frame_ptr += frame_half;
148  }
149  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
150 
151  num_branches = num_branches << 1;
152  frame_half = frame_half >> 1;
153  stage--;
154  }
155  }
156 
157  // This last part requires frames of at least 16 elements (bytes).
158  // Smaller frames are not worth vectorizing anyway. Just choose GENERIC!
159 
160  // reset pointers to correct positions.
161  frame_ptr = frame;
162  temp_ptr = temp;
163 
164  // prefetch first chunk
165  __VOLK_PREFETCH(temp_ptr);
166 
167  const __m128i shuffle_stage4 =
168  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
169  const __m128i mask_stage4 = _mm_set_epi8(0x0,
170  0x0,
171  0x0,
172  0x0,
173  0x0,
174  0x0,
175  0x0,
176  0x0,
177  0xFF,
178  0xFF,
179  0xFF,
180  0xFF,
181  0xFF,
182  0xFF,
183  0xFF,
184  0xFF);
185  const __m128i mask_stage3 = _mm_set_epi8(0x0,
186  0x0,
187  0x0,
188  0x0,
189  0xFF,
190  0xFF,
191  0xFF,
192  0xFF,
193  0x0,
194  0x0,
195  0x0,
196  0x0,
197  0xFF,
198  0xFF,
199  0xFF,
200  0xFF);
201  const __m128i mask_stage2 = _mm_set_epi8(0x0,
202  0x0,
203  0xFF,
204  0xFF,
205  0x0,
206  0x0,
207  0xFF,
208  0xFF,
209  0x0,
210  0x0,
211  0xFF,
212  0xFF,
213  0x0,
214  0x0,
215  0xFF,
216  0xFF);
217 
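 // Remaining stages 4..1: each branch now spans exactly 16 bytes and is
 // processed entirely in one register. shuffle_stage4 applies the
 // bit-reversal permutation once; the shift/mask/xor steps below then
 // perform the four butterfly stages with spans of 8, 4, 2 and 1 bytes.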
218  for (branch = 0; branch < num_branches; ++branch) {
219  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
220 
221  // prefetch next chunk
222  temp_ptr += 16;
223  __VOLK_PREFETCH(temp_ptr);
224 
225  // shuffle once for bit-reversal.
226  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
227 
228  shifted = _mm_srli_si128(r_temp0, 8);
229  shifted = _mm_and_si128(shifted, mask_stage4);
230  r_frame0 = _mm_xor_si128(shifted, r_temp0);
231 
232  shifted = _mm_srli_si128(r_frame0, 4);
233  shifted = _mm_and_si128(shifted, mask_stage3);
234  r_frame0 = _mm_xor_si128(shifted, r_frame0);
235 
236  shifted = _mm_srli_si128(r_frame0, 2);
237  shifted = _mm_and_si128(shifted, mask_stage2);
238  r_frame0 = _mm_xor_si128(shifted, r_frame0);
239 
240  shifted = _mm_srli_si128(r_frame0, 1);
241  shifted = _mm_and_si128(shifted, mask_stage1);
242  r_frame0 = _mm_xor_si128(shifted, r_frame0);
243 
244  // store result of chunk.
245  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
246  frame_ptr += 16;
247  }
248 }
249 
250 #endif /* LV_HAVE_SSSE3 */
251 
252 #ifdef LV_HAVE_AVX2
253 #include <immintrin.h>
254 
255 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
256  unsigned char* temp,
257  unsigned int frame_size)
258 {
259  const unsigned int po2 = log2_of_power_of_2(frame_size);
260 
261  unsigned int stage = po2;
262  unsigned char* frame_ptr = frame;
263  unsigned char* temp_ptr = temp;
264 
265  unsigned int frame_half = frame_size >> 1;
266  unsigned int num_branches = 1;
267  unsigned int branch;
268  unsigned int bit;
269 
270  // prepare constants
271  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
272  0xFF,
273  0x0,
274  0xFF,
275  0x0,
276  0xFF,
277  0x0,
278  0xFF,
279  0x0,
280  0xFF,
281  0x0,
282  0xFF,
283  0x0,
284  0xFF,
285  0x0,
286  0xFF,
287  0x0,
288  0xFF,
289  0x0,
290  0xFF,
291  0x0,
292  0xFF,
293  0x0,
294  0xFF,
295  0x0,
296  0xFF,
297  0x0,
298  0xFF,
299  0x0,
300  0xFF,
301  0x0,
302  0xFF);
303 
304  const __m128i mask_stage0 = _mm_set_epi8(0x0,
305  0xFF,
306  0x0,
307  0xFF,
308  0x0,
309  0xFF,
310  0x0,
311  0xFF,
312  0x0,
313  0xFF,
314  0x0,
315  0xFF,
316  0x0,
317  0xFF,
318  0x0,
319  0xFF);
320  // get some SIMD registers to play with.
321  __m256i r_frame0, r_temp0, shifted;
322  __m128i r_temp2, r_frame2, shifted2;
323  {
324  __m256i r_frame1, r_temp1;
325  __m128i r_frame3, r_temp3;
326  const __m256i shuffle_separate = _mm256_setr_epi8(0,
327  2,
328  4,
329  6,
330  8,
331  10,
332  12,
333  14,
334  1,
335  3,
336  5,
337  7,
338  9,
339  11,
340  13,
341  15,
342  0,
343  2,
344  4,
345  6,
346  8,
347  10,
348  12,
349  14,
350  1,
351  3,
352  5,
353  7,
354  9,
355  11,
356  13,
357  15);
358  const __m128i shuffle_separate128 =
359  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
360 
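 // Same stage loop as the SSSE3 kernel, but 64 input bytes are handled
 // per inner iteration. When a branch is only 32 elements long, the code
 // falls back to two 128-bit loads and finishes that branch with SSE
 // operations before breaking out of the bit loop.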
361  while (stage > 4) {
362  frame_ptr = frame;
363  temp_ptr = temp;
364 
365  // For stage = 5 a branch has 32 elements, so higher stages are even larger.
366  for (branch = 0; branch < num_branches; ++branch) {
367  for (bit = 0; bit < frame_half; bit += 32) {
368  if ((frame_half - bit) <
369  32) // if only 16 elements remain in this branch, not 32
370  {
371  r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
372  temp_ptr += 16;
373  r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
374  temp_ptr += 16;
375 
376  shifted2 = _mm_srli_si128(r_temp2, 1);
377  shifted2 = _mm_and_si128(shifted2, mask_stage0);
378  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
379  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
380 
381  shifted2 = _mm_srli_si128(r_temp3, 1);
382  shifted2 = _mm_and_si128(shifted2, mask_stage0);
383  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
384  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
385 
386  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
387  _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
388 
389  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
390  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
391  frame_ptr += 16;
392  break;
393  }
394  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
395  temp_ptr += 32;
396  r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
397  temp_ptr += 32;
398 
399  shifted = _mm256_srli_si256(r_temp0, 1); // shifts within each 128-bit lane
400  shifted = _mm256_and_si256(shifted, mask_stage1);
401  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
402  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
403 
404  shifted = _mm256_srli_si256(r_temp1, 1);
405  shifted = _mm256_and_si256(shifted, mask_stage1);
406  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
407  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
408 
409  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
410  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
411  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
412  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
413 
414  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
415 
416  _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
417  frame_ptr += 32;
418  }
419 
420  frame_ptr += frame_half;
421  }
422  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
423 
424  num_branches = num_branches << 1;
425  frame_half = frame_half >> 1;
426  stage--;
427  }
428  }
429 
430  // This last part requires frames of at least 32 elements (bytes).
431  // Smaller frames are not worth vectorizing anyway. Just choose GENERIC!
432 
433  // reset pointers to correct positions.
434  frame_ptr = frame;
435  temp_ptr = temp;
436 
437  // prefetch first chunk
438  __VOLK_PREFETCH(temp_ptr);
439 
440  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
441  8,
442  4,
443  12,
444  2,
445  10,
446  6,
447  14,
448  1,
449  9,
450  5,
451  13,
452  3,
453  11,
454  7,
455  15,
456  0,
457  8,
458  4,
459  12,
460  2,
461  10,
462  6,
463  14,
464  1,
465  9,
466  5,
467  13,
468  3,
469  11,
470  7,
471  15);
472  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
473  0x0,
474  0x0,
475  0x0,
476  0x0,
477  0x0,
478  0x0,
479  0x0,
480  0xFF,
481  0xFF,
482  0xFF,
483  0xFF,
484  0xFF,
485  0xFF,
486  0xFF,
487  0xFF,
488  0x0,
489  0x0,
490  0x0,
491  0x0,
492  0x0,
493  0x0,
494  0x0,
495  0x0,
496  0xFF,
497  0xFF,
498  0xFF,
499  0xFF,
500  0xFF,
501  0xFF,
502  0xFF,
503  0xFF);
504  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
505  0x0,
506  0x0,
507  0x0,
508  0xFF,
509  0xFF,
510  0xFF,
511  0xFF,
512  0x0,
513  0x0,
514  0x0,
515  0x0,
516  0xFF,
517  0xFF,
518  0xFF,
519  0xFF,
520  0x0,
521  0x0,
522  0x0,
523  0x0,
524  0xFF,
525  0xFF,
526  0xFF,
527  0xFF,
528  0x0,
529  0x0,
530  0x0,
531  0x0,
532  0xFF,
533  0xFF,
534  0xFF,
535  0xFF);
536  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
537  0x0,
538  0xFF,
539  0xFF,
540  0x0,
541  0x0,
542  0xFF,
543  0xFF,
544  0x0,
545  0x0,
546  0xFF,
547  0xFF,
548  0x0,
549  0x0,
550  0xFF,
551  0xFF,
552  0x0,
553  0x0,
554  0xFF,
555  0xFF,
556  0x0,
557  0x0,
558  0xFF,
559  0xFF,
560  0x0,
561  0x0,
562  0xFF,
563  0xFF,
564  0x0,
565  0x0,
566  0xFF,
567  0xFF);
568 
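 // Remaining stages 4..1: as in the SSSE3 kernel, but two 16-byte
 // branches are packed into one 256-bit register per iteration (hence
 // num_branches / 2 iterations); the byte shifts act independently
 // within each 128-bit lane.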
569  for (branch = 0; branch < num_branches / 2; ++branch) {
570  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
571 
572  // prefetch next chunk
573  temp_ptr += 32;
574  __VOLK_PREFETCH(temp_ptr);
575 
576  // shuffle once for bit-reversal.
577  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
578 
579  shifted = _mm256_srli_si256(r_temp0, 8); // shifts within each 128-bit lane
580  shifted = _mm256_and_si256(shifted, mask_stage4);
581  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
582 
583 
584  shifted = _mm256_srli_si256(r_frame0, 4);
585  shifted = _mm256_and_si256(shifted, mask_stage3);
586  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
587 
588  shifted = _mm256_srli_si256(r_frame0, 2);
589  shifted = _mm256_and_si256(shifted, mask_stage2);
590  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
591 
592  shifted = _mm256_srli_si256(r_frame0, 1);
593  shifted = _mm256_and_si256(shifted, mask_stage1);
594  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
595 
596  // store result of chunk.
597  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
598  frame_ptr += 32;
599  }
600 }
601 #endif /* LV_HAVE_AVX2 */
602 
603 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
604 
605 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
606 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
607 
608 #ifdef LV_HAVE_SSSE3
609 #include <tmmintrin.h>
610 
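/*
 * Aligned variant: same logic as volk_8u_x2_encodeframepolar_8u_u_ssse3,
 * but with aligned loads/stores, so 'frame' and 'temp' must be 16-byte
 * aligned (e.g. allocated with volk_malloc).
 */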
611 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
612  unsigned char* temp,
613  unsigned int frame_size)
614 {
615  const unsigned int po2 = log2_of_power_of_2(frame_size);
616 
617  unsigned int stage = po2;
618  unsigned char* frame_ptr = frame;
619  unsigned char* temp_ptr = temp;
620 
621  unsigned int frame_half = frame_size >> 1;
622  unsigned int num_branches = 1;
623  unsigned int branch;
624  unsigned int bit;
625 
626  // prepare constants
627  const __m128i mask_stage1 = _mm_set_epi8(0x0,
628  0xFF,
629  0x0,
630  0xFF,
631  0x0,
632  0xFF,
633  0x0,
634  0xFF,
635  0x0,
636  0xFF,
637  0x0,
638  0xFF,
639  0x0,
640  0xFF,
641  0x0,
642  0xFF);
643 
644  // get some SIMD registers to play with.
645  __m128i r_frame0, r_temp0, shifted;
646 
647  {
648  __m128i r_frame1, r_temp1;
649  const __m128i shuffle_separate =
650  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
651 
652  while (stage > 4) {
653  frame_ptr = frame;
654  temp_ptr = temp;
655 
656  // For stage = 5 a branch has 32 elements, so higher stages are even larger.
657  for (branch = 0; branch < num_branches; ++branch) {
658  for (bit = 0; bit < frame_half; bit += 16) {
659  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
660  temp_ptr += 16;
661  r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
662  temp_ptr += 16;
663 
664  shifted = _mm_srli_si128(r_temp0, 1);
665  shifted = _mm_and_si128(shifted, mask_stage1);
666  r_temp0 = _mm_xor_si128(shifted, r_temp0);
667  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
668 
669  shifted = _mm_srli_si128(r_temp1, 1);
670  shifted = _mm_and_si128(shifted, mask_stage1);
671  r_temp1 = _mm_xor_si128(shifted, r_temp1);
672  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
673 
674  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
675  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
676 
677  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
678  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
679  frame_ptr += 16;
680  }
681 
682  frame_ptr += frame_half;
683  }
684  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
685 
686  num_branches = num_branches << 1;
687  frame_half = frame_half >> 1;
688  stage--;
689  }
690  }
691 
692  // This last part requires frames of at least 16 elements (bytes).
693  // Smaller frames are not worth vectorizing anyway. Just choose GENERIC!
694 
695  // reset pointers to correct positions.
696  frame_ptr = frame;
697  temp_ptr = temp;
698 
699  // prefetch first chunk
700  __VOLK_PREFETCH(temp_ptr);
701 
702  const __m128i shuffle_stage4 =
703  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
704  const __m128i mask_stage4 = _mm_set_epi8(0x0,
705  0x0,
706  0x0,
707  0x0,
708  0x0,
709  0x0,
710  0x0,
711  0x0,
712  0xFF,
713  0xFF,
714  0xFF,
715  0xFF,
716  0xFF,
717  0xFF,
718  0xFF,
719  0xFF);
720  const __m128i mask_stage3 = _mm_set_epi8(0x0,
721  0x0,
722  0x0,
723  0x0,
724  0xFF,
725  0xFF,
726  0xFF,
727  0xFF,
728  0x0,
729  0x0,
730  0x0,
731  0x0,
732  0xFF,
733  0xFF,
734  0xFF,
735  0xFF);
736  const __m128i mask_stage2 = _mm_set_epi8(0x0,
737  0x0,
738  0xFF,
739  0xFF,
740  0x0,
741  0x0,
742  0xFF,
743  0xFF,
744  0x0,
745  0x0,
746  0xFF,
747  0xFF,
748  0x0,
749  0x0,
750  0xFF,
751  0xFF);
752 
753  for (branch = 0; branch < num_branches; ++branch) {
754  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
755 
756  // prefetch next chunk
757  temp_ptr += 16;
758  __VOLK_PREFETCH(temp_ptr);
759 
760  // shuffle once for bit-reversal.
761  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
762 
763  shifted = _mm_srli_si128(r_temp0, 8);
764  shifted = _mm_and_si128(shifted, mask_stage4);
765  r_frame0 = _mm_xor_si128(shifted, r_temp0);
766 
767  shifted = _mm_srli_si128(r_frame0, 4);
768  shifted = _mm_and_si128(shifted, mask_stage3);
769  r_frame0 = _mm_xor_si128(shifted, r_frame0);
770 
771  shifted = _mm_srli_si128(r_frame0, 2);
772  shifted = _mm_and_si128(shifted, mask_stage2);
773  r_frame0 = _mm_xor_si128(shifted, r_frame0);
774 
775  shifted = _mm_srli_si128(r_frame0, 1);
776  shifted = _mm_and_si128(shifted, mask_stage1);
777  r_frame0 = _mm_xor_si128(shifted, r_frame0);
778 
779  // store result of chunk.
780  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
781  frame_ptr += 16;
782  }
783 }
784 #endif /* LV_HAVE_SSSE3 */
785 
786 #ifdef LV_HAVE_AVX2
787 #include <immintrin.h>
788 
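/*
 * Aligned variant of the AVX2 kernel: 'frame' and 'temp' must be 32-byte
 * aligned since it uses _mm256_load_si256 / _mm256_store_si256.
 */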
789 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
790  unsigned char* temp,
791  unsigned int frame_size)
792 {
793  const unsigned int po2 = log2_of_power_of_2(frame_size);
794 
795  unsigned int stage = po2;
796  unsigned char* frame_ptr = frame;
797  unsigned char* temp_ptr = temp;
798 
799  unsigned int frame_half = frame_size >> 1;
800  unsigned int num_branches = 1;
801  unsigned int branch;
802  unsigned int bit;
803 
804  // prepare constants
805  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
806  0xFF,
807  0x0,
808  0xFF,
809  0x0,
810  0xFF,
811  0x0,
812  0xFF,
813  0x0,
814  0xFF,
815  0x0,
816  0xFF,
817  0x0,
818  0xFF,
819  0x0,
820  0xFF,
821  0x0,
822  0xFF,
823  0x0,
824  0xFF,
825  0x0,
826  0xFF,
827  0x0,
828  0xFF,
829  0x0,
830  0xFF,
831  0x0,
832  0xFF,
833  0x0,
834  0xFF,
835  0x0,
836  0xFF);
837 
838  const __m128i mask_stage0 = _mm_set_epi8(0x0,
839  0xFF,
840  0x0,
841  0xFF,
842  0x0,
843  0xFF,
844  0x0,
845  0xFF,
846  0x0,
847  0xFF,
848  0x0,
849  0xFF,
850  0x0,
851  0xFF,
852  0x0,
853  0xFF);
854  // get some SIMD registers to play with.
855  __m256i r_frame0, r_temp0, shifted;
856  __m128i r_temp2, r_frame2, shifted2;
857  {
858  __m256i r_frame1, r_temp1;
859  __m128i r_frame3, r_temp3;
860  const __m256i shuffle_separate = _mm256_setr_epi8(0,
861  2,
862  4,
863  6,
864  8,
865  10,
866  12,
867  14,
868  1,
869  3,
870  5,
871  7,
872  9,
873  11,
874  13,
875  15,
876  0,
877  2,
878  4,
879  6,
880  8,
881  10,
882  12,
883  14,
884  1,
885  3,
886  5,
887  7,
888  9,
889  11,
890  13,
891  15);
892  const __m128i shuffle_separate128 =
893  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
894 
895  while (stage > 4) {
896  frame_ptr = frame;
897  temp_ptr = temp;
898 
899  // For stage = 5 a branch has 32 elements, so higher stages are even larger.
900  for (branch = 0; branch < num_branches; ++branch) {
901  for (bit = 0; bit < frame_half; bit += 32) {
902  if ((frame_half - bit) <
903  32) // if only 16 elements remain in this branch, not 32
904  {
905  r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
906  temp_ptr += 16;
907  r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
908  temp_ptr += 16;
909 
910  shifted2 = _mm_srli_si128(r_temp2, 1);
911  shifted2 = _mm_and_si128(shifted2, mask_stage0);
912  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
913  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
914 
915  shifted2 = _mm_srli_si128(r_temp3, 1);
916  shifted2 = _mm_and_si128(shifted2, mask_stage0);
917  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
918  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
919 
920  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
921  _mm_store_si128((__m128i*)frame_ptr, r_frame2);
922 
923  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
924  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
925  frame_ptr += 16;
926  break;
927  }
928  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
929  temp_ptr += 32;
930  r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
931  temp_ptr += 32;
932 
933  shifted = _mm256_srli_si256(r_temp0, 1); // shifts within each 128-bit lane
934  shifted = _mm256_and_si256(shifted, mask_stage1);
935  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
936  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
937 
938  shifted = _mm256_srli_si256(r_temp1, 1);
939  shifted = _mm256_and_si256(shifted, mask_stage1);
940  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
941  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
942 
943  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
944  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
945  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
946  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
947 
948  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
949 
950  _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
951  frame_ptr += 32;
952  }
953 
954  frame_ptr += frame_half;
955  }
956  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
957 
958  num_branches = num_branches << 1;
959  frame_half = frame_half >> 1;
960  stage--;
961  }
962  }
963 
964  // This last part requires frames of at least 32 elements (bytes).
965  // Smaller frames are not worth vectorizing anyway. Just choose GENERIC!
966 
967  // reset pointers to correct positions.
968  frame_ptr = frame;
969  temp_ptr = temp;
970 
971  // prefetch first chunk.
972  __VOLK_PREFETCH(temp_ptr);
973 
974  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
975  8,
976  4,
977  12,
978  2,
979  10,
980  6,
981  14,
982  1,
983  9,
984  5,
985  13,
986  3,
987  11,
988  7,
989  15,
990  0,
991  8,
992  4,
993  12,
994  2,
995  10,
996  6,
997  14,
998  1,
999  9,
1000  5,
1001  13,
1002  3,
1003  11,
1004  7,
1005  15);
1006  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1007  0x0,
1008  0x0,
1009  0x0,
1010  0x0,
1011  0x0,
1012  0x0,
1013  0x0,
1014  0xFF,
1015  0xFF,
1016  0xFF,
1017  0xFF,
1018  0xFF,
1019  0xFF,
1020  0xFF,
1021  0xFF,
1022  0x0,
1023  0x0,
1024  0x0,
1025  0x0,
1026  0x0,
1027  0x0,
1028  0x0,
1029  0x0,
1030  0xFF,
1031  0xFF,
1032  0xFF,
1033  0xFF,
1034  0xFF,
1035  0xFF,
1036  0xFF,
1037  0xFF);
1038  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1039  0x0,
1040  0x0,
1041  0x0,
1042  0xFF,
1043  0xFF,
1044  0xFF,
1045  0xFF,
1046  0x0,
1047  0x0,
1048  0x0,
1049  0x0,
1050  0xFF,
1051  0xFF,
1052  0xFF,
1053  0xFF,
1054  0x0,
1055  0x0,
1056  0x0,
1057  0x0,
1058  0xFF,
1059  0xFF,
1060  0xFF,
1061  0xFF,
1062  0x0,
1063  0x0,
1064  0x0,
1065  0x0,
1066  0xFF,
1067  0xFF,
1068  0xFF,
1069  0xFF);
1070  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1071  0x0,
1072  0xFF,
1073  0xFF,
1074  0x0,
1075  0x0,
1076  0xFF,
1077  0xFF,
1078  0x0,
1079  0x0,
1080  0xFF,
1081  0xFF,
1082  0x0,
1083  0x0,
1084  0xFF,
1085  0xFF,
1086  0x0,
1087  0x0,
1088  0xFF,
1089  0xFF,
1090  0x0,
1091  0x0,
1092  0xFF,
1093  0xFF,
1094  0x0,
1095  0x0,
1096  0xFF,
1097  0xFF,
1098  0x0,
1099  0x0,
1100  0xFF,
1101  0xFF);
1102 
1103  for (branch = 0; branch < num_branches / 2; ++branch) {
1104  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1105 
1106  // prefetch next chunk
1107  temp_ptr += 32;
1108  __VOLK_PREFETCH(temp_ptr);
1109 
1110  // shuffle once for bit-reversal.
1111  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1112 
1113  shifted = _mm256_srli_si256(r_temp0, 8); // shifts within each 128-bit lane
1114  shifted = _mm256_and_si256(shifted, mask_stage4);
1115  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1116 
1117  shifted = _mm256_srli_si256(r_frame0, 4);
1118  shifted = _mm256_and_si256(shifted, mask_stage3);
1119  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1120 
1121  shifted = _mm256_srli_si256(r_frame0, 2);
1122  shifted = _mm256_and_si256(shifted, mask_stage2);
1123  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1124 
1125  shifted = _mm256_srli_si256(r_frame0, 1);
1126  shifted = _mm256_and_si256(shifted, mask_stage1);
1127  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1128 
1129  // store result of chunk.
1130  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1131  frame_ptr += 32;
1132  }
1133 }
1134 #endif /* LV_HAVE_AVX2 */
1135 
1136 
1137 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */