#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    /* branch-free log2 for power-of-two inputs */
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            /* butterfly: u ^ v goes to the first half, v to the second */
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
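/* Example with num_branches == 1 and frame_half == 2: temp = [u0, v0, u1, v1]
 * yields frame = [u0 ^ v0, u1 ^ v1, v0, v1]. */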
#ifdef LV_HAVE_GENERIC
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
#endif /* LV_HAVE_GENERIC */
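/* Usage sketch (hypothetical buffers; frame_size must be a power of two, and
 * temp must hold the input bits, since the first stage reads from temp):
 *
 *     unsigned char frame[8];
 *     unsigned char temp[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };
 *     volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);
 *
 * temp is clobbered as scratch; frame receives the encoded frame. */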
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    __m128i r_frame0, r_temp0, shifted;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
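    /* shuffle_separate gathers the even-indexed bytes of a register into its
     * low half and the odd-indexed bytes into its high half, so that the u
     * and v outputs of a butterfly stage can each be stored contiguously. */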
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                /* ... (XOR adjacent pairs, de-interleave via shuffle_separate,
                 * store the u half and the v half; body elided) ... */
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
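    /* The byte order 0, 8, 4, 12, ... above is the 4-bit bit-reversal
     * permutation of the indices 0..15: it lines up the operands of the four
     * remaining butterfly stages so they reduce to byte shifts and XORs. */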
    for (branch = 0; branch < num_branches; ++branch) {
        /* ... (load, shuffle_stage4, then the XOR cascade with byte shifts of
         * 8, 4, 2 and 1, as spelled out in the AVX2 kernels below) ... */
    }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;
    /* 0xFF at even byte positions: after a one-byte shift this keeps exactly
     * the v operand that lands on each u position */
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) {
                    /* ... (16-byte SSE tail using r_temp2, shifted2, r_frame2
                     * and shuffle_separate128, then continue) ... */
                    continue;
                }
                r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
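                /* After shuffle_separate each 128-bit lane holds its 8 u bytes
                 * followed by its 8 v bytes; the unpack below gathers the u
                 * and v quadwords of both registers, and permute4x64 with 0xd8
                 * (lane order 0, 2, 1, 3) restores ascending byte order. */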
                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
    /* constants for the last four in-register stages; each 32-entry constant
     * repeats the 16-byte pattern of the SSSE3 kernels in both lanes */
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    /* mask_stage4/3/2: 0xFF on the lower half of each 16-, 8- and 4-byte group
     * (remaining arguments elided in this listing) */
    const __m256i mask_stage4 = _mm256_set_epi8(0x0, /* ... */);
    const __m256i mask_stage3 = _mm256_set_epi8(0x0, /* ... */);
    const __m256i mask_stage2 = _mm256_set_epi8(0x0, /* ... */);
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
        /* stage 4: XOR across the 8-byte halves of each 16-byte group */
        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        /* stage 3: XOR across 4-byte halves */
        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        /* stage 2: XOR across 2-byte halves */
        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        /* stage 1: XOR adjacent bytes */
        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
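        /* Together with shuffle_stage4 above, these four shift/mask/XOR rounds
         * apply the last four butterfly stages of each 16-byte sub-frame
         * entirely in registers; only one load and one store per 32 bytes of
         * frame remain. */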
        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
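/* The _a_ kernels below mirror the unaligned _u_ kernels above; the only
 * difference is that memory accesses use the aligned load/store intrinsics
 * (_mm_load_si128, _mm256_load_si256, ...), which require 16- respectively
 * 32-byte aligned buffers. */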
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    __m128i r_frame0, r_temp0, shifted;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                /* ... (same butterfly/de-interleave as the unaligned version,
                 * with _mm_load_si128/_mm_store_si128; body elided) ... */
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    for (branch = 0; branch < num_branches; ++branch) {
        /* ... (shuffle_stage4 then the XOR cascade, as in the AVX2 kernel
         * below) ... */
    }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) {
                    /* ... (16-byte SSE tail using r_temp2, shifted2, r_frame2
                     * and shuffle_separate128, then continue) ... */
                    continue;
                }
                r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    /* mask_stage4/3/2: 0xFF on the lower half of each 16-, 8- and 4-byte group
     * (remaining arguments elided in this listing) */
    const __m256i mask_stage4 = _mm256_set_epi8(0x0, /* ... */);
    const __m256i mask_stage3 = _mm256_set_epi8(0x0, /* ... */);
    const __m256i mask_stage2 = _mm256_set_epi8(0x0, /* ... */);
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);
        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */