Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

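/* decision_t holds the survivor decisions of one trellis step: one bit per
 * state, packed as in BFLY() below. The overlapping byte/short/word views let
 * the SIMD kernels store movemask results into the same record directly. */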
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    //}
}
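/* renormalize() keeps the unsigned 8-bit path metrics from saturating by
 * subtracting the current minimum metric from every state. Note the threshold
 * argument is currently unused: the guard that would skip renormalization
 * below the threshold is commented out above. */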


// helper BFLY (add-compare-select butterfly); used by the GENERIC kernel and
// by the SIMD kernels to process an odd trailing trellis step
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
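/* BFLY computes one Viterbi butterfly of the K=7, rate-1/2 decoder: the
 * branch metric is derived from XORing the received symbol pair with the
 * branch labels, candidate metrics for successor states 2*i and 2*i + 1 are
 * formed from predecessor states i and i + NUMSTATES/2, the smaller candidate
 * survives, and the two decision bits are packed into the decision_t word for
 * step s. */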


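// The AVX2 variant below is kept commented out; it mirrors the SSE3 kernel
// with 32-byte vectors (two loads per metric set instead of four) plus
// cross-lane permutes to restore byte order.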
//#if LV_HAVE_AVX2
//
//#include <immintrin.h>
//#include <stdio.h>
//
// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
//                                                  unsigned char* X,
//                                                  unsigned char* syms,
//                                                  unsigned char* dec,
//                                                  unsigned int framebits,
//                                                  unsigned int excess,
//                                                  unsigned char* Branchtab)
//{
//    unsigned int i9;
//    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
//        unsigned char a75, a81;
//        int a73, a92;
//        int s20, s21;
//        unsigned char *a80, *b6;
//        int *a110, *a91, *a93;
//        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
//        __m256i a86, a87;
//        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
//            m26, s18, s19, s22, s23, s24, s25, t13, t14, t15;
//        a71 = ((__m256i*)X);
//        s18 = *(a71);
//        a72 = (a71 + 1);
//        s19 = *(a72);
//        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
//        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
//        s18 = s22;
//        a73 = (4 * i9);
//        b6 = (syms + a73);
//        a75 = *(b6);
//        a76 = _mm256_set1_epi8(a75);
//        a77 = ((__m256i*)Branchtab);
//        a78 = *(a77);
//        a79 = _mm256_xor_si256(a76, a78);
//        a80 = (b6 + 1);
//        a81 = *(a80);
//        a82 = _mm256_set1_epi8(a81);
//        a83 = (a77 + 1);
//        a84 = *(a83);
//        a85 = _mm256_xor_si256(a82, a84);
//        t13 = _mm256_avg_epu8(a79, a85);
//        a86 = ((__m256i)t13);
//        a87 = _mm256_srli_epi16(a86, 2);
//        a88 = ((__m256i)a87);
//        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
//        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
//        m23 = _mm256_adds_epu8(s18, t14);
//        m24 = _mm256_adds_epu8(s19, t15);
//        m25 = _mm256_adds_epu8(s18, t15);
//        m26 = _mm256_adds_epu8(s19, t14);
//        a89 = _mm256_min_epu8(m24, m23);
//        d9 = _mm256_cmpeq_epi8(a89, m24);
//        a90 = _mm256_min_epu8(m26, m25);
//        d10 = _mm256_cmpeq_epi8(a90, m26);
//        s22 = _mm256_unpacklo_epi8(d9, d10);
//        s23 = _mm256_unpackhi_epi8(d9, d10);
//        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
//        a91 = ((int*)dec);
//        a92 = (4 * i9);
//        a93 = (a91 + a92);
//        *(a93) = s20;
//        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
//        a110 = (a93 + 1);
//        *(a110) = s21;
//        s22 = _mm256_unpacklo_epi8(a89, a90);
//        s23 = _mm256_unpackhi_epi8(a89, a90);
//        a95 = ((__m256i*)Y);
//        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
//        *(a95) = s24;
//        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
//        a112 = (a95 + 1);
//        *(a112) = s23;
//        if ((((unsigned char*)Y)[0] > 210)) {
//            __m256i m5, m6;
//            m5 = ((__m256i*)Y)[0];
//            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
//            __m256i m7;
//            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
//                                           ((__m256i)m7)));
//            m7 = _mm256_unpacklo_epi8(m7, m7);
//            m7 = _mm256_shufflelo_epi16(m7, 0);
//            m6 = _mm256_unpacklo_epi64(m7, m7);
//            m6 = _mm256_permute2x128_si256(
//                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
//                            // operate on 128 bit lanes
//            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
//            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
//        }
//        unsigned char a188, a194;
//        int a205;
//        int s48, s54;
//        unsigned char *a187, *a193;
//        int *a204, *a206, *a223, *b16;
//        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
//        __m256i a199, a200;
//        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
//            m40, m41, m42, s46, s47, s50, s51, t25, t26, t27;
//        a184 = ((__m256i*)Y);
//        s46 = *(a184);
//        a185 = (a184 + 1);
//        s47 = *(a185);
//        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
//        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
//        s46 = s50;
//        a187 = (b6 + 2);
//        a188 = *(a187);
//        a189 = _mm256_set1_epi8(a188);
//        a190 = ((__m256i*)Branchtab);
//        a191 = *(a190);
//        a192 = _mm256_xor_si256(a189, a191);
//        a193 = (b6 + 3);
//        a194 = *(a193);
//        a195 = _mm256_set1_epi8(a194);
//        a196 = (a190 + 1);
//        a197 = *(a196);
//        a198 = _mm256_xor_si256(a195, a197);
//        t25 = _mm256_avg_epu8(a192, a198);
//        a199 = ((__m256i)t25);
//        a200 = _mm256_srli_epi16(a199, 2);
//        a201 = ((__m256i)a200);
//        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
//        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
//        m39 = _mm256_adds_epu8(s46, t26);
//        m40 = _mm256_adds_epu8(s47, t27);
//        m41 = _mm256_adds_epu8(s46, t27);
//        m42 = _mm256_adds_epu8(s47, t26);
//        a202 = _mm256_min_epu8(m40, m39);
//        d17 = _mm256_cmpeq_epi8(a202, m40);
//        a203 = _mm256_min_epu8(m42, m41);
//        d18 = _mm256_cmpeq_epi8(a203, m42);
//        s24 = _mm256_unpacklo_epi8(d17, d18);
//        s25 = _mm256_unpackhi_epi8(d17, d18);
//        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
//        a204 = ((int*)dec);
//        a205 = (4 * i9);
//        b16 = (a204 + a205);
//        a206 = (b16 + 2);
//        *(a206) = s48;
//        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
//        a223 = (b16 + 3);
//        *(a223) = s54;
//        s50 = _mm256_unpacklo_epi8(a202, a203);
//        s51 = _mm256_unpackhi_epi8(a202, a203);
//        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
//        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
//        a208 = ((__m256i*)X);
//        *(a208) = s25;
//        a225 = (a208 + 1);
//        *(a225) = s51;
//
//        if ((((unsigned char*)X)[0] > 210)) {
//            __m256i m12, m13;
//            m12 = ((__m256i*)X)[0];
//            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
//            __m256i m14;
//            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
//                                            ((__m256i)m14)));
//            m14 = _mm256_unpacklo_epi8(m14, m14);
//            m14 = _mm256_shufflelo_epi16(m14, 0);
//            m13 = _mm256_unpacklo_epi64(m14, m14);
//            m13 = _mm256_permute2x128_si256(m13, m13, 0);
//            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
//            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
//        }
//    }
//
//    renormalize(X, 210);
//
//    unsigned int j;
//    for (j = 0; j < (framebits + excess) % 2; ++j) {
//        int i;
//        for (i = 0; i < 64 / 2; i++) {
//            BFLY(i,
//                 (((framebits + excess) >> 1) << 1) + j,
//                 syms,
//                 Y,
//                 X,
//                 (decision_t*)dec,
//                 Branchtab);
//        }
//
//        renormalize(Y, 210);
//    }
//    /*skip*/
//}
//
//#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

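/* The _spiral kernel below appears to be machine-generated (the opaque
 * temporaries such as a71/s18/t13 come from the generator). Each loop
 * iteration decodes two trellis steps, consuming four input symbols, and
 * renormalizes the metrics in place whenever the first metric exceeds 210. */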
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        // First trellis step: metrics X -> Y using syms[4 * i9] and syms[4 * i9 + 1].
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98,
            *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30,
            s18, s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        // Renormalize Y in place if the first metric exceeds the threshold.
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)),
                                        ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)),
                                        ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        // Second trellis step: metrics Y -> X using syms[4 * i9 + 2] and
        // syms[4 * i9 + 3].
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44,
            m45, m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29,
            t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        // Renormalize X in place if the first metric exceeds the threshold.
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    // Handle an odd trailing trellis step with the scalar butterfly.
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include "volk/sse2neon.h"
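/* sse2neon maps the SSE intrinsics used below onto NEON equivalents, so this
 * kernel is the same generated code as the SSE3 version above. */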

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        // First trellis step: metrics X -> Y using syms[4 * i9] and syms[4 * i9 + 1].
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98,
            *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30,
            s18, s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        // Renormalize Y in place if the first metric exceeds the threshold.
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)),
                                        ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)),
                                        ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        // Second trellis step: metrics Y -> X using syms[4 * i9 + 2] and
        // syms[4 * i9 + 3].
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44,
            m45, m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29,
            t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        // Renormalize X in place if the first metric exceeds the threshold.
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    // Handle an odd trailing trellis step with the scalar butterfly.
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

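/* Reference implementation: for each of the (framebits + excess) input symbol
 * pairs, run all NUMSTATES/2 butterflies, renormalize the freshly written
 * metrics, then swap the X and Y metric buffers. */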
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        // swap X and Y so the metrics just written become the input of the
        // next trellis step
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */
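/* Minimal usage sketch (illustrative only, not part of the VOLK API surface;
 * the buffer names below are hypothetical and their contents are the caller's
 * responsibility):
 *
 *     unsigned char X[64] = { 0 }, Y[64] = { 0 };  // path metrics, ping-pong
 *     unsigned char branchtab[64];   // RATE * NUMSTATES / 2 branch labels
 *     // syms: 2 * (framebits + excess) soft symbols; dec: one decision_t per
 *     // decoded step, zero-initialized because BFLY ORs decision bits into it
 *     volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess,
 *                                      branchtab);
 */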

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/