Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_64u_byteswap_u_H
54 #define INCLUDED_volk_64u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_SSE2
60 #include <emmintrin.h>
61 
/*
 * Reverses the byte order of each 64-bit value in intsToSwap, in place,
 * using SSE2 with unaligned loads and stores.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Two 64-bit values (one 128-bit register) per iteration.
    for (number = 0; number < halfPoints; number++) {
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);

        // Reverse the bytes inside every 32-bit lane: the outer bytes only
        // need shifting, the two inner bytes need shifting and masking.
        __m128i output =
            _mm_or_si128(_mm_slli_epi32(input, 24), _mm_srli_epi32(input, 24));
        output =
            _mm_or_si128(output, _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask));
        output =
            _mm_or_si128(output, _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask));

        // Exchange the two 32-bit halves of each 64-bit value.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The buffer is accessed as uint64_t (not
    // through a uint32_t pointer) so the access matches the object type.
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);
        intsToSwap[number] = value;
    }
}
110 #endif /* LV_HAVE_SSE2 */
111 
112 
113 #ifdef LV_HAVE_GENERIC
114 
/*
 * Reverses the byte order of each 64-bit value in intsToSwap, in place.
 * Portable C implementation.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        // Operate on the uint64_t element itself rather than on two uint32_t
        // halves, so the buffer is never read through an incompatible type.
        uint64_t value = intsToSwap[point];

        // Swap 32-bit halves, then 16-bit pairs, then adjacent bytes.
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);

        intsToSwap[point] = value;
    }
}
134 #endif /* LV_HAVE_GENERIC */
135 
136 #if LV_HAVE_AVX2
137 #include <immintrin.h>
138 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
139 {
140  unsigned int number = 0;
141 
142  const unsigned int nPerSet = 4;
143  const uint64_t nSets = num_points / nPerSet;
144 
145  uint32_t* inputPtr = (uint32_t*)intsToSwap;
146 
147  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
148  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
149  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
150 
151  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
152 
153  for (; number < nSets; number++) {
154 
155  // Load the 32t values, increment inputPtr later since we're doing it in-place.
156  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
157  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
158 
159  // Store the results
160  _mm256_store_si256((__m256i*)inputPtr, output);
161 
162  /* inputPtr is 32bit so increment twice */
163  inputPtr += 2 * nPerSet;
164  }
165 
166  // Byteswap any remaining points:
167  for (number = nSets * nPerSet; number < num_points; ++number) {
168  uint32_t output1 = *inputPtr;
169  uint32_t output2 = inputPtr[1];
170  uint32_t out1 =
171  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
172  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
173 
174  uint32_t out2 =
175  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
176  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
177  *inputPtr++ = out2;
178  *inputPtr++ = out1;
179  }
180 }
181 
182 #endif /* LV_HAVE_AVX2 */
183 
184 
185 #if LV_HAVE_SSSE3
186 #include <tmmintrin.h>
187 static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191 
192  const unsigned int nPerSet = 2;
193  const uint64_t nSets = num_points / nPerSet;
194 
195  uint32_t* inputPtr = (uint32_t*)intsToSwap;
196 
197  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
198 
199  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
200 
201  for (; number < nSets; number++) {
202 
203  // Load the 32t values, increment inputPtr later since we're doing it in-place.
204  const __m128i input = _mm_load_si128((__m128i*)inputPtr);
205  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
206 
207  // Store the results
208  _mm_store_si128((__m128i*)inputPtr, output);
209 
210  /* inputPtr is 32bit so increment twice */
211  inputPtr += 2 * nPerSet;
212  }
213 
214  // Byteswap any remaining points:
215  for (number = nSets * nPerSet; number < num_points; ++number) {
216  uint32_t output1 = *inputPtr;
217  uint32_t output2 = inputPtr[1];
218  uint32_t out1 =
219  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
220  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
221 
222  uint32_t out2 =
223  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
224  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
225  *inputPtr++ = out2;
226  *inputPtr++ = out1;
227  }
228 }
229 #endif /* LV_HAVE_SSSE3 */
230 
231 
232 #ifdef LV_HAVE_NEONV8
233 #include <arm_neon.h>
234 
235 static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
236 {
237  uint32_t* inputPtr = (uint32_t*)intsToSwap;
238  const unsigned int n4points = num_points / 4;
239  uint8x16x2_t input;
240  uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
241 
242  unsigned int number = 0;
243  for (number = 0; number < n4points; ++number) {
244  __VOLK_PREFETCH(inputPtr + 8);
245  input = vld2q_u8((uint8_t*)inputPtr);
246  input.val[0] = vqtbl1q_u8(input.val[0], idx);
247  input.val[1] = vqtbl1q_u8(input.val[1], idx);
248  vst2q_u8((uint8_t*)inputPtr, input);
249 
250  inputPtr += 8;
251  }
252 
253  for (number = n4points * 4; number < num_points; ++number) {
254  uint32_t output1 = *inputPtr;
255  uint32_t output2 = inputPtr[1];
256 
257  output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
258  ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
259  output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
260  ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
261 
262  *inputPtr++ = output2;
263  *inputPtr++ = output1;
264  }
265 }
266 #else
267 #ifdef LV_HAVE_NEON
268 #include <arm_neon.h>
269 
270 static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
271 {
272  uint32_t* inputPtr = (uint32_t*)intsToSwap;
273  unsigned int number = 0;
274  unsigned int n8points = num_points / 4;
275 
276  uint8x8x4_t input_table;
277  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
278  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
279 
280  /* these magic numbers are used as byte-indices in the LUT.
281  they are pre-computed to save time. A simple C program
282  can calculate them; for example for lookup01:
283  uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
284  for(ii=0; ii < 8; ++ii) {
285  index += ((uint64_t)(*(chars+ii))) << (ii*8);
286  }
287  */
288  int_lookup01 = vcreate_u8(2269495096316185);
289  int_lookup23 = vcreate_u8(146949840772469531);
290  int_lookup45 = vcreate_u8(291630186448622877);
291  int_lookup67 = vcreate_u8(436310532124776223);
292 
293  for (number = 0; number < n8points; ++number) {
294  input_table = vld4_u8((uint8_t*)inputPtr);
295  swapped_int01 = vtbl4_u8(input_table, int_lookup01);
296  swapped_int23 = vtbl4_u8(input_table, int_lookup23);
297  swapped_int45 = vtbl4_u8(input_table, int_lookup45);
298  swapped_int67 = vtbl4_u8(input_table, int_lookup67);
299  vst1_u8((uint8_t*)inputPtr, swapped_int01);
300  vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
301  vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
302  vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
303 
304  inputPtr += 4;
305  }
306 
307  for (number = n8points * 4; number < num_points; ++number) {
308  uint32_t output1 = *inputPtr;
309  uint32_t output2 = inputPtr[1];
310 
311  output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
312  ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
313  output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
314  ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
315 
316  *inputPtr++ = output2;
317  *inputPtr++ = output1;
318  }
319 }
320 #endif /* LV_HAVE_NEON */
321 #endif
322 
323 #endif /* INCLUDED_volk_64u_byteswap_u_H */
324 #ifndef INCLUDED_volk_64u_byteswap_a_H
325 #define INCLUDED_volk_64u_byteswap_a_H
326 
327 #include <inttypes.h>
328 #include <stdio.h>
329 
330 
331 #ifdef LV_HAVE_SSE2
332 #include <emmintrin.h>
333 
/*
 * Reverses the byte order of each 64-bit value in intsToSwap, in place,
 * using SSE2 with aligned loads and stores.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 *             Accessed with _mm_load_si128/_mm_store_si128, so it must
 *             satisfy their 16-byte alignment requirement.
 * num_points: number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Two 64-bit values (one 128-bit register) per iteration.
    for (number = 0; number < halfPoints; number++) {
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);

        // Reverse the bytes inside every 32-bit lane: the outer bytes only
        // need shifting, the two inner bytes need shifting and masking.
        __m128i output =
            _mm_or_si128(_mm_slli_epi32(input, 24), _mm_srli_epi32(input, 24));
        output =
            _mm_or_si128(output, _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask));
        output =
            _mm_or_si128(output, _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask));

        // Exchange the two 32-bit halves of each 64-bit value.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The buffer is accessed as uint64_t (not
    // through a uint32_t pointer) so the access matches the object type.
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);
        intsToSwap[number] = value;
    }
}
382 #endif /* LV_HAVE_SSE2 */
383 
384 #if LV_HAVE_AVX2
385 #include <immintrin.h>
386 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
387 {
388  unsigned int number = 0;
389 
390  const unsigned int nPerSet = 4;
391  const uint64_t nSets = num_points / nPerSet;
392 
393  uint32_t* inputPtr = (uint32_t*)intsToSwap;
394 
395  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
396  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
397  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
398 
399  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
400 
401  for (; number < nSets; number++) {
402  // Load the 32t values, increment inputPtr later since we're doing it in-place.
403  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
404  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
405 
406  // Store the results
407  _mm256_storeu_si256((__m256i*)inputPtr, output);
408 
409  /* inputPtr is 32bit so increment twice */
410  inputPtr += 2 * nPerSet;
411  }
412 
413  // Byteswap any remaining points:
414  for (number = nSets * nPerSet; number < num_points; ++number) {
415  uint32_t output1 = *inputPtr;
416  uint32_t output2 = inputPtr[1];
417  uint32_t out1 =
418  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
419  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
420 
421  uint32_t out2 =
422  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
423  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
424  *inputPtr++ = out2;
425  *inputPtr++ = out1;
426  }
427 }
428 
429 #endif /* LV_HAVE_AVX2 */
430 
431 
432 #if LV_HAVE_SSSE3
433 #include <tmmintrin.h>
434 static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
435  unsigned int num_points)
436 {
437  unsigned int number = 0;
438 
439  const unsigned int nPerSet = 2;
440  const uint64_t nSets = num_points / nPerSet;
441 
442  uint32_t* inputPtr = (uint32_t*)intsToSwap;
443 
444  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
445 
446  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
447 
448  for (; number < nSets; number++) {
449  // Load the 32t values, increment inputPtr later since we're doing it in-place.
450  const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
451  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
452 
453  // Store the results
454  _mm_storeu_si128((__m128i*)inputPtr, output);
455 
456  /* inputPtr is 32bit so increment twice */
457  inputPtr += 2 * nPerSet;
458  }
459 
460  // Byteswap any remaining points:
461  for (number = nSets * nPerSet; number < num_points; ++number) {
462  uint32_t output1 = *inputPtr;
463  uint32_t output2 = inputPtr[1];
464  uint32_t out1 =
465  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
466  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
467 
468  uint32_t out2 =
469  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
470  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
471  *inputPtr++ = out2;
472  *inputPtr++ = out1;
473  }
474 }
475 #endif /* LV_HAVE_SSSE3 */
476 
477 #ifdef LV_HAVE_GENERIC
478 
/*
 * Reverses the byte order of each 64-bit value in intsToSwap, in place.
 * Portable C implementation.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values in the buffer.
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        // Operate on the uint64_t element itself rather than on two uint32_t
        // halves, so the buffer is never read through an incompatible type.
        uint64_t value = intsToSwap[point];

        // Swap 32-bit halves, then 16-bit pairs, then adjacent bytes.
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);

        intsToSwap[point] = value;
    }
}
498 #endif /* LV_HAVE_GENERIC */
499 
500 
501 #endif /* INCLUDED_volk_64u_byteswap_a_H */
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:479
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:187
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:334
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:434
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:62
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:115
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:270
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71