53 #ifndef INCLUDED_volk_64u_byteswap_u_H
54 #define INCLUDED_volk_64u_byteswap_u_H
60 #include <emmintrin.h>
/* Fragment of volk_64u_byteswap_u_sse2; several lines of the body are not
 * shown in this excerpt. */
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
64 uint32_t* inputPtr = (uint32_t*)intsToSwap;
65 __m128i input, byte1, byte2, byte3, byte4, output;
/* Number of complete pairs of 64-bit values in the buffer. */
69 const unsigned int halfPoints = num_points / 2;
/* Vector loop, one iteration per pair (loop body not shown in this excerpt). */
70 for (; number < halfPoints; number++) {
/* Scalar loop for the values the pair-wise loop did not cover. */
95 number = halfPoints * 2;
96 for (; number < num_points; number++) {
/* Read the two 32-bit halves of the current 64-bit value. */
97 uint32_t output1 = *inputPtr;
98 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
100 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
101 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
103 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
104 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Write the halves back in exchanged positions; together with the per-half
 * reversal this reverses all eight bytes of the 64-bit value. */
106 *inputPtr++ = output2;
107 *inputPtr++ = output1;
113 #ifdef LV_HAVE_GENERIC
116 unsigned int num_points)
/* Fragment of volk_64u_byteswap_generic: in-place byte reversal of num_points
 * 64-bit values using only scalar operations. Some lines are not shown in
 * this excerpt. */
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
118 uint32_t* inputPtr = (uint32_t*)intsToSwap;
120 for (point = 0; point < num_points; point++) {
/* Read the two 32-bit halves of the current 64-bit value. */
121 uint32_t output1 = *inputPtr;
122 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
124 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
125 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
127 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
128 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged positions, which completes the 64-bit
 * reversal and advances the pointer by one 64-bit value. */
130 *inputPtr++ = output2;
131 *inputPtr++ = output1;
137 #include <immintrin.h>
/* Byte-reverses num_points 64-bit values in place, four at a time with
 * 256-bit loads/stores, then handles the leftover values one by one.
 * Uses _mm256_load_si256/_mm256_store_si256 on the data buffer, i.e. the
 * aligned variants of the load/store intrinsics.
 * NOTE: several lines of this function are not shown in this excerpt. */
138 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap,
unsigned int num_points)
140 unsigned int number = 0;
/* Four 64-bit values fit in one 256-bit register. */
142 const unsigned int nPerSet = 4;
143 const uint64_t nSets = num_points / nPerSet;
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
145 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: every run of eight indices is in descending order, so each
 * 8-byte element has its bytes reversed. _mm256_shuffle_epi8 selects within
 * each 128-bit lane from the low four bits of an index, so the entries
 * 16..31 address bytes 0..15 of the upper lane. */
147 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
148 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
149 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
/* The local control table is read with an unaligned load. */
151 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
153 for (; number < nSets; number++) {
/* Load four values, permute their bytes, store back to the same address. */
156 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
157 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
160 _mm256_store_si256((__m256i*)inputPtr, output);
/* Advance by eight 32-bit words, i.e. four 64-bit values. */
163 inputPtr += 2 * nPerSet;
/* Scalar loop for the values left over after the last full set. */
167 for (number = nSets * nPerSet; number < num_points; ++number) {
168 uint32_t output1 = *inputPtr;
169 uint32_t output2 = inputPtr[1];
/* Expressions that reverse the bytes of each 32-bit half; the lines holding
 * their assignment targets and the stores are not shown in this excerpt. */
171 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
172 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
175 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
176 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
186 #include <tmmintrin.h>
188 unsigned int num_points)
/* Fragment of volk_64u_byteswap_a_ssse3; the vector loop body and several
 * other lines are not shown in this excerpt. */
190 unsigned int number = 0;
/* The main loop works on sets of two 64-bit values. */
192 const unsigned int nPerSet = 2;
193 const uint64_t nSets = num_points / nPerSet;
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
195 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-index table: each run of eight indices is in descending order, i.e. a
 * reversal of the bytes of each 8-byte element. How the table is consumed is
 * not shown in this excerpt. */
197 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
201 for (; number < nSets; number++) {
/* Advance by four 32-bit words, i.e. two 64-bit values. */
211 inputPtr += 2 * nPerSet;
/* Scalar loop for the values left over after the last full set. */
215 for (number = nSets * nPerSet; number < num_points; ++number) {
216 uint32_t output1 = *inputPtr;
217 uint32_t output2 = inputPtr[1];
/* Expressions that reverse the bytes of each 32-bit half; the lines holding
 * their assignment targets and the stores are not shown in this excerpt. */
219 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
220 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
223 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
224 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
232 #ifdef LV_HAVE_NEONV8
233 #include <arm_neon.h>
/* In-place byte swap of num_points 64-bit values for ARMv8 NEON: a vector
 * loop over groups of four values followed by a scalar loop for the rest.
 * NOTE: several lines of this function (including the declaration of
 * `input` and the pointer advance of the vector loop) are not shown in this
 * excerpt. */
235 static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap,
unsigned int num_points)
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
237 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Number of complete groups of four 64-bit values. */
238 const unsigned int n4points = num_points / 4;
/* Table-lookup indices: each run of eight is in descending order. */
240 uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
242 unsigned int number = 0;
243 for (number = 0; number < n4points; ++number) {
/* NOTE(review): vld2q_u8 de-interleaves memory (even-offset bytes go to
 * val[0], odd-offset bytes to val[1]) and vst2q_u8 re-interleaves on store.
 * Reversing 8-lane runs of each de-interleaved vector therefore does not
 * look equivalent to reversing the eight bytes of each 64-bit value.
 * Confirm against the scalar path before relying on this kernel. */
245 input = vld2q_u8((uint8_t*)inputPtr);
246 input.val[0] = vqtbl1q_u8(input.val[0], idx);
247 input.val[1] = vqtbl1q_u8(input.val[1], idx);
248 vst2q_u8((uint8_t*)inputPtr, input);
/* Scalar loop for the values left over after the last group of four. */
253 for (number = n4points * 4; number < num_points; ++number) {
254 uint32_t output1 = *inputPtr;
255 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
257 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
258 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
259 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
260 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged positions to complete the 64-bit reversal. */
262 *inputPtr++ = output2;
263 *inputPtr++ = output1;
268 #include <arm_neon.h>
/* Fragment of volk_64u_byteswap_neon; several lines (including the pointer
 * advance of the vector loop) are not shown in this excerpt. */
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
272 uint32_t* inputPtr = (uint32_t*)intsToSwap;
273 unsigned int number = 0;
/* NOTE(review): despite the name, this is the number of groups of FOUR
 * 64-bit values (num_points / 4); the tail loop below resumes at
 * n8points * 4. */
274 unsigned int n8points = num_points / 4;
276 uint8x8x4_t input_table;
277 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
278 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/* Index vectors for vtbl4_u8, given as packed 64-bit decimal constants.
 * Presumably they encode the byte positions, within the de-interleaved
 * vld4_u8 layout, that make up each byte-reversed output value (TODO: confirm
 * by decoding the constants). */
288 int_lookup01 = vcreate_u8(2269495096316185);
289 int_lookup23 = vcreate_u8(146949840772469531);
290 int_lookup45 = vcreate_u8(291630186448622877);
291 int_lookup67 = vcreate_u8(436310532124776223);
293 for (number = 0; number < n8points; ++number) {
/* Load 32 bytes as a four-register table, then build each 8-byte result by
 * table lookup with the corresponding index vector. */
294 input_table = vld4_u8((uint8_t*)inputPtr);
295 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
296 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
297 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
298 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
/* Write the four 8-byte results back over the same 32 bytes
 * (offsets of 0, 2, 4 and 6 32-bit words). */
299 vst1_u8((uint8_t*)inputPtr, swapped_int01);
300 vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
301 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
302 vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
/* Scalar loop for the values left over after the last group of four. */
307 for (number = n8points * 4; number < num_points; ++number) {
308 uint32_t output1 = *inputPtr;
309 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
311 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
312 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
313 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
314 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged positions to complete the 64-bit reversal. */
316 *inputPtr++ = output2;
317 *inputPtr++ = output1;
324 #ifndef INCLUDED_volk_64u_byteswap_a_H
325 #define INCLUDED_volk_64u_byteswap_a_H
327 #include <inttypes.h>
332 #include <emmintrin.h>
/* Fragment of volk_64u_byteswap_a_sse2; several lines of the body are not
 * shown in this excerpt. */
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
336 uint32_t* inputPtr = (uint32_t*)intsToSwap;
337 __m128i input, byte1, byte2, byte3, byte4, output;
/* Number of complete pairs of 64-bit values in the buffer. */
341 const unsigned int halfPoints = num_points / 2;
/* Vector loop, one iteration per pair (loop body not shown in this excerpt). */
342 for (; number < halfPoints; number++) {
/* Scalar loop for the values the pair-wise loop did not cover. */
367 number = halfPoints * 2;
368 for (; number < num_points; number++) {
/* Read the two 32-bit halves of the current 64-bit value. */
369 uint32_t output1 = *inputPtr;
370 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
372 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
373 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
375 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
376 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Write the halves back in exchanged positions; together with the per-half
 * reversal this reverses all eight bytes of the 64-bit value. */
378 *inputPtr++ = output2;
379 *inputPtr++ = output1;
385 #include <immintrin.h>
/* Byte-reverses num_points 64-bit values in place, four at a time with
 * 256-bit loads/stores, then handles the leftover values one by one.
 * Uses _mm256_loadu_si256/_mm256_storeu_si256 on the data buffer, i.e. the
 * unaligned variants of the load/store intrinsics.
 * NOTE: several lines of this function are not shown in this excerpt. */
386 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap,
unsigned int num_points)
388 unsigned int number = 0;
/* Four 64-bit values fit in one 256-bit register. */
390 const unsigned int nPerSet = 4;
391 const uint64_t nSets = num_points / nPerSet;
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
393 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: every run of eight indices is in descending order, so each
 * 8-byte element has its bytes reversed. _mm256_shuffle_epi8 selects within
 * each 128-bit lane from the low four bits of an index, so the entries
 * 16..31 address bytes 0..15 of the upper lane. */
395 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
396 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
397 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
399 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
401 for (; number < nSets; number++) {
/* Load four values, permute their bytes, store back to the same address. */
403 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
404 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
407 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* Advance by eight 32-bit words, i.e. four 64-bit values. */
410 inputPtr += 2 * nPerSet;
/* Scalar loop for the values left over after the last full set. */
414 for (number = nSets * nPerSet; number < num_points; ++number) {
415 uint32_t output1 = *inputPtr;
416 uint32_t output2 = inputPtr[1];
/* Expressions that reverse the bytes of each 32-bit half; the lines holding
 * their assignment targets and the stores are not shown in this excerpt. */
418 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
419 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
422 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
423 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
433 #include <tmmintrin.h>
435 unsigned int num_points)
/* Fragment of volk_64u_byteswap_u_ssse3; the vector loop body and several
 * other lines are not shown in this excerpt. */
437 unsigned int number = 0;
/* The main loop works on sets of two 64-bit values. */
439 const unsigned int nPerSet = 2;
440 const uint64_t nSets = num_points / nPerSet;
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
442 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-index table: each run of eight indices is in descending order, i.e. a
 * reversal of the bytes of each 8-byte element. How the table is consumed is
 * not shown in this excerpt. */
444 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
448 for (; number < nSets; number++) {
/* Advance by four 32-bit words, i.e. two 64-bit values. */
457 inputPtr += 2 * nPerSet;
/* Scalar loop for the values left over after the last full set. */
461 for (number = nSets * nPerSet; number < num_points; ++number) {
462 uint32_t output1 = *inputPtr;
463 uint32_t output2 = inputPtr[1];
/* Expressions that reverse the bytes of each 32-bit half; the lines holding
 * their assignment targets and the stores are not shown in this excerpt. */
465 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
466 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
469 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
470 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
477 #ifdef LV_HAVE_GENERIC
480 unsigned int num_points)
/* Fragment of volk_64u_byteswap_a_generic: in-place byte reversal of
 * num_points 64-bit values using only scalar operations. Some lines are not
 * shown in this excerpt. */
/* The 64-bit buffer is addressed as a sequence of 32-bit words. */
482 uint32_t* inputPtr = (uint32_t*)intsToSwap;
484 for (point = 0; point < num_points; point++) {
/* Read the two 32-bit halves of the current 64-bit value. */
485 uint32_t output1 = *inputPtr;
486 uint32_t output2 = inputPtr[1];
/* Reverse the byte order inside each 32-bit half. */
488 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
489 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
491 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
492 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged positions, which completes the 64-bit
 * reversal and advances the pointer by one 64-bit value. */
494 *inputPtr++ = output2;
495 *inputPtr++ = output1;
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:479
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:187
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:334
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:434
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:62
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:115
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:270
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71