Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_32u_byteswap_u_H
54 #define INCLUDED_volk_32u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #if LV_HAVE_AVX2
60 #include <immintrin.h>
61 static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
62 {
63 
64  unsigned int number;
65 
66  const unsigned int nPerSet = 8;
67  const uint64_t nSets = num_points / nPerSet;
68 
69  uint32_t* inputPtr = intsToSwap;
70 
71  const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
72  8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
73  21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
74 
75  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
76 
77  for (number = 0; number < nSets; number++) {
78 
79  // Load the 32t values, increment inputPtr later since we're doing it in-place.
80  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
81  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
82 
83  // Store the results
84  _mm256_storeu_si256((__m256i*)inputPtr, output);
85  inputPtr += nPerSet;
86  }
87 
88  // Byteswap any remaining points:
89  for (number = nSets * nPerSet; number < num_points; number++) {
90  uint32_t outputVal = *inputPtr;
91  outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
92  ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
93  *inputPtr = outputVal;
94  inputPtr++;
95  }
96 }
97 #endif /* LV_HAVE_AVX2 */
98 
99 
100 #ifdef LV_HAVE_SSE2
101 #include <emmintrin.h>
102 
/*!
 * \brief Reverses the byte order of each 32-bit word of a buffer in place (SSE2).
 *
 * The vector loop uses unaligned 128-bit loads and stores, so \p intsToSwap
 * needs no alignment beyond that of uint32_t.
 *
 * \param intsToSwap Buffer of 32-bit words; each word is overwritten with its
 *                   byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128i upperMiddleMask = _mm_set1_epi32(0x00FF0000);
    const __m128i lowerMiddleMask = _mm_set1_epi32(0x0000FF00);

    uint32_t* cursor = intsToSwap;
    unsigned int number;

    /* Four words per iteration; the result is written back over the input. */
    for (number = 0; number < quarterPoints; number++) {
        const __m128i input = _mm_loadu_si128((const __m128i*)cursor);

        /* The 24-bit shifts leave only the wanted byte in each word, so the
         * two outer bytes can be combined without masking. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(input, 24), _mm_srli_epi32(input, 24));
        /* The 8-bit shifts drag neighbouring bytes along; mask them away. */
        const __m128i upperMiddle =
            _mm_and_si128(_mm_slli_epi32(input, 8), upperMiddleMask);
        const __m128i lowerMiddle =
            _mm_and_si128(_mm_srli_epi32(input, 8), lowerMiddleMask);

        _mm_storeu_si128((__m128i*)cursor,
                         _mm_or_si128(_mm_or_si128(outerBytes, upperMiddle), lowerMiddle));
        cursor += 4;
    }

    /* Scalar byteswap of the words that did not fill a whole vector. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
142 #endif /* LV_HAVE_SSE2 */
143 
144 
145 #ifdef LV_HAVE_NEON
146 #include <arm_neon.h>
147 
/*!
 * \brief Reverses the byte order of each 32-bit word of a buffer in place (NEON).
 *
 * Eight words are handled per vector iteration: they are loaded with a
 * 4-way de-interleaving load and reassembled in swapped order through table
 * lookups. Remaining words are swapped one at a time.
 *
 * \param intsToSwap Buffer of 32-bit words; each word is overwritten with its
 *                   byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
{
    const unsigned int n8points = num_points / 8;
    uint32_t* cursor = intsToSwap;
    unsigned int number;

    /* vld4_u8 splits the 32 loaded bytes into four 8-byte rows: row k holds
     * byte k of each of the eight words, and vtbl4_u8 addresses row k at table
     * indices 8*k .. 8*k+7. Word w byte-reversed is therefore the index
     * sequence {24+w, 16+w, 8+w, w}. Each constant below packs those indices
     * for two consecutive words, first index in the least significant byte;
     * e.g. words 0 and 1 give {24, 16, 8, 0, 25, 17, 9, 1}. */
    const uint8x8_t lookup01 = vcreate_u8(0x0109111900081018ULL);
    const uint8x8_t lookup23 = vcreate_u8(0x030B131B020A121AULL);
    const uint8x8_t lookup45 = vcreate_u8(0x050D151D040C141CULL);
    const uint8x8_t lookup67 = vcreate_u8(0x070F171F060E161EULL);

    for (number = 0; number < n8points; ++number) {
        /* All 32 bytes are loaded before any store, so writing back over the
         * same memory is safe. */
        const uint8x8x4_t table = vld4_u8((uint8_t*)cursor);

        vst1_u8((uint8_t*)cursor, vtbl4_u8(table, lookup01));
        vst1_u8((uint8_t*)(cursor + 2), vtbl4_u8(table, lookup23));
        vst1_u8((uint8_t*)(cursor + 4), vtbl4_u8(table, lookup45));
        vst1_u8((uint8_t*)(cursor + 6), vtbl4_u8(table, lookup67));

        cursor += 8;
    }

    /* Scalar byteswap of the words that did not fill a whole group of eight. */
    for (number = n8points * 8; number < num_points; ++number) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
194 #endif /* LV_HAVE_NEON */
195 
196 #ifdef LV_HAVE_NEONV8
197 #include <arm_neon.h>
198 
/*!
 * \brief Reverses the byte order of each 32-bit word of a buffer in place
 *        (ARMv8 NEON).
 *
 * Each vector iteration swaps eight words as two 16-byte vectors, using a
 * single-register table lookup per vector. Remaining words are swapped one at
 * a time.
 *
 * \param intsToSwap Buffer of 32-bit words; each word is overwritten with its
 *                   byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
{
    const unsigned int n8points = num_points / 8;
    /* Source byte index for every destination byte of a 16-byte vector:
     * the four bytes of each word in reverse order. */
    const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    uint32_t* cursor = intsToSwap;
    uint8x16_t lane;
    unsigned int number;

    for (number = 0; number < n8points; ++number) {
        /* Hint at the eight words the next iteration will touch. */
        __VOLK_PREFETCH(cursor + 8);

        /* First four words of this group. */
        lane = vld1q_u8((uint8_t*)cursor);
        lane = vqtbl1q_u8(lane, idx);
        vst1q_u8((uint8_t*)cursor, lane);
        cursor += 4;

        /* Second four words of this group. */
        lane = vld1q_u8((uint8_t*)cursor);
        lane = vqtbl1q_u8(lane, idx);
        vst1q_u8((uint8_t*)cursor, lane);
        cursor += 4;
    }

    /* Scalar byteswap of the words that did not fill a whole group of eight. */
    for (number = n8points * 8; number < num_points; ++number) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
229 #endif /* LV_HAVE_NEONV8 */
230 
231 
232 #ifdef LV_HAVE_GENERIC
233 
/*!
 * \brief Reverses the byte order of each 32-bit word of a buffer in place
 *        (portable C).
 *
 * \param intsToSwap Buffer of 32-bit words; each word is overwritten with its
 *                   byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;

    for (point = 0; point < num_points; point++) {
        const uint32_t word = intsToSwap[point];
        /* Move every byte to its mirrored position within the word. */
        intsToSwap[point] = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                            ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
249 #endif /* LV_HAVE_GENERIC */
250 
251 
252 #endif /* INCLUDED_volk_32u_byteswap_u_H */
253 #ifndef INCLUDED_volk_32u_byteswap_a_H
254 #define INCLUDED_volk_32u_byteswap_a_H
255 
256 #include <inttypes.h>
257 #include <stdio.h>
258 
259 
260 #if LV_HAVE_AVX2
261 #include <immintrin.h>
262 static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
263 {
264 
265  unsigned int number;
266 
267  const unsigned int nPerSet = 8;
268  const uint64_t nSets = num_points / nPerSet;
269 
270  uint32_t* inputPtr = intsToSwap;
271 
272  const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
273  8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
274  21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
275 
276  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
277 
278  for (number = 0; number < nSets; number++) {
279 
280  // Load the 32t values, increment inputPtr later since we're doing it in-place.
281  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
282  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
283 
284  // Store the results
285  _mm256_store_si256((__m256i*)inputPtr, output);
286  inputPtr += nPerSet;
287  }
288 
289  // Byteswap any remaining points:
290  for (number = nSets * nPerSet; number < num_points; number++) {
291  uint32_t outputVal = *inputPtr;
292  outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
293  ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
294  *inputPtr = outputVal;
295  inputPtr++;
296  }
297 }
298 #endif /* LV_HAVE_AVX2 */
299 
300 
301 #ifdef LV_HAVE_SSE2
302 #include <emmintrin.h>
303 
304 
/*!
 * \brief Reverses the byte order of each 32-bit word of an aligned buffer in
 *        place (SSE2).
 *
 * The vector loop uses the aligned 128-bit load and store intrinsics, which
 * require \p intsToSwap to be 16-byte aligned.
 *
 * \param intsToSwap 16-byte aligned buffer of 32-bit words; each word is
 *                   overwritten with its byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128i upperMiddleMask = _mm_set1_epi32(0x00FF0000);
    const __m128i lowerMiddleMask = _mm_set1_epi32(0x0000FF00);

    uint32_t* cursor = intsToSwap;
    unsigned int number;

    /* Four words per iteration; the result is written back over the input. */
    for (number = 0; number < quarterPoints; number++) {
        const __m128i input = _mm_load_si128((const __m128i*)cursor);

        /* The 24-bit shifts leave only the wanted byte in each word, so the
         * two outer bytes can be combined without masking. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(input, 24), _mm_srli_epi32(input, 24));
        /* The 8-bit shifts drag neighbouring bytes along; mask them away. */
        const __m128i upperMiddle =
            _mm_and_si128(_mm_slli_epi32(input, 8), upperMiddleMask);
        const __m128i lowerMiddle =
            _mm_and_si128(_mm_srli_epi32(input, 8), lowerMiddleMask);

        _mm_store_si128((__m128i*)cursor,
                        _mm_or_si128(_mm_or_si128(outerBytes, upperMiddle), lowerMiddle));
        cursor += 4;
    }

    /* Scalar byteswap of the words that did not fill a whole vector. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
344 #endif /* LV_HAVE_SSE2 */
345 
346 
347 #ifdef LV_HAVE_GENERIC
348 
/*!
 * \brief Reverses the byte order of each 32-bit word of a buffer in place
 *        (portable C, aligned-kernel entry point).
 *
 * Only scalar word accesses are used, so this implementation itself needs no
 * alignment beyond that of uint32_t.
 *
 * \param intsToSwap Buffer of 32-bit words; each word is overwritten with its
 *                   byte-reversed value.
 * \param num_points Number of 32-bit words in \p intsToSwap.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;

    for (point = 0; point < num_points; point++) {
        const uint32_t word = intsToSwap[point];
        /* Move every byte to its mirrored position within the word. */
        intsToSwap[point] = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                            ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
364 #endif /* LV_HAVE_GENERIC */
365 
366 
367 #endif /* INCLUDED_volk_32u_byteswap_a_H */
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:148
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:234
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:349
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:103
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:305
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71