53 #ifndef INCLUDED_volk_32u_byteswap_u_H
54 #define INCLUDED_volk_32u_byteswap_u_H
60 #include <immintrin.h>
/*
 * Reverses the byte order of every 32-bit word in intsToSwap, in place, using
 * AVX2 with unaligned loads and stores.
 *
 * intsToSwap: buffer of 32-bit words; read and overwritten.
 * num_points: upper bound of the loops below, counted in 32-bit words.
 *
 * NOTE(review): this listing skips some source lines (braces, the declaration
 * of `number`, and wherever inputPtr is advanced), so the loop bodies are only
 * partly visible here -- confirm against the full file.
 */
61 static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap,
unsigned int num_points)
/* One 256-bit register holds eight 32-bit words. */
66 const unsigned int nPerSet = 8;
/* Number of full 8-word sets; the remainder goes through the scalar loop. */
67 const uint64_t nSets = num_points / nPerSet;
69 uint32_t* inputPtr = intsToSwap;
/*
 * Shuffle control bytes: inside each group of four the indices run backwards
 * (3,2,1,0 then 7,6,5,4 ...), which reverses the bytes of each 32-bit word.
 * _mm256_shuffle_epi8 shuffles within each 128-bit lane and uses only the low
 * four bits of an index, so entries 16..31 select bytes 0..15 of the upper
 * lane.
 */
71 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
72 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
73 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
/* Unaligned load: shuffleVector is declared without any alignment attribute. */
75 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
/* Vector loop: one iteration per full set of eight words. */
77 for (number = 0; number < nSets; number++) {
/* Load 32 bytes, byte-reverse each word, store back to the same address. */
80 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
81 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
84 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* Scalar loop for the words left over after the last full set. */
89 for (number = nSets * nPerSet; number < num_points; number++) {
90 uint32_t outputVal = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
91 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
92 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
/* Write the swapped word back in place. */
93 *inputPtr = outputVal;
101 #include <emmintrin.h>
/*
 * NOTE(review): the signature for this body is not part of this listing;
 * presumably it is volk_32u_byteswap_u_sse2(uint32_t* intsToSwap,
 * unsigned int num_points) -- confirm against the full file.
 */
105 unsigned int number = 0;
107 uint32_t* inputPtr = intsToSwap;
/* Working registers; the statements that fill them are not shown here. */
108 __m128i input, byte1, byte2, byte3, byte4, output;
/* Number of full groups of four 32-bit words (one 128-bit register each). */
112 const uint64_t quarterPoints = num_points / 4;
/* Vector loop over the full groups; its body is not shown in this listing. */
113 for (; number < quarterPoints; number++) {
/* Continue with the first word not covered by a full group of four. */
133 number = quarterPoints * 4;
/* Scalar loop over the remaining words. */
134 for (; number < num_points; number++) {
135 uint32_t outputVal = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
136 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
137 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
/* Write the swapped word back in place. */
138 *inputPtr = outputVal;
146 #include <arm_neon.h>
/*
 * NOTE(review): the signature for this body is not part of this listing;
 * presumably it is volk_32u_byteswap_neon(uint32_t* intsToSwap,
 * unsigned int num_points) -- confirm against the full file. The lines that
 * advance inputPtr and the store in the scalar loop are also not shown.
 */
150 uint32_t* inputPtr = intsToSwap;
151 unsigned int number = 0;
/* Number of full groups of eight 32-bit words (32 bytes per iteration). */
152 unsigned int n8points = num_points / 8;
/* 32-byte lookup table filled by the de-interleaving load below. */
154 uint8x8x4_t input_table;
/* Index vectors and results, one pair of 32-bit words each. */
155 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
156 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/*
 * Precomputed table indices packed into 64-bit constants, lowest byte first.
 * The first constant is 0x0109111900081018, i.e. the index bytes
 * {24, 16, 8, 0, 25, 17, 9, 1}. vld4_u8 puts byte k of word w at table index
 * 8 * k + w, so these indices pick bytes 3,2,1,0 of word 0 followed by bytes
 * 3,2,1,0 of word 1. Each following constant equals the previous one plus
 * 0x0202020202020202, which moves the selection on by two words.
 */
166 int_lookup01 = vcreate_u8(74609667900706840);
167 int_lookup23 = vcreate_u8(219290013576860186);
168 int_lookup45 = vcreate_u8(363970359253013532);
169 int_lookup67 = vcreate_u8(508650704929166878);
/* Vector loop: one iteration per full group of eight words. */
171 for (number = 0; number < n8points; ++number) {
/* De-interleaving load of 32 bytes: val[k] receives byte k of each word. */
172 input_table = vld4_u8((uint8_t*)inputPtr);
/* Table lookups gather the bytes of two words at a time in reversed order. */
173 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
174 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
175 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
176 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
/* Store the four 8-byte results back over the eight words just read. */
177 vst1_u8((uint8_t*)inputPtr, swapped_int01);
178 vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
179 vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
180 vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
/* Scalar loop for the words left over after the last full group. */
185 for (number = n8points * 8; number < num_points; ++number) {
186 uint32_t output = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
187 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
188 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
196 #ifdef LV_HAVE_NEONV8
197 #include <arm_neon.h>
/*
 * Reverses the byte order of every 32-bit word in intsToSwap, in place, using
 * the 16-byte table lookup vqtbl1q_u8.
 *
 * intsToSwap: buffer of 32-bit words; read and overwritten.
 * num_points: upper bound of the loops below, counted in 32-bit words.
 *
 * NOTE(review): this listing skips some source lines (braces, the declaration
 * of `input`, and the statements between the two load/store sequences, which
 * presumably advance inputPtr) -- confirm against the full file.
 */
199 static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap,
unsigned int num_points)
201 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Number of full groups of eight 32-bit words. */
202 const unsigned int n8points = num_points / 8;
/*
 * Lookup indices: inside each group of four the indices run backwards, so
 * each 32-bit word of a 16-byte register comes out byte-reversed.
 */
204 uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
206 unsigned int number = 0;
/* Vector loop: two 16-byte load/lookup/store sequences per iteration. */
207 for (number = 0; number < n8points; ++number) {
/* First sequence: four words (16 bytes). */
209 input = vld1q_u8((uint8_t*)inputPtr);
210 input = vqtbl1q_u8(input, idx);
211 vst1q_u8((uint8_t*)inputPtr, input);
/* Second sequence: another 16 bytes read through inputPtr. */
214 input = vld1q_u8((uint8_t*)inputPtr);
215 input = vqtbl1q_u8(input, idx);
216 vst1q_u8((uint8_t*)inputPtr, input);
/* Scalar loop for the words left over after the last full group. */
220 for (number = n8points * 8; number < num_points; ++number) {
221 uint32_t output = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
223 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
224 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
/* Write the swapped word back and step to the next word. */
226 *inputPtr++ = output;
232 #ifdef LV_HAVE_GENERIC
235 unsigned int num_points)
/*
 * Portable scalar byte swap of 32-bit words.
 * NOTE(review): the first line of the signature, the declaration of `point`
 * and the end of the loop body are not part of this listing; presumably this
 * is volk_32u_byteswap_generic -- confirm against the full file.
 */
237 uint32_t* inputPtr = intsToSwap;
/* One iteration per 32-bit word. */
240 for (point = 0; point < num_points; point++) {
241 uint32_t output = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
242 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
243 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
253 #ifndef INCLUDED_volk_32u_byteswap_a_H
254 #define INCLUDED_volk_32u_byteswap_a_H
256 #include <inttypes.h>
261 #include <immintrin.h>
/*
 * Reverses the byte order of every 32-bit word in intsToSwap, in place, using
 * AVX2 with aligned loads and stores.
 *
 * intsToSwap: buffer of 32-bit words; read and overwritten. It is accessed
 *             with _mm256_load_si256 / _mm256_store_si256, which require a
 *             32-byte aligned address.
 * num_points: upper bound of the loops below, counted in 32-bit words.
 *
 * NOTE(review): this listing skips some source lines (braces, the declaration
 * of `number`, and wherever inputPtr is advanced), so the loop bodies are only
 * partly visible here -- confirm against the full file.
 */
262 static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap,
unsigned int num_points)
/* One 256-bit register holds eight 32-bit words. */
267 const unsigned int nPerSet = 8;
/* Number of full 8-word sets; the remainder goes through the scalar loop. */
268 const uint64_t nSets = num_points / nPerSet;
270 uint32_t* inputPtr = intsToSwap;
/*
 * Shuffle control bytes: inside each group of four the indices run backwards
 * (3,2,1,0 then 7,6,5,4 ...), which reverses the bytes of each 32-bit word.
 * _mm256_shuffle_epi8 shuffles within each 128-bit lane and uses only the low
 * four bits of an index, so entries 16..31 select bytes 0..15 of the upper
 * lane.
 */
272 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
273 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
274 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
/*
 * The mask is still loaded unaligned: shuffleVector is declared without any
 * alignment attribute.
 */
276 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
/* Vector loop: one iteration per full set of eight words. */
278 for (number = 0; number < nSets; number++) {
/* Aligned load of 32 bytes, byte-reverse each word, aligned store back. */
281 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
282 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
285 _mm256_store_si256((__m256i*)inputPtr, output);
/* Scalar loop for the words left over after the last full set. */
290 for (number = nSets * nPerSet; number < num_points; number++) {
291 uint32_t outputVal = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
292 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
293 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
/* Write the swapped word back in place. */
294 *inputPtr = outputVal;
302 #include <emmintrin.h>
/*
 * NOTE(review): the signature for this body is not part of this listing;
 * presumably it is volk_32u_byteswap_a_sse2(uint32_t* intsToSwap,
 * unsigned int num_points) -- confirm against the full file.
 */
307 unsigned int number = 0;
309 uint32_t* inputPtr = intsToSwap;
/* Working registers; the statements that fill them are not shown here. */
310 __m128i input, byte1, byte2, byte3, byte4, output;
/* Number of full groups of four 32-bit words (one 128-bit register each). */
314 const uint64_t quarterPoints = num_points / 4;
/* Vector loop over the full groups; its body is not shown in this listing. */
315 for (; number < quarterPoints; number++) {
/* Continue with the first word not covered by a full group of four. */
335 number = quarterPoints * 4;
/* Scalar loop over the remaining words. */
336 for (; number < num_points; number++) {
337 uint32_t outputVal = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
338 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
339 ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
/* Write the swapped word back in place. */
340 *inputPtr = outputVal;
347 #ifdef LV_HAVE_GENERIC
350 unsigned int num_points)
/*
 * Portable scalar byte swap of 32-bit words.
 * NOTE(review): the first line of the signature, the declaration of `point`
 * and the end of the loop body are not part of this listing; presumably this
 * is volk_32u_byteswap_a_generic -- confirm against the full file.
 */
352 uint32_t* inputPtr = intsToSwap;
/* One iteration per 32-bit word. */
355 for (point = 0; point < num_points; point++) {
356 uint32_t output = *inputPtr;
/* Move each of the four bytes to its mirrored position. */
357 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
358 ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:148
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:234
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:349
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:103
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:305
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71