Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_16u_byteswap_u_H
41 #define INCLUDED_volk_16u_byteswap_u_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_GENERIC
47 
/*
 * Portable byte swap of 16-bit values, performed in place.
 *
 * intsToSwap: buffer holding the values; each element is overwritten with
 *             its two bytes exchanged.
 * num_points: number of uint16_t elements to process; 0 leaves the buffer
 *             untouched.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const uint16_t value = intsToSwap[idx];
        /* The high byte moves down, the low byte moves up. */
        const uint16_t low_part = (uint16_t)((value >> 8) & 0xff);
        const uint16_t high_part = (uint16_t)((value << 8) & 0xff00);
        intsToSwap[idx] = (uint16_t)(low_part | high_part);
    }
}
59 #endif /* LV_HAVE_GENERIC */
60 
61 
62 #if LV_HAVE_AVX2
63 #include <immintrin.h>
/*
 * In-place byte swap of 16-bit values using AVX2 with aligned memory access.
 *
 * intsToSwap: buffer to swap in place. It is accessed with the aligned
 *             256-bit load/store intrinsics, so it is expected to be
 *             32-byte aligned.
 * num_points: number of uint16_t elements to process.
 *
 * Full groups of 16 values go through the vector shuffle; whatever is left
 * over (fewer than 16 values) is swapped one value at a time.
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int valuesPerVector = 16;
    const uint64_t vectorCount = num_points / valuesPerVector;

    /* Byte selector: every adjacent pair of byte positions is exchanged. */
    const uint8_t swapIndices[32] = { 1,  0,  3,  2,  5,  4,  7,  6,
                                      9,  8,  11, 10, 13, 12, 15, 14,
                                      17, 16, 19, 18, 21, 20, 23, 22,
                                      25, 24, 27, 26, 29, 28, 31, 30 };
    const __m256i swapMask = _mm256_loadu_si256((const __m256i*)swapIndices);

    uint16_t* cursor = intsToSwap;
    uint64_t done = 0;
    while (done < vectorCount) {
        /* Same address is read and written: the swap happens in place. */
        __m256i* slot = (__m256i*)cursor;
        _mm256_store_si256(slot,
                           _mm256_shuffle_epi8(_mm256_load_si256(slot), swapMask));
        cursor += valuesPerVector;
        ++done;
    }

    /* Scalar clean-up for the values that did not fill a whole vector. */
    unsigned int idx = (unsigned int)(valuesPerVector * vectorCount);
    for (; idx < num_points; ++idx, ++cursor) {
        const uint16_t value = *cursor;
        *cursor = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
    }
}
97 #endif /* LV_HAVE_AVX2 */
98 
99 
100 #if LV_HAVE_AVX2
101 #include <immintrin.h>
/*
 * In-place byte swap of 16-bit values using AVX2 with unaligned memory
 * access.
 *
 * intsToSwap: buffer to swap in place; accessed with the unaligned 256-bit
 *             load/store intrinsics.
 * num_points: number of uint16_t elements to process.
 *
 * Groups of 16 values are swapped with one byte shuffle each; the remaining
 * values (fewer than 16) are swapped with scalar code.
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int groupSize = 16;
    const uint64_t groupCount = num_points / groupSize;

    /* Shuffle control: positions 2k and 2k+1 trade places for every k. */
    const uint8_t pairSwap[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                   10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                   23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
    const __m256i control = _mm256_loadu_si256((const __m256i*)&pairSwap[0]);

    uint16_t* position = intsToSwap;
    for (uint64_t group = 0; group < groupCount; ++group) {
        /* Read a group, swap it, and write it back to the same location. */
        const __m256i original = _mm256_loadu_si256((const __m256i*)position);
        const __m256i swapped = _mm256_shuffle_epi8(original, control);
        _mm256_storeu_si256((__m256i*)position, swapped);
        position += groupSize;
    }

    /* Finish the values that did not make up a full group. */
    unsigned int leftover = num_points - (unsigned int)(groupSize * groupCount);
    while (leftover > 0) {
        const uint16_t value = *position;
        *position = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        ++position;
        --leftover;
    }
}
135 #endif /* LV_HAVE_AVX2 */
136 
137 
138 #ifdef LV_HAVE_SSE2
139 #include <emmintrin.h>
140 
/*
 * In-place byte swap of 16-bit values using SSE2 with unaligned memory
 * access.
 *
 * intsToSwap: buffer to swap in place; accessed with the unaligned 128-bit
 *             load/store intrinsics.
 * num_points: number of uint16_t elements to process.
 *
 * Eight values are handled per iteration by OR-ing together the vector
 * shifted left by 8 bits and shifted right by 8 bits (per 16-bit element).
 * The remaining values (fewer than 8) are swapped with scalar code.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;
    uint16_t* cursor = intsToSwap;

    for (unsigned int iter = 0; iter < vectorIterations; ++iter, cursor += 8) {
        /* Load and store use the same address: the swap is done in place. */
        __m128i* slot = (__m128i*)cursor;
        const __m128i words = _mm_loadu_si128(slot);
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        _mm_storeu_si128(slot, _mm_or_si128(lowToHigh, highToLow));
    }

    /* Scalar clean-up for the values that did not fill a whole vector. */
    for (unsigned int idx = vectorIterations * 8; idx < num_points; ++idx, ++cursor) {
        const uint16_t value = *cursor;
        *cursor = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
    }
}
170 #endif /* LV_HAVE_SSE2 */
171 
172 
173 #endif /* INCLUDED_volk_16u_byteswap_u_H */
174 #ifndef INCLUDED_volk_16u_byteswap_a_H
175 #define INCLUDED_volk_16u_byteswap_a_H
176 
177 #include <inttypes.h>
178 #include <stdio.h>
179 
180 #ifdef LV_HAVE_SSE2
181 #include <emmintrin.h>
182 
/*
 * In-place byte swap of 16-bit values using SSE2 with aligned memory access.
 *
 * intsToSwap: buffer to swap in place. It is accessed with the aligned
 *             128-bit load/store intrinsics, so it is expected to be
 *             16-byte aligned.
 * num_points: number of uint16_t elements to process.
 *
 * Eight values are swapped per iteration; the remaining values (fewer than
 * 8) are handed to volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;
    uint16_t* cursor = intsToSwap;

    unsigned int pending = vectorIterations;
    while (pending > 0) {
        /* Load and store use the same address: the swap is done in place. */
        __m128i* slot = (__m128i*)cursor;
        const __m128i words = _mm_load_si128(slot);
        /* Per 16-bit element: (x << 8) | (x >> 8) exchanges the two bytes. */
        const __m128i swapped =
            _mm_or_si128(_mm_slli_epi16(words, 8), _mm_srli_epi16(words, 8));
        _mm_store_si128(slot, swapped);
        cursor += 8;
        --pending;
    }

    /* Let the scalar kernel finish the values past the last full vector. */
    volk_16u_byteswap_generic(cursor, num_points - vectorIterations * 8);
}
205 #endif /* LV_HAVE_SSE2 */
206 
207 #ifdef LV_HAVE_NEON
208 #include <arm_neon.h>
209 
/*
 * In-place byte swap of 16-bit values using NEON.
 *
 * intsToSwap: buffer to swap in place.
 * num_points: number of uint16_t elements to process.
 *
 * Eight values are swapped per iteration; the remaining values (fewer than
 * 8) are handed to volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;

    for (unsigned int number = 0; number < eighth_points; number++) {
        const uint16x8_t input = vld1q_u16(inputPtr);
        /* Reverse the two bytes inside every 16-bit element. This replaces a
         * shift-insert pair whose first step read the result vector before
         * it had been initialized. */
        const uint16x8_t output =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Let the scalar kernel finish the values past the last full vector. */
    volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
}
227 #endif /* LV_HAVE_NEON */
228 
229 #ifdef LV_HAVE_NEON
230 #include <arm_neon.h>
231 
/*
 * In-place byte swap of 16-bit values using NEON table lookups.
 *
 * intsToSwap: buffer to swap in place.
 * num_points: number of uint16_t elements to process.
 *
 * Sixteen values (32 bytes) are handled per iteration; the remaining values
 * (fewer than 16) are handed to volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int n16points = num_points / 16;

    /* Byte indices into the 32-entry table built by vld4_u8 below.
       vld4_u8 de-interleaves the 32 loaded bytes into four 8-byte vectors
       (source byte b ends up in val[b % 4][b / 4]) and vtbl4_u8 indexes
       those four vectors as one table, so source byte b is table entry
       8 * (b % 4) + b / 4.
       Each constant packs the eight table entries for eight consecutive
       output bytes, first output byte in the least significant byte.
       For example, output bytes 0..7 must be source bytes 1,0,3,2,5,4,7,6,
       i.e. table entries {8, 0, 24, 16, 9, 1, 25, 17}:
           uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
           uint64_t index = 0;
           for (int ii = 0; ii < 8; ++ii) {
               index += ((uint64_t)chars[ii]) << (ii * 8);
           }
       Every following constant adds 2 to each entry.
     */
    const uint8x8_t int_lookup01 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t int_lookup23 = vcreate_u8(0x131B030B121A020AULL);
    const uint8x8_t int_lookup45 = vcreate_u8(0x151D050D141C040CULL);
    const uint8x8_t int_lookup67 = vcreate_u8(0x171F070F161E060EULL);

    for (unsigned int number = 0; number < n16points; ++number) {
        /* All four lookups read the table loaded here, so the stores below
           cannot disturb data that is still needed. */
        const uint8x8x4_t input_table = vld4_u8((const uint8_t*)inputPtr);
        const uint8x8_t swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        const uint8x8_t swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        const uint8x8_t swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        const uint8x8_t swapped_int67 = vtbl4_u8(input_table, int_lookup67);

        /* Each store writes 8 bytes, i.e. four uint16_t values. */
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);

        inputPtr += 16;
    }

    /* Let the scalar kernel finish the values past the last full group. */
    volk_16u_byteswap_generic(inputPtr, num_points - n16points * 16);
}
272 #endif /* LV_HAVE_NEON */
273 
274 #ifdef LV_HAVE_GENERIC
275 
/*
 * Portable byte swap of 16-bit values, performed in place.
 *
 * intsToSwap: buffer holding the values; each element is overwritten with
 *             its two bytes exchanged.
 * num_points: number of uint16_t elements to process; 0 leaves the buffer
 *             untouched.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        const uint16_t value = *cursor;
        /* (value >> 8) supplies the new low byte, (value << 8) the new high byte. */
        *cursor = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        ++cursor;
        --remaining;
    }
}
287 #endif /* LV_HAVE_GENERIC */
288 
289 #ifdef LV_HAVE_ORC
290 
/* Defined outside this header; presumably supplied by the ORC-generated
 * kernel object (confirm against the build). */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap,
                                         unsigned int num_points);

/*
 * ORC entry point for the 16-bit byte swap.
 *
 * Forwards intsToSwap and num_points unchanged to
 * volk_16u_byteswap_a_orc_impl and does no work of its own.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap,
                                           unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
296 #endif /* LV_HAVE_ORC */
297 
298 
299 #endif /* INCLUDED_volk_16u_byteswap_a_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5544
#define _mm_srli_epi16(a, imm)
Definition: sse2neon.h:5812
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:141
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:276
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:210
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:183
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:48
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:232