Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_16i_convert_8i_u_H
41 #define INCLUDED_volk_16i_convert_8i_u_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 * each input (AVX2, unaligned loads and stores).
 *
 * Every output is (int8_t)(input >> 8).
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    /* Keep the source const-qualified rather than casting the qualifier away. */
    const __m256i* inputPtr = (const __m256i*)inputVector;
    __m256i* outputPtr = (__m256i*)outputVector;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        /* Load 32 samples as two vectors of 16. */
        __m256i inputVal1 = _mm256_loadu_si256(inputPtr++);
        __m256i inputVal2 = _mm256_loadu_si256(inputPtr++);

        /* The arithmetic shift leaves every lane in [-128, 127], so the
         * saturating pack below never clamps. */
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* The pack works per 128-bit lane, so the 64-bit chunks come out as
         * [v1.lo, v2.lo, v1.hi, v2.hi]. Selecting chunks 0, 2, 1, 3 restores
         * sample order. 0xD8 has the bit pattern 11 01 10 00; it is written in
         * hex because binary literals are not standard C before C23. */
        __m256i ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256(outputPtr++, ret);
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
86 #endif /* LV_HAVE_AVX2 */
87 
88 
89 #ifdef LV_HAVE_SSE2
90 #include <emmintrin.h>
91 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 * each input (SSE2, unaligned loads and stores).
 *
 * Every output is (int8_t)(input >> 8).
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    /* Keep the source const-qualified rather than casting the qualifier away. */
    const __m128i* inputPtr = (const __m128i*)inputVector;
    __m128i* outputPtr = (__m128i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* Load 16 samples as two vectors of 8. */
        __m128i inputVal1 = _mm_loadu_si128(inputPtr++);
        __m128i inputVal2 = _mm_loadu_si128(inputPtr++);

        /* The arithmetic shift leaves every lane in [-128, 127], so the
         * saturating pack below never clamps. */
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        _mm_storeu_si128(outputPtr++, _mm_packs_epi16(inputVal1, inputVal2));
    }

    /* Scalar tail for the samples that do not fill a 16-wide block. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
128 #endif /* LV_HAVE_SSE2 */
129 
130 
131 #ifdef LV_HAVE_GENERIC
132 
/*!
 * \brief Portable reference kernel: converts 16-bit samples to 8-bit samples
 * by keeping the high byte of each input.
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        const int16_t sample = inputVector[idx];
        /* Drop the low byte; the cast keeps the remaining high byte. */
        outputVector[idx] = (int8_t)(sample >> 8);
    }
}
145 #endif /* LV_HAVE_GENERIC */
146 
147 
148 #endif /* INCLUDED_volk_16i_convert_8i_u_H */
149 #ifndef INCLUDED_volk_16i_convert_8i_a_H
150 #define INCLUDED_volk_16i_convert_8i_a_H
151 
152 #include <inttypes.h>
153 #include <stdio.h>
154 
155 #ifdef LV_HAVE_AVX2
156 #include <immintrin.h>
157 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 * each input (AVX2, aligned loads and stores).
 *
 * Every output is (int8_t)(input >> 8). The vector loop uses
 * _mm256_load_si256 / _mm256_store_si256, so both buffers must be 32-byte
 * aligned whenever num_points >= 32.
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    /* Keep the source const-qualified rather than casting the qualifier away. */
    const __m256i* inputPtr = (const __m256i*)inputVector;
    __m256i* outputPtr = (__m256i*)outputVector;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        /* Load 32 samples as two vectors of 16. */
        __m256i inputVal1 = _mm256_load_si256(inputPtr++);
        __m256i inputVal2 = _mm256_load_si256(inputPtr++);

        /* The arithmetic shift leaves every lane in [-128, 127], so the
         * saturating pack below never clamps. */
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* The pack works per 128-bit lane, so the 64-bit chunks come out as
         * [v1.lo, v2.lo, v1.hi, v2.hi]. Selecting chunks 0, 2, 1, 3 restores
         * sample order. 0xD8 has the bit pattern 11 01 10 00; it is written in
         * hex because binary literals are not standard C before C23. */
        __m256i ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256(outputPtr++, ret);
    }

    /* Scalar tail for the samples that do not fill a 32-wide block. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
195 #endif /* LV_HAVE_AVX2 */
196 
197 
198 #ifdef LV_HAVE_SSE2
199 #include <emmintrin.h>
200 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 * each input (SSE2, aligned loads and stores).
 *
 * Every output is (int8_t)(input >> 8). The vector loop uses
 * _mm_load_si128 / _mm_store_si128, so both buffers must be 16-byte aligned
 * whenever num_points >= 16.
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    /* Keep the source const-qualified rather than casting the qualifier away. */
    const __m128i* inputPtr = (const __m128i*)inputVector;
    __m128i* outputPtr = (__m128i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* Load 16 samples as two vectors of 8. */
        __m128i inputVal1 = _mm_load_si128(inputPtr++);
        __m128i inputVal2 = _mm_load_si128(inputPtr++);

        /* The arithmetic shift leaves every lane in [-128, 127], so the
         * saturating pack below never clamps. */
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        _mm_store_si128(outputPtr++, _mm_packs_epi16(inputVal1, inputVal2));
    }

    /* Scalar tail for the samples that do not fill a 16-wide block. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
237 #endif /* LV_HAVE_SSE2 */
238 
239 
240 #ifdef LV_HAVE_NEON
241 #include <arm_neon.h>
242 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 * each input (ARM NEON).
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int full_blocks = num_points / 16;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < full_blocks; ++block) {
        /* Fetch 16 samples as two 8-lane vectors. */
        const int16x8_t first_half = vld1q_s16(src);
        const int16x8_t second_half = vld1q_s16(src + 8);

        /* Shift each lane right by 8 and narrow it to 8 bits, then join the
         * two halves and store all 16 results at once. */
        vst1q_s8(dst,
                 vcombine_s8(vshrn_n_s16(first_half, 8),
                             vshrn_n_s16(second_half, 8)));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a 16-wide block. */
    for (idx = full_blocks * 16; idx < num_points; ++idx) {
        outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
    }
}
276 #endif /* LV_HAVE_NEON */
277 
278 
279 #ifdef LV_HAVE_GENERIC
280 
/*!
 * \brief Portable kernel registered under the aligned name: converts 16-bit
 * samples to 8-bit samples by keeping the high byte of each input.
 *
 * The body uses only scalar accesses.
 *
 * \param outputVector Destination buffer; num_points values are written.
 * \param inputVector Source buffer; num_points values are read.
 * \param num_points Number of samples to convert.
 */
static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
                                                 const int16_t* inputVector,
                                                 unsigned int num_points)
{
    const int16_t* const stop = inputVector + num_points;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;

    while (src != stop) {
        /* Drop the low byte; the cast keeps the remaining high byte. */
        *dst = (int8_t)(*src >> 8);
        ++src;
        ++dst;
    }
}
293 #endif /* LV_HAVE_GENERIC */
294 
295 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition: sse2neon.h:5695
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16i_convert_8i_a_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:201
static void volk_16i_convert_8i_u_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:92
static void volk_16i_convert_8i_a_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:281
static void volk_16i_convert_8i_neon(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:243
static void volk_16i_convert_8i_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:133