Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_8i_convert_16i_u_H
41 #define INCLUDED_volk_8i_convert_16i_u_H
42 
43 #include <inttypes.h>
44 #include <stdio.h>
45 
46 #ifdef LV_HAVE_AVX2
47 #include <immintrin.h>
48 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2 kernel using unaligned loads and stores).
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Sixteen samples are handled per vector iteration; whatever is left over
 * (num_points % 16) is converted by the scalar loop at the end.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        /* Load 16 bytes, sign-extend each one into a 16-bit lane. */
        const __m128i packed = _mm_loadu_si128((const __m128i*)src);
        const __m256i widened = _mm256_cvtepi8_epi16(packed);

        /* A left shift by 8 bits is the multiplication by 256. */
        _mm256_storeu_si256((__m256i*)dst, _mm256_slli_epi16(widened, 8));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
76 #endif /* LV_HAVE_AVX2 */
77 
78 
79 #ifdef LV_HAVE_SSE4_1
80 #include <smmintrin.h>
81 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1 kernel using unaligned loads and stores).
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Each iteration loads 16 input bytes once and writes two 128-bit output
 * vectors (8 samples each). The num_points % 16 leftover samples are
 * converted by the scalar loop at the end.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m128i* dst = (__m128i*)outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        const __m128i packed = _mm_loadu_si128(src + block);

        /* The widening conversion only reads the low 8 bytes, so the upper
         * 8 bytes are first moved down with a byte-wise right shift. */
        const __m128i lowHalf = _mm_cvtepi8_epi16(packed);
        const __m128i highHalf = _mm_cvtepi8_epi16(_mm_srli_si128(packed, 8));

        /* A left shift by 8 bits is the multiplication by 256. */
        _mm_storeu_si128(dst + 2 * block, _mm_slli_epi16(lowHalf, 8));
        _mm_storeu_si128(dst + 2 * block + 1, _mm_slli_epi16(highHalf, 8));
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
117 #endif /* LV_HAVE_SSE4_1 */
118 
119 
120 #ifdef LV_HAVE_GENERIC
121 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable scalar implementation).
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert; 0 leaves the output untouched.
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        /* Widen first, then scale: the product always fits in int16_t
         * (range -32768 .. 32512). */
        const int16_t widened = (int16_t)inputVector[idx];
        outputVector[idx] = (int16_t)(widened * 256);
    }
}
134 #endif /* LV_HAVE_GENERIC */
135 
136 
137 #endif /* INCLUDED_volk_8i_convert_16i_u_H */
138 
139 
140 #ifndef INCLUDED_volk_8i_convert_16i_a_H
141 #define INCLUDED_volk_8i_convert_16i_a_H
142 
143 #include <inttypes.h>
144 #include <stdio.h>
145 
146 #ifdef LV_HAVE_AVX2
147 #include <immintrin.h>
148 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2 kernel using aligned loads and stores).
 *
 * outputVector: destination for num_points int16_t values; written with
 *               _mm256_store_si256, which requires 32-byte alignment.
 * inputVector:  source of num_points int8_t values; read with
 *               _mm_load_si128, which requires 16-byte alignment.
 * num_points:   number of samples to convert.
 *
 * Sixteen samples are handled per vector iteration; the num_points % 16
 * leftover samples are converted by the scalar loop at the end.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const __m128i* src = (const __m128i*)inputVector;
    __m256i* dst = (__m256i*)outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        /* Sign-extend 16 bytes into sixteen 16-bit lanes. */
        const __m256i widened = _mm256_cvtepi8_epi16(_mm_load_si128(src + block));

        /* A left shift by 8 bits is the multiplication by 256. */
        _mm256_store_si256(dst + block, _mm256_slli_epi16(widened, 8));
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
176 #endif /* LV_HAVE_AVX2 */
177 
178 
179 #ifdef LV_HAVE_SSE4_1
180 #include <smmintrin.h>
181 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1 kernel using aligned loads and stores).
 *
 * outputVector: destination for num_points int16_t values; written with
 *               _mm_store_si128, which requires 16-byte alignment.
 * inputVector:  source of num_points int8_t values; read with
 *               _mm_load_si128, which requires 16-byte alignment.
 * num_points:   number of samples to convert.
 *
 * Each iteration loads 16 input bytes once and writes two 128-bit output
 * vectors (8 samples each). The num_points % 16 leftover samples are
 * converted by the scalar loop at the end.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < blockCount; block++) {
        const __m128i packed = _mm_load_si128((const __m128i*)src);
        __m128i scaled;

        /* Lower 8 bytes: sign-extend, then shift left by 8 (times 256). */
        scaled = _mm_slli_epi16(_mm_cvtepi8_epi16(packed), 8);
        _mm_store_si128((__m128i*)dst, scaled);

        /* Upper 8 bytes: move them down by 8 bytes, then repeat. */
        scaled = _mm_slli_epi16(_mm_cvtepi8_epi16(_mm_srli_si128(packed, 8)), 8);
        _mm_store_si128((__m128i*)(dst + 8), scaled);

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (idx = blockCount * 16; idx < num_points; idx++) {
        outputVector[idx] = (int16_t)(inputVector[idx]) * 256;
    }
}
217 #endif /* LV_HAVE_SSE4_1 */
218 
219 
220 #ifdef LV_HAVE_GENERIC
221 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable scalar implementation, registered under the aligned name).
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert; 0 leaves the output untouched.
 *
 * The body performs plain scalar accesses only, so it places no alignment
 * requirement of its own on either buffer.
 */
static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
                                                 const int8_t* inputVector,
                                                 unsigned int num_points)
{
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        /* The int8_t operand is promoted to int before the multiply, and
         * the product (-32768 .. 32512) always fits in int16_t. */
        *dst = (int16_t)(*src * 256);
        ++dst;
        ++src;
        --remaining;
    }
}
234 #endif /* LV_HAVE_GENERIC */
235 
236 
237 #ifdef LV_HAVE_NEON
238 #include <arm_neon.h>
239 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (ARM NEON kernel).
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Eight samples are handled per vector iteration; the num_points % 8
 * leftover samples are converted by the scalar loop at the end.
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int vectorizedPoints = (num_points / 8) * 8;
    unsigned int idx;

    /* The widening step (vmovl_s8) is a separate move from the shift; the
     * original author suspected hand-written assembly could do better. */
    for (idx = 0; idx < vectorizedPoints; idx += 8) {
        const int8x8_t narrow = vld1_s8(inputVector + idx);
        const int16x8_t wide = vmovl_s8(narrow);

        /* A left shift by 8 bits is the multiplication by 256. */
        vst1q_s16(outputVector + idx, vshlq_n_s16(wide, 8));
    }

    /* Scalar tail; idx already equals vectorizedPoints here. */
    for (; idx < num_points; idx++) {
        outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
    }
}
270 #endif /* LV_HAVE_NEON */
271 
272 
273 #ifdef LV_HAVE_ORC
/*
 * Externally defined conversion routine; its definition is not part of this
 * header. NOTE(review): judging by the name and the LV_HAVE_ORC guard this is
 * presumably the ORC-compiled kernel - confirm against the build.
 */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*
 * Entry point that forwards its three arguments, unchanged and in the same
 * order, to volk_8i_convert_16i_a_orc_impl().
 *
 * outputVector: destination for num_points int16_t values.
 * inputVector:  source of num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
284 #endif /* LV_HAVE_ORC */
285 
286 
287 #endif /* INCLUDED_volk_8i_convert_16i_a_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5544
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8i_convert_16i_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:122
static void volk_8i_convert_16i_a_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:222
static void volk_8i_convert_16i_neon(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:240