40 #ifndef INCLUDED_volk_16i_convert_8i_u_H
41 #define INCLUDED_volk_16i_convert_8i_u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (shift right by 8), 32 samples per
 * iteration with AVX2. Uses unaligned loads and stores.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process; the remainder that does not
 *               fill a 32-sample group is handled by a scalar loop.
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;
    const int16_t* inputPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < thirtysecondPoints; number++) {
        /* Two consecutive groups of 16 samples. */
        __m256i inputVal1 = _mm256_loadu_si256((const __m256i*)inputPtr);
        __m256i inputVal2 = _mm256_loadu_si256((const __m256i*)(inputPtr + 16));
        __m256i ret;

        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* After the shift every value already fits in int8_t, so the
         * saturating pack does not alter any sample. */
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        /* The pack works per 128-bit lane; reorder the 64-bit quarters as
         * 0,2,1,3 (selector 0xD8) to restore sample order. */
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);

        inputPtr += 32;
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples left over after the 32-wide groups. */
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (shift right by 8), 16 samples per
 * iteration with SSE2. Uses unaligned loads and stores.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process; the remainder that does not
 *               fill a 16-sample group is handled by a scalar loop.
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const int16_t* inputPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < sixteenthPoints; number++) {
        /* Two consecutive groups of 8 samples. */
        __m128i inputVal1 = _mm_loadu_si128((const __m128i*)inputPtr);
        __m128i inputVal2 = _mm_loadu_si128((const __m128i*)(inputPtr + 8));
        __m128i ret;

        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        /* After the shift every value already fits in int8_t, so the
         * saturating pack does not alter any sample. */
        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);

        inputPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples left over after the 16-wide groups. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar conversion of 16-bit signed samples to 8-bit signed
 * samples: each output is the input shifted right by 8 and cast to int8_t.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process.
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_GENERIC */
149 #ifndef INCLUDED_volk_16i_convert_8i_a_H
150 #define INCLUDED_volk_16i_convert_8i_a_H
152 #include <inttypes.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (shift right by 8), 32 samples per
 * iteration with AVX2. Uses aligned loads and stores, so both buffers
 * must be 32-byte aligned.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process; the remainder that does not
 *               fill a 32-sample group is handled by a scalar loop.
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;
    const int16_t* inputPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < thirtysecondPoints; number++) {
        /* Two consecutive groups of 16 samples. */
        __m256i inputVal1 = _mm256_load_si256((const __m256i*)inputPtr);
        __m256i inputVal2 = _mm256_load_si256((const __m256i*)(inputPtr + 16));
        __m256i ret;

        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* After the shift every value already fits in int8_t, so the
         * saturating pack does not alter any sample. */
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        /* The pack works per 128-bit lane; reorder the 64-bit quarters as
         * 0,2,1,3 (selector 0xD8) to restore sample order. */
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret);

        inputPtr += 32;
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples left over after the 32-wide groups. */
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (shift right by 8), 16 samples per
 * iteration with SSE2. Uses aligned loads and stores, so both buffers
 * must be 16-byte aligned.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process; the remainder that does not
 *               fill a 16-sample group is handled by a scalar loop.
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const int16_t* inputPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < sixteenthPoints; number++) {
        /* Two consecutive groups of 8 samples. */
        __m128i inputVal1 = _mm_load_si128((const __m128i*)inputPtr);
        __m128i inputVal2 = _mm_load_si128((const __m128i*)(inputPtr + 8));
        __m128i ret;

        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        /* After the shift every value already fits in int8_t, so the
         * saturating pack does not alter any sample. */
        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, ret);

        inputPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail for the samples left over after the 16-wide groups. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value, 16 samples per iteration with NEON.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process; the remainder that does not
 *               fill a 16-sample group is handled by a scalar loop.
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int sixteenth_points = num_points / 16;
    const int16_t* inputVectorPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < sixteenth_points; number++) {
        /* Two consecutive groups of 8 samples. */
        const int16x8_t inputVal0 = vld1q_s16(inputVectorPtr);
        const int16x8_t inputVal1 = vld1q_s16(inputVectorPtr + 8);

        /* Shift right by 8 and narrow each lane to 8 bits. */
        const int8x8_t outputVal0 = vshrn_n_s16(inputVal0, 8);
        const int8x8_t outputVal1 = vshrn_n_s16(inputVal1, 8);

        vst1q_s8(outputVectorPtr, vcombine_s8(outputVal0, outputVal1));

        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    /* Scalar tail; both pointers already sit just past the vector part. */
    for (number = sixteenth_points * 16; number < num_points; number++) {
        *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ >> 8);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar conversion of 16-bit signed samples to 8-bit signed
 * samples: each output is the input shifted right by 8 and cast to int8_t.
 * This is the variant listed under the aligned ("_a") kernel names; the
 * scalar code itself places no alignment requirement on the buffers.
 *
 * outputVector: destination for num_points int8_t results.
 * inputVector:  num_points int16_t samples to convert.
 * num_points:   number of samples to process.
 */
static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
                                                 const int16_t* inputVector,
                                                 unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
#endif /* LV_HAVE_GENERIC */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition: sse2neon.h:5695
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5030
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16i_convert_8i_a_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:201
static void volk_16i_convert_8i_u_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:92
static void volk_16i_convert_8i_a_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:281
static void volk_16i_convert_8i_neon(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:243
static void volk_16i_convert_8i_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:133