34 #ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
35 #define INCLUDED_volk_16ic_x2_multiply_16ic_H
40 #ifdef LV_HAVE_GENERIC
45 unsigned int num_points)
48 for (n = 0; n < num_points; n++) {
49 result[n] = in_a[n] * in_b[n];
57 #include <emmintrin.h>
62 unsigned int num_points)
64 const unsigned int sse_iters = num_points / 4;
65 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
69 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
71 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
78 for (number = 0; number < sse_iters; number++) {
108 for (number = sse_iters * 4; number < num_points; ++number) {
109 *_out++ = (*_in_a++) * (*_in_b++);
116 #include <emmintrin.h>
121 unsigned int num_points)
123 const unsigned int sse_iters = num_points / 4;
124 __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
128 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
130 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
137 for (number = 0; number < sse_iters; number++) {
167 for (number = sse_iters * 4; number < num_points; ++number) {
168 *_out++ = (*_in_a++) * (*_in_b++);
175 #include <immintrin.h>
177 static inline void volk_16ic_x2_multiply_16ic_u_avx2(
lv_16sc_t* out,
180 unsigned int num_points)
182 unsigned int number = 0;
183 const unsigned int avx2_points = num_points / 8;
189 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
191 const __m256i mask_imag = _mm256_set_epi8(0xFF,
223 const __m256i mask_real = _mm256_set_epi8(0,
256 for (; number < avx2_points; number++) {
257 a = _mm256_loadu_si256(
259 b = _mm256_loadu_si256(
261 c = _mm256_mullo_epi16(a, b);
263 c_sr = _mm256_srli_si256(c, 2);
265 real = _mm256_subs_epi16(c, c_sr);
266 real = _mm256_and_si256(
269 b_sl = _mm256_slli_si256(b, 2);
270 a_sl = _mm256_slli_si256(a, 2);
272 imag1 = _mm256_mullo_epi16(a, b_sl);
273 imag2 = _mm256_mullo_epi16(b, a_sl);
275 imag = _mm256_adds_epi16(imag1, imag2);
276 imag = _mm256_and_si256(imag, mask_imag);
278 result = _mm256_or_si256(real, imag);
280 _mm256_storeu_si256((__m256i*)_out, result);
287 number = avx2_points * 8;
288 for (; number < num_points; number++) {
289 *_out++ = (*_in_a++) * (*_in_b++);
296 #include <immintrin.h>
298 static inline void volk_16ic_x2_multiply_16ic_a_avx2(
lv_16sc_t* out,
301 unsigned int num_points)
303 unsigned int number = 0;
304 const unsigned int avx2_points = num_points / 8;
310 __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
312 const __m256i mask_imag = _mm256_set_epi8(0xFF,
344 const __m256i mask_real = _mm256_set_epi8(0,
377 for (; number < avx2_points; number++) {
378 a = _mm256_load_si256(
380 b = _mm256_load_si256(
382 c = _mm256_mullo_epi16(a, b);
384 c_sr = _mm256_srli_si256(c, 2);
386 real = _mm256_subs_epi16(c, c_sr);
387 real = _mm256_and_si256(
390 b_sl = _mm256_slli_si256(b, 2);
391 a_sl = _mm256_slli_si256(a, 2);
393 imag1 = _mm256_mullo_epi16(a, b_sl);
394 imag2 = _mm256_mullo_epi16(b, a_sl);
396 imag = _mm256_adds_epi16(imag1, imag2);
397 imag = _mm256_and_si256(imag, mask_imag);
399 result = _mm256_or_si256(real, imag);
401 _mm256_store_si256((__m256i*)_out, result);
408 number = avx2_points * 8;
409 for (; number < num_points; number++) {
410 *_out++ = (*_in_a++) * (*_in_b++);
417 #include <arm_neon.h>
422 unsigned int num_points)
426 unsigned int quarter_points = num_points / 4;
427 int16x4x2_t a_val, b_val, c_val;
428 int16x4x2_t tmp_real, tmp_imag;
429 unsigned int number = 0;
431 for (number = 0; number < quarter_points; ++number) {
432 a_val = vld2_s16((int16_t*)a_ptr);
433 b_val = vld2_s16((int16_t*)b_ptr);
439 tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
441 tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
445 tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
447 tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
450 c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
451 c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
452 vst2_s16((int16_t*)out, c_val);
459 for (number = quarter_points * 4; number < num_points; number++) {
460 *out++ = (*a_ptr++) * (*b_ptr++);
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:5140
FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
Definition: sse2neon.h:5604
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3068
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5001
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:6167
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_multiply_16ic.h:59
static void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_multiply_16ic.h:118
static void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t *result, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_multiply_16ic.h:42
static void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_multiply_16ic.h:419
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
short complex lv_16sc_t
Definition: volk_complex.h:71