#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
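/* This header provides several architecture-specific implementations of one
 * kernel: the dot product of two vectors of complex 16-bit integers
 * (lv_16sc_t), accumulated with saturating adds. A usage sketch follows; it
 * is illustrative only -- volk_16ic_x2_dot_prod_16ic() is the generated VOLK
 * dispatcher and volk_malloc()/volk_get_alignment()/volk_free() are the
 * usual VOLK buffer helpers, none of which are defined in this file:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 1000;
 *   lv_16sc_t* a = (lv_16sc_t*)volk_malloc(n * sizeof(lv_16sc_t), volk_get_alignment());
 *   lv_16sc_t* b = (lv_16sc_t*)volk_malloc(n * sizeof(lv_16sc_t), volk_get_alignment());
 *   lv_16sc_t result;
 *   // ... fill a and b ...
 *   volk_16ic_x2_dot_prod_16ic(&result, a, b, n);
 *   volk_free(a);
 *   volk_free(b);
 */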
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        // accumulate with 16-bit saturation so partial sums clamp instead of wrapping
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /* LV_HAVE_GENERIC */
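/* Every SIMD variant below computes the same quantity as the generic loop:
 * the sum over n of a[n] * b[n], where each complex product expands to
 * (a.r*b.r - a.i*b.i) + j(a.r*b.i + a.i*b.r). A minimal scalar sketch of the
 * saturating accumulation step, assuming only <stdint.h> (sat_add16 here is
 * a stand-in for the library's sat_adds16i):
 *
 *   #include <stdint.h>
 *
 *   static int16_t sat_add16(int16_t a, int16_t b)
 *   {
 *       int32_t s = (int32_t)a + (int32_t)b;
 *       if (s > INT16_MAX) return INT16_MAX;
 *       if (s < INT16_MIN) return INT16_MIN;
 *       return (int16_t)s;
 *   }
 *
 *   // one accumulation step for element n (products truncate to 16 bits,
 *   // the accumulation itself saturates):
 *   acc_r = sat_add16(acc_r, (int16_t)(ar * br - ai * bi));
 *   acc_i = sat_add16(acc_i, (int16_t)(ar * bi + ai * br));
 */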
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a = [a3.i, a3.r, a2.i, a2.r, a1.i, a1.r, a0.i, a0.r] (16-bit lanes)
            a = _mm_load_si128((__m128i*)_in_a);
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, ...

            c_sr = _mm_srli_si128(c, 2);     // align each a.i*b.i with its a.r*b.r partner
            real = _mm_subs_epi16(c, c_sr);  // a.r*b.r - a.i*b.i in the even lanes

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl);  // a.i*b.r in the odd lanes
            imag2 = _mm_mullo_epi16(b, a_sl);  // b.i*a.r in the odd lanes

            imag = _mm_adds_epi16(imag1, imag2);  // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector, result);

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
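/* Worked lane example for the shift/mask trick used above and in the AVX2
 * kernels (one 128-bit register holding four complex values; lane labels are
 * illustrative):
 *
 *   a    = [a3.i a3.r a2.i a2.r a1.i a1.r a0.i a0.r]   // 16-bit lanes, high to low
 *   c    = mullo(a, b)        // lane-wise products: a0.r*b0.r, a0.i*b0.i, ...
 *   c_sr = srli(c, 2 bytes)   // drops each a.i*b.i onto its a.r*b.r partner
 *   real = subs(c, c_sr)      // a.r*b.r - a.i*b.i lands in the even lanes
 *
 * The odd lanes of `real` (and the even lanes of `imag`) hold garbage, which
 * is why the accumulators are ANDed with mask_real/mask_imag before being
 * ORed into a single result register.
 */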
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // same arithmetic as the aligned kernel above, with unaligned loads
            a = _mm_loadu_si128((__m128i*)_in_a);
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b);

            c_sr = _mm_srli_si128(c, 2);
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl);
            imag2 = _mm_mullo_epi16(b, a_sl);

            imag = _mm_adds_epi16(imag1, imag2);

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        // dotProductVector is stack-aligned, so an aligned store is safe here
        _mm_store_si128((__m128i*)dotProductVector, result);

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
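/* The _a_ (aligned) and _u_ (unaligned) variants differ only in their memory
 * intrinsics: _a_ kernels may assume VOLK-aligned buffers and use
 * _mm_load_si128/_mm_store_si128, while _u_ kernels accept arbitrary
 * pointers via _mm_loadu_si128/_mm_storeu_si128. The same convention applies
 * to the AVX2 pair below; the VOLK dispatcher selects the aligned kernel
 * only when the runtime pointers allow it.
 */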
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        // the byte patterns repeat once per complex lane, eight lanes per register
        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2);  // shift right by one 16-bit lane
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2);

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector, result);

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
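/* Note on the 256-bit shifts used above: _mm256_srli_si256 and
 * _mm256_slli_si256 shift each 128-bit half of the YMM register
 * independently. That is harmless here because the real/imag layout repeats
 * every 4 bytes and no complex pair straddles a 128-bit boundary; the only
 * lanes that see zeros shifted in are lanes whose results are discarded by
 * mask_real/mask_imag anyway.
 */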
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            // same arithmetic as the unaligned kernel above, with aligned loads/stores
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2);
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2);

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector, result);

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;

    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // for 2-lane vectors, the 1st lane holds the real parts,
        // the 2nd lane holds the imaginary parts
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr);  // a0.r|a1.r|a2.r|a3.r || a0.i|a1.i|a2.i|a3.i
            b_val = vld2_s16((int16_t*)b_ptr);
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // multiply real*real and imag*imag to get the real result
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);  // a.r*b.r
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);  // a.i*b.i

            // multiply the cross terms to get the imaginary result
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);  // a.r*b.i
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);  // a.i*b.r

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
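/* The NEON kernels reach the same result by a different route: vld2_s16
 * de-interleaves on load, so val[0] holds four real parts and val[1] four
 * imaginary parts, and the complex product becomes plain lane-wise
 * multiplies with no shifting or masking. Layout sketch for the first four
 * elements of in_a:
 *
 *   memory:        a0.r a0.i a1.r a1.i a2.r a2.i a3.r a3.i
 *   a_val.val[0] = [a0.r a1.r a2.r a3.r]   // real lanes
 *   a_val.val[1] = [a0.i a1.i a2.i a3.i]   // imaginary lanes
 */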
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;

    // for 2-lane vectors, the 1st lane holds the real parts,
    // the 2nd lane holds the imaginary parts
    int16x4x2_t a_val, b_val, tmp, accumulator;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);  // a.r*b.r
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);  // a.i*b.r

        // use multiply-accumulate/subtract to finish the complex product
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);  // - a.i*b.i
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);  // + a.r*b.i

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
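/* The _vma variants fold the cross multiplies into fused multiply-accumulate
 * instructions (vmla_s16/vmls_s16). The _optvma kernel below goes one step
 * further: it multiply-accumulates straight into two independent accumulator
 * pairs, removing the inter-instruction data dependency of a single
 * accumulator chain. Note that vmla_s16/vmls_s16 wrap rather than saturate,
 * so these variants trade some of the generic kernel's strict saturation
 * semantics for speed.
 */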
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                          const lv_16sc_t* in_a,
                                                          const lv_16sc_t* in_b,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;

    // for 2-lane vectors, the 1st lane holds the real parts,
    // the 2nd lane holds the imaginary parts
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16ic_x2_dot_prod_16ic_H */
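/* Self-test sketch (hypothetical, not part of the upstream header): compare
 * a SIMD kernel against the generic reference on small random data. Small
 * input values keep every partial sum inside the int16_t range, where all
 * variants must agree bit-for-bit; near saturation the variants may
 * legitimately differ in where clamping occurs.
 *
 *   #include <stdio.h>
 *   #include <stdlib.h>
 *
 *   int main(void)
 *   {
 *       enum { N = 25 };  // not a multiple of 4 or 8, so the tail loops run too
 *       __VOLK_ATTR_ALIGNED(32) lv_16sc_t a[N], b[N];
 *       lv_16sc_t ref, simd;
 *       int i;
 *       for (i = 0; i < N; i++) {
 *           a[i] = lv_cmake((int16_t)(rand() % 8), (int16_t)(rand() % 8));
 *           b[i] = lv_cmake((int16_t)(rand() % 8), (int16_t)(rand() % 8));
 *       }
 *       volk_16ic_x2_dot_prod_16ic_generic(&ref, a, b, N);
 *   #ifdef LV_HAVE_SSE2
 *       volk_16ic_x2_dot_prod_16ic_a_sse2(&simd, a, b, N);
 *   #else
 *       simd = ref;
 *   #endif
 *       printf("%s\n", (lv_creal(ref) == lv_creal(simd) && lv_cimag(ref) == lv_cimag(simd))
 *                          ? "OK"
 *                          : "MISMATCH");
 *       return 0;
 *   }
 */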