#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
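
/*
 * volk_16i_x5_add_quad_16i_x4: adds the common input vector src0 to each of the
 * four vectors src1..src4 and writes the four sums to target0..target3, i.e.
 * target0[i] = src0[i] + src1[i], target1[i] = src0[i] + src2[i],
 * target2[i] = src0[i] + src3[i], target3[i] = src0[i] + src4[i].
 *
 * Minimal usage sketch calling the generic implementation below directly (in
 * normal use the dispatcher from <volk/volk.h> is preferred; the buffer
 * contents here are made-up values for illustration only):
 *
 *   short src0[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
 *   short src1[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
 *   short src2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
 *   short src3[8] = { 3, 3, 3, 3, 3, 3, 3, 3 };
 *   short src4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
 *   short t0[8], t1[8], t2[8], t3[8];
 *
 *   volk_16i_x5_add_quad_16i_x4_generic(t0, t1, t2, t3,
 *                                       src0, src1, src2, src3, src4, 8);
 *   // t0[i] == 1 + src1[i]; t1[i] == 3, t2[i] == 4, t3[i] == 5 for every i.
 */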
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(
    short* target0, short* target1, short* target2, short* target3,
    short* src0, short* src1, short* src2, short* src3, short* src4,
    unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
        *p_src3, *p_src4;

    p_target0 = (__m128i*)target0;
    p_target1 = (__m128i*)target1;
    p_target2 = (__m128i*)target2;
    p_target3 = (__m128i*)target3;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    p_src4 = (__m128i*)src4;

    int i = 0;

    int bound = (num_bytes >> 4);
    int leftovers = (num_bytes >> 1) & 7;

    // Vector loop: load 8 shorts from each source, add src0 to each of the
    // other four sources, and store the four sums.
    for (; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(p_src1);
        xmm2 = _mm_load_si128(p_src2);
        xmm3 = _mm_load_si128(p_src3);
        xmm4 = _mm_load_si128(p_src4);

        p_src0 += 1; p_src1 += 1; p_src2 += 1; p_src3 += 1; p_src4 += 1;

        xmm1 = _mm_add_epi16(xmm0, xmm1);
        xmm2 = _mm_add_epi16(xmm0, xmm2);
        xmm3 = _mm_add_epi16(xmm0, xmm3);
        xmm4 = _mm_add_epi16(xmm0, xmm4);

        _mm_store_si128(p_target0, xmm1);
        _mm_store_si128(p_target1, xmm2);
        _mm_store_si128(p_target2, xmm3);
        _mm_store_si128(p_target3, xmm4);

        p_target0 += 1; p_target1 += 1; p_target2 += 1; p_target3 += 1;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_x5_add_quad_16i_x4_neon(
    short* target0, short* target1, short* target2, short* target3,
    short* src0, short* src1, short* src2, short* src3, short* src4,
    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int number = 0;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;

    // Vector loop: 8 shorts per iteration.
    for (number = 0; number < eighth_points; ++number) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        src4_vec = vld1q_s16(src4);

        target0_vec = vaddq_s16(src0_vec, src1_vec);
        target1_vec = vaddq_s16(src0_vec, src2_vec);
        target2_vec = vaddq_s16(src0_vec, src3_vec);
        target3_vec = vaddq_s16(src0_vec, src4_vec);

        vst1q_s16(target0, target0_vec);
        vst1q_s16(target1, target1_vec);
        vst1q_s16(target2, target2_vec);
        vst1q_s16(target3, target3_vec);

        // Advance every pointer by one vector of 8 shorts.
        src0 += 8; src1 += 8; src2 += 8; src3 += 8; src4 += 8;
        target0 += 8; target1 += 8; target2 += 8; target3 += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    for (number = eighth_points * 8; number < num_points; ++number) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x5_add_quad_16i_x4_generic(
    short* target0, short* target1, short* target2, short* target3,
    short* src0, short* src1, short* src2, short* src3, short* src4,
    unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1;

    for (i = 0; i < bound; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H */