Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_x5_add_quad_16i_x4.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
47 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
48 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
49 
50 #include <inttypes.h>
51 #include <stdio.h>
52 
53 #ifdef LV_HAVE_SSE2
54 #include <emmintrin.h>
55 #include <xmmintrin.h>
56 
/*
 * Aligned SSE2 kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for i in [0, num_points)
 *
 * target0..target3  output buffers, num_points shorts each
 * src0              common addend
 * src1..src4        per-output addends
 * num_points        number of 16-bit elements to process
 *
 * The vector loop uses _mm_load_si128 / _mm_store_si128, so all nine buffers
 * must be 16-byte aligned. Vector lanes are added with _mm_add_epi16; the
 * scalar tail computes the sum in int and converts the result back to short.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    /* Work in element counts directly. Going through a byte count
     * (num_points * 2) wraps for num_points >= 2^31 and loses points. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    __m128i* p_target0 = (__m128i*)target0;
    __m128i* p_target1 = (__m128i*)target1;
    __m128i* p_target2 = (__m128i*)target2;
    __m128i* p_target3 = (__m128i*)target3;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    const __m128i* p_src4 = (const __m128i*)src4;

    /* Eight 16-bit lanes per iteration. All loads are issued before any
     * store, so every output of an iteration sees the same src0 values. */
    for (i = 0; i < eighth_points; ++i) {
        const __m128i xmm0 = _mm_load_si128(p_src0++);
        const __m128i xmm1 = _mm_load_si128(p_src1++);
        const __m128i xmm2 = _mm_load_si128(p_src2++);
        const __m128i xmm3 = _mm_load_si128(p_src3++);
        const __m128i xmm4 = _mm_load_si128(p_src4++);

        _mm_store_si128(p_target0++, _mm_add_epi16(xmm0, xmm1));
        _mm_store_si128(p_target1++, _mm_add_epi16(xmm0, xmm2));
        _mm_store_si128(p_target2++, _mm_add_epi16(xmm0, xmm3));
        _mm_store_si128(p_target3++, _mm_add_epi16(xmm0, xmm4));
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
164 #endif /*LV_HAVE_SSE2*/
165 
166 #ifdef LV_HAVE_NEON
167 #include <arm_neon.h>
168 
/*
 * NEON kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for i in [0, num_points)
 *
 * Blocks of eight elements go through vld1q_s16 / vaddq_s16 / vst1q_s16;
 * whatever is left over (num_points % 8) is handled one element at a time.
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane vectors. */
    const unsigned int vector_len = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vector_len; idx += 8) {
        /* Load every operand before storing anything. */
        const int16x8_t common = vld1q_s16(src0 + idx);
        const int16x8_t addend1 = vld1q_s16(src1 + idx);
        const int16x8_t addend2 = vld1q_s16(src2 + idx);
        const int16x8_t addend3 = vld1q_s16(src3 + idx);
        const int16x8_t addend4 = vld1q_s16(src4 + idx);

        const int16x8_t sum0 = vaddq_s16(common, addend1);
        const int16x8_t sum1 = vaddq_s16(common, addend2);
        const int16x8_t sum2 = vaddq_s16(common, addend3);
        const int16x8_t sum3 = vaddq_s16(common, addend4);

        vst1q_s16(target0 + idx, sum0);
        vst1q_s16(target1 + idx, sum1);
        vst1q_s16(target2 + idx, sum2);
        vst1q_s16(target3 + idx, sum3);
    }

    /* Scalar remainder; idx continues from where the vector loop stopped. */
    for (; idx < num_points; ++idx) {
        target0[idx] = src0[idx] + src1[idx];
        target1[idx] = src0[idx] + src2[idx];
        target2[idx] = src0[idx] + src3[idx];
        target3[idx] = src0[idx] + src4[idx];
    }
}
219 
220 #endif /* LV_HAVE_NEON */
221 
222 #ifdef LV_HAVE_GENERIC
223 
/*
 * Portable reference kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for i in [0, num_points)
 *
 * target0..target3  output buffers, num_points shorts each
 * src0              common addend
 * src1..src4        per-output addends
 * num_points        number of 16-bit elements to process
 *
 * Each sum is computed in int and converted back to short on assignment.
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    unsigned int i;

    /* Iterate over the element count itself with an unsigned index. Deriving
     * the bound from a byte count (num_points * 2) wraps for large inputs. */
    for (i = 0; i < num_points; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
248 
249 #endif /* LV_HAVE_GENERIC */
250 
251 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2969
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16i_x5_add_quad_16i_x4_a_sse2(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:57
static void volk_16i_x5_add_quad_16i_x4_neon(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:169
static void volk_16i_x5_add_quad_16i_x4_generic(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:224
for i
Definition: volk_config_fixed.tmpl.h:13