Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
43 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
44 
45 #include <inttypes.h>
46 #include <stdio.h>
47 
48 #ifdef LV_HAVE_SSE2
49 
50 #include <emmintrin.h>
51 
52 static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
53  short* src0,
54  short* src1,
55  short* src2,
56  short* src3,
57  unsigned int num_points)
58 {
59  const unsigned int num_bytes = num_points * 2;
60 
61  int i = 0;
62 
63  int bound = (num_bytes >> 4);
64  int bound_copy = bound;
65  int leftovers = (num_bytes >> 1) & 7;
66 
67  __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
68  p_target = (__m128i*)target;
69  p_src0 = (__m128i*)src0;
70  p_src1 = (__m128i*)src1;
71  p_src2 = (__m128i*)src2;
72  p_src3 = (__m128i*)src3;
73 
74  __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
75 
76  while (bound_copy > 0) {
77  xmm1 = _mm_load_si128(p_src0);
78  xmm2 = _mm_load_si128(p_src1);
79  xmm3 = _mm_load_si128(p_src2);
80  xmm4 = _mm_load_si128(p_src3);
81 
82  xmm5 = _mm_setzero_si128();
83  xmm6 = _mm_setzero_si128();
84  xmm7 = xmm1;
85  xmm8 = xmm3;
86 
87  xmm1 = _mm_sub_epi16(xmm2, xmm1);
88 
89  xmm3 = _mm_sub_epi16(xmm4, xmm3);
90 
91  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
92  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
93 
94  xmm2 = _mm_and_si128(xmm5, xmm2);
95  xmm4 = _mm_and_si128(xmm6, xmm4);
96  xmm5 = _mm_andnot_si128(xmm5, xmm7);
97  xmm6 = _mm_andnot_si128(xmm6, xmm8);
98 
99  xmm5 = _mm_add_epi16(xmm2, xmm5);
100  xmm6 = _mm_add_epi16(xmm4, xmm6);
101 
102  xmm1 = _mm_xor_si128(xmm1, xmm1);
103  xmm2 = xmm5;
104  xmm5 = _mm_sub_epi16(xmm6, xmm5);
105  p_src0 += 1;
106  bound_copy -= 1;
107 
108  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
109  p_src1 += 1;
110 
111  xmm6 = _mm_and_si128(xmm1, xmm6);
112 
113  xmm1 = _mm_andnot_si128(xmm1, xmm2);
114  p_src2 += 1;
115 
116  xmm1 = _mm_add_epi16(xmm6, xmm1);
117  p_src3 += 1;
118 
119  _mm_store_si128(p_target, xmm1);
120  p_target += 1;
121  }
122 
123 
124  /*__VOLK_ASM __VOLK_VOLATILE
125  (
126  "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
127  "cmp $0, %[bound]\n\t"
128  "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
129 
130  "movaps (%[src0]), %%xmm1\n\t"
131  "movaps (%[src1]), %%xmm2\n\t"
132  "movaps (%[src2]), %%xmm3\n\t"
133  "movaps (%[src3]), %%xmm4\n\t"
134 
135  "pxor %%xmm5, %%xmm5\n\t"
136  "pxor %%xmm6, %%xmm6\n\t"
137  "movaps %%xmm1, %%xmm7\n\t"
138  "movaps %%xmm3, %%xmm8\n\t"
139  "psubw %%xmm2, %%xmm1\n\t"
140  "psubw %%xmm4, %%xmm3\n\t"
141 
142  "pcmpgtw %%xmm1, %%xmm5\n\t"
143  "pcmpgtw %%xmm3, %%xmm6\n\t"
144 
145  "pand %%xmm5, %%xmm2\n\t"
146  "pand %%xmm6, %%xmm4\n\t"
147  "pandn %%xmm7, %%xmm5\n\t"
148  "pandn %%xmm8, %%xmm6\n\t"
149 
150  "paddw %%xmm2, %%xmm5\n\t"
151  "paddw %%xmm4, %%xmm6\n\t"
152 
153  "pxor %%xmm1, %%xmm1\n\t"
154  "movaps %%xmm5, %%xmm2\n\t"
155 
156  "psubw %%xmm6, %%xmm5\n\t"
157  "add $16, %[src0]\n\t"
158  "add $-1, %[bound]\n\t"
159 
160  "pcmpgtw %%xmm5, %%xmm1\n\t"
161  "add $16, %[src1]\n\t"
162 
163  "pand %%xmm1, %%xmm6\n\t"
164 
165  "pandn %%xmm2, %%xmm1\n\t"
166  "add $16, %[src2]\n\t"
167 
168  "paddw %%xmm6, %%xmm1\n\t"
169  "add $16, %[src3]\n\t"
170 
171  "movaps %%xmm1, (%[target])\n\t"
172  "addw $16, %[target]\n\t"
173  "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
174 
175  "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
176  :
177  :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
178  [src3]"r"(src3), [target]"r"(target)
179  :
180  );
181  */
182 
183  short temp0 = 0;
184  short temp1 = 0;
185  for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
186  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
187  temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
188  target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
189  }
190  return;
191 }
192 
193 #endif /*LV_HAVE_SSE2*/
194 
195 #ifdef LV_HAVE_NEON
196 
197 #include <arm_neon.h>
198 
/*
 * NEON kernel: for every index i, writes to target[i] the "max star" of
 * src0[i], src1[i], src2[i] and src3[i].
 *
 * In the vector loop, of a pair (a, b) the lane from a is kept when the
 * 16-bit wrapped difference (a - b) is >= 0, otherwise the lane from b is
 * kept. The scalar tail uses (a - b) > 0; the two rules only differ when
 * a == b, where either choice yields the same value.
 *
 * target      output buffer, num_points shorts
 * src0..src3  input buffers, num_points shorts each
 * num_points  number of elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;
    const int16x8_t zeros = vdupq_n_s16(0);

    for (i = 0; i < eighth_points; ++i) {
        const int16x8_t v0 = vld1q_s16(src0);
        const int16x8_t v1 = vld1q_s16(src1);
        const int16x8_t v2 = vld1q_s16(src2);
        const int16x8_t v3 = vld1q_s16(src3);

        /* vcgeq_s16 yields an all-ones lane where the difference is >= 0;
         * vbslq_s16 then takes that lane from the first operand and the
         * remaining lanes from the second. */
        const int16x8_t max01 =
            vbslq_s16(vcgeq_s16(vsubq_s16(v0, v1), zeros), v0, v1);
        const int16x8_t max23 =
            vbslq_s16(vcgeq_s16(vsubq_s16(v2, v3), zeros), v2, v3);

        /* Final round between the two pair winners. */
        vst1q_s16(
            target,
            vbslq_s16(vcgeq_s16(vsubq_s16(max01, max23), zeros), max01, max23));

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail for the num_points % 8 leftover elements. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short max01 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        const short max23 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(max01 - max23) > 0) ? max01 : max23;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
260 #endif /* LV_HAVE_NEON */
261 
262 
263 #ifdef LV_HAVE_GENERIC
/*
 * Portable kernel: for every index i, writes to target[i] the "max star" of
 * src0[i], src1[i], src2[i] and src3[i].
 *
 * Of a pair (a, b), a is chosen when the difference (a - b), truncated to a
 * short, is strictly positive; otherwise b is chosen. The rule is applied to
 * (src0, src1), to (src2, src3), and finally to the two intermediate winners.
 * Because the comparison is made on the truncated difference, values that are
 * more than 32767 apart compare as if the arithmetic wrapped around.
 *
 * target      output buffer, num_points shorts
 * src0..src3  input buffers, num_points shorts each
 * num_points  number of elements to process; 0 leaves target untouched
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    /* Iterate over num_points directly with an unsigned index: converting to
     * a byte count and back (num_points * 2 >> 1) drops the top bit of the
     * count when the multiplication wraps. */
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
    }
}
285 
286 #endif /*LV_HAVE_GENERIC*/
287 
288 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
Definition: sse2neon.h:3156
FORCE_INLINE __m128i _mm_setzero_si128()
Definition: sse2neon.h:5339
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2969
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:6458
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:6072
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3367
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16i_x4_quad_max_star_16i_generic(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:264
static void volk_16i_x4_quad_max_star_16i_neon(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:199
static void volk_16i_x4_quad_max_star_16i_a_sse2(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:52
for i
Definition: volk_config_fixed.tmpl.h:13