#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
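
/*
 * For each point i this kernel computes
 *
 *   target[i] = max(max(src0[i], src1[i]), max(src2[i], src3[i]))
 *
 * where each "max" picks its operand by the sign of the 16-bit
 * difference, matching the scalar tails below.
 *
 * Illustrative usage sketch (buffer names and the size 64 are example
 * values, not part of the original header):
 *
 *   short t[64], a[64], b[64], c[64], d[64];
 *   // ... fill a, b, c, d ...
 *   volk_16i_x4_quad_max_star_16i_generic(t, a, b, c, d, 64);
 */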

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);         /* number of full 8-sample vectors */
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; /* samples beyond the last full vector */

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    while (bound_copy > 0) {
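        /* The loop body is elided in this excerpt. What follows is a
         * sketch of one straightforward body, assuming only the intrinsics
         * this header references; the original's exact register
         * choreography may differ. Per 16-bit lane it computes
         * max(max(src0, src1), max(src2, src3)), selecting by the sign of
         * the difference exactly as the scalar tail below does. */
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        /* mask = 0xFFFF in lanes where (a - b) > 0, i.e. where a wins */
        xmm5 = _mm_cmpgt_epi16(_mm_sub_epi16(xmm1, xmm2), _mm_setzero_si128());
        xmm6 = _mm_cmpgt_epi16(_mm_sub_epi16(xmm3, xmm4), _mm_setzero_si128());

        /* branchless select (a & mask) | (b & ~mask); xor acts as or
         * because the two masked terms are disjoint */
        xmm1 = _mm_xor_si128(_mm_and_si128(xmm1, xmm5), _mm_andnot_si128(xmm5, xmm2));
        xmm3 = _mm_xor_si128(_mm_and_si128(xmm3, xmm6), _mm_andnot_si128(xmm6, xmm4));

        /* final max of the two pairwise winners */
        xmm7 = _mm_cmpgt_epi16(_mm_sub_epi16(xmm1, xmm3), _mm_setzero_si128());
        xmm8 = _mm_xor_si128(_mm_and_si128(xmm1, xmm7), _mm_andnot_si128(xmm7, xmm3));

        _mm_store_si128(p_target, xmm8);

        p_src0++;
        p_src1++;
        p_src2++;
        p_src3++;
        p_target++;
        bound_copy--;
    }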
    short temp0 = 0;
    short temp1 = 0;

    /* Scalar tail for the samples that didn't fill a complete vector. */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;

    zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);

        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);

        /* Branchless select: keep src0 lanes where diff12 >= 0 and
         * src1 lanes where diff12 < 0; likewise for the second pair. */
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);

        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        /* The masked terms are disjoint, so add acts as bitwise or. */
        result1_vec = vaddq_s16(comp0, comp1);
        result2_vec = vaddq_s16(comp2, comp3);

        /* Final max of the two pairwise winners, same select pattern. */
        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
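        /* Advance all five streams past the eight lanes just processed
         * (bookkeeping elided in the excerpt) so the scalar tail below
         * starts at the right offset. */
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }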
    short temp0 = 0;
    short temp1 = 0;

    /* Scalar tail for the remaining points. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        /* Advance the input streams along with target; without this the
         * tail would reread the same inputs for every leftover point. */
        src0++;
        src1++;
        src2++;
        src3++;
    }
}

#endif /*LV_HAVE_NEON*/

263 #ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;
    int bound = num_bytes >> 1; /* number of 16-bit points */

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/