Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
41 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
42 
43 #include <volk/volk_common.h>
44 
45 #include <inttypes.h>
46 #include <stdio.h>
47 
48 
49 #ifdef LV_HAVE_SSSE3
50 
51 #include <emmintrin.h>
52 #include <tmmintrin.h>
53 #include <xmmintrin.h>
54 
/*
 * SSSE3 implementation: for every adjacent pair (src0[2k], src0[2k + 1])
 * writes one value to target[k].
 *
 * The selection is driven by the 16-bit wrap-around difference
 * d = src0[2k] - src0[2k + 1]: the second element is chosen when d < 0,
 * the first one otherwise (for d == 0 both elements are equal). This
 * matches the scalar tail loop at the bottom of the function.
 *
 * target      output buffer, accessed with aligned 128-bit stores
 * src0        input buffer, accessed with aligned 128-bit loads
 * num_points  number of input samples; the scalar tail reads src0[i + 1]
 *             for every even i < num_points, so an odd count reads one
 *             element past num_points
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    /* Byte-shuffle patterns that gather the first element of each pair into
     * the low (shufmask0) or high (shufmask1) half; 0xff lanes shuffle to 0. */
    static const uint8_t shufmask0[16] = {
        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    };
    static const uint8_t shufmask1[16] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
    };
    /* Adding 2 to a shuffle index moves it from the first to the second
     * element of the pair; each mask limits that bump to one half. */
    static const uint8_t andmask0[16] = {
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    };
    static const uint8_t andmask1[16] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
    };

    /* The tables are plain uint8_t arrays with no 16-byte alignment
     * guarantee, so they must be read with unaligned loads. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i bump_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i bump_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    __m128i* p_target = (__m128i*)target;
    const __m128i* p_src0 = (const __m128i*)src0;

    const int bound = num_bytes >> 5;              /* blocks of 16 inputs */
    const int intermediate = (num_bytes >> 4) & 1; /* one block of 8 inputs */
    const int leftovers = (num_bytes >> 1) & 7;    /* inputs left for scalar code */

    int i = 0;

    for (i = 0; i < bound; ++i) {
        __m128i xmm0 = _mm_load_si128(p_src0);
        __m128i xmm1 = _mm_load_si128(p_src0 + 1);
        p_src0 += 2;

        /* Lanes 0-3: pair differences of xmm0; lanes 4-7: those of xmm1. */
        const __m128i diff = _mm_hsub_epi16(xmm0, xmm1);
        /* All-ones where the difference is negative (second element wins). */
        const __m128i second_wins = _mm_cmpgt_epi16(zero, diff);

        const __m128i sel_lo =
            _mm_add_epi8(_mm_and_si128(second_wins, bump_lo), shuf_lo);
        const __m128i sel_hi =
            _mm_add_epi8(_mm_and_si128(second_wins, bump_hi), shuf_hi);

        xmm0 = _mm_shuffle_epi8(xmm0, sel_lo); /* winners in low half, 0 above */
        xmm1 = _mm_shuffle_epi8(xmm1, sel_hi); /* winners in high half, 0 below */

        /* The halves are disjoint, so the add merges them. */
        _mm_store_si128(p_target, _mm_add_epi16(xmm0, xmm1));

        p_target += 1;
    }

    if (intermediate) {
        __m128i xmm0 = _mm_load_si128(p_src0);

        /* Only the low four lanes are used and they depend on the first
         * operand alone, so the same register serves as both operands. */
        const __m128i diff = _mm_hsub_epi16(xmm0, xmm0);
        const __m128i second_wins = _mm_cmpgt_epi16(zero, diff);
        const __m128i sel =
            _mm_add_epi8(_mm_and_si128(second_wins, bump_lo), shuf_lo);

        xmm0 = _mm_shuffle_epi8(xmm0, sel);

        /* Store just the four results held in the low 64 bits. */
        _mm_storel_epi64(p_target, xmm0);
    }

    /* Scalar tail; i indexes input samples, i >> 1 the matching output. */
    const int tail_start = (bound << 4) + (intermediate << 3);
    for (i = tail_start; i < tail_start + leftovers; i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
153 
154 #endif /*LV_HAVE_SSSE3*/
155 
156 #ifdef LV_HAVE_NEON
157 
158 #include <arm_neon.h>
/*
 * NEON implementation: for every adjacent pair (src0[2k], src0[2k + 1])
 * writes one value to target[k].
 *
 * Vector part: 16 input samples are de-interleaved per iteration into
 * "first" and "second" lanes; the first element is kept where the 16-bit
 * difference first - second is >= 0, the second where it is < 0.
 * Scalar part: the remaining num_points % 16 samples are handled two at a
 * time with the same comparison written as a ternary.
 *
 * target      output buffer, one value per input pair
 * src0        input buffer
 * num_points  number of input samples
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    const unsigned int block_count = num_points / 16;
    const unsigned int tail_count = num_points % 16;
    const int16x8_t zero_lanes = vdupq_n_s16(0);
    unsigned int k;

    for (k = 0; k < block_count; ++k) {
        /* val[0] holds the even-indexed samples, val[1] the odd-indexed ones. */
        const int16x8x2_t pairs = vld2q_s16(src0);
        const int16x8_t delta = vsubq_s16(pairs.val[0], pairs.val[1]);

        /* Complementary lane masks: exactly one is all-ones per lane. */
        const uint16x8_t keep_first = vcgeq_s16(delta, zero_lanes);
        const uint16x8_t keep_second = vcltq_s16(delta, zero_lanes);

        const int16x8_t from_first = vandq_s16(pairs.val[0], (int16x8_t)keep_first);
        const int16x8_t from_second = vandq_s16(pairs.val[1], (int16x8_t)keep_second);

        /* One addend is zero in every lane, so the sum is the selected value. */
        vst1q_s16(target, vaddq_s16(from_first, from_second));

        src0 += 16;
        target += 8;
    }

    /* src0/target now point past the vectorised part; indices are relative. */
    for (k = 0; k < tail_count; k += 2) {
        const int16_t first = src0[k];
        const int16_t second = src0[k + 1];
        target[k >> 1] = ((int16_t)(first - second) > 0) ? first : second;
    }
}
190 #endif /* LV_HAVE_NEON */
191 
192 #ifdef LV_HAVE_NEONV7
/*
 * Prototype only: the body is defined outside this header (extern linkage).
 * Takes the same (target, src0, num_points) parameter list as the other
 * variants declared in this file.
 * NOTE(review): presumably a hand-written ARMv7 NEON assembly routine with
 * the same pairwise-selection contract -- confirm against its definition.
 */
193 extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
194  int16_t* src0,
195  unsigned int num_points);
196 #endif /* LV_HAVE_NEONV7 */
197 
198 #ifdef LV_HAVE_GENERIC
/*
 * Portable implementation: for every adjacent pair (src0[2k], src0[2k + 1])
 * writes one value to target[k].
 *
 * The first element is chosen when the difference first - second, narrowed
 * to int16_t, is > 0; otherwise the second element is chosen.
 *
 * target      output buffer, one value per input pair
 * src0        input buffer
 * num_points  number of input samples; src0[i + 1] is read for every even
 *             i below the sample count
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* Sample count recovered from the byte count (num_points * 2). */
    const int sample_count = (int)((num_points * 2) >> 1);
    int in_idx;
    int out_idx = 0;

    for (in_idx = 0; in_idx < sample_count; in_idx += 2) {
        const int16_t first = src0[in_idx];
        const int16_t second = src0[in_idx + 1];
        const int16_t delta = (int16_t)(first - second);

        if (delta > 0) {
            target[out_idx] = first;
        } else {
            target[out_idx] = second;
        }
        ++out_idx;
    }
}
213 
214 #endif /*LV_HAVE_GENERIC*/
215 
216 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2969
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3002
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5976
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:6458
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6864
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3367
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_16i_max_star_horizontal_16i_neon(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:159
static void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:55
static void volk_16i_max_star_horizontal_16i_generic(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:199
#define bit128_p(x)
Definition: volk_common.h:151
for i
Definition: volk_config_fixed.tmpl.h:13