40 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H 
   41 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H 
   51 #include <emmintrin.h> 
   52 #include <tmmintrin.h> 
   53 #include <xmmintrin.h> 
   57                                                             unsigned int num_points)
 
   59     const unsigned int num_bytes = num_points * 2;
 
   61     static const uint8_t shufmask0[16] = {
 
   62         0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
 
   63         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 
   65     static const uint8_t shufmask1[16] = {
 
   66         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 
   67         0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
 
   69     static const uint8_t andmask0[16] = {
 
   70         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
 
   71         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 
   73     static const uint8_t andmask1[16] = {
 
   74         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 
   75         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
 
   78     __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
 
   91     int bound = num_bytes >> 5;
 
   92     int intermediate = (num_bytes >> 4) & 1;
 
   93     int leftovers = (num_bytes >> 1) & 7;
 
   97     for (
i = 0; 
i < bound; ++
i) {
 
  144         p_target = (
__m128i*)((int8_t*)p_target + 8);
 
  147     for (
i = (bound << 4) + (intermediate << 3);
 
  148          i < (bound << 4) + (intermediate << 3) + leftovers;
 
  150         target[
i >> 1] = ((int16_t)(src0[
i] - src0[
i + 1]) > 0) ? src0[
i] : src0[
i + 1];
 
  158 #include <arm_neon.h> 
  161                                                          unsigned int num_points)
 
  163     const unsigned int eighth_points = num_points / 16;
 
  165     int16x8x2_t input_vec;
 
  166     int16x8_t diff, max_vec, zeros;
 
  167     uint16x8_t comp1, comp2;
 
  168     zeros = vdupq_n_s16(0);
 
  169     for (number = 0; number < eighth_points; ++number) {
 
  170         input_vec = vld2q_s16(src0);
 
  172         diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
 
  173         comp1 = vcgeq_s16(diff, zeros);
 
  174         comp2 = vcltq_s16(diff, zeros);
 
  176         input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
 
  177         input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
 
  179         max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
 
  180         vst1q_s16(target, max_vec);
 
  184     for (number = 0; number < num_points % 16; number += 2) {
 
  185         target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
 
  192 #ifdef LV_HAVE_NEONV7 
  193 extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
 
  195                                                        unsigned int num_points);
 
  198 #ifdef LV_HAVE_GENERIC 
  201                                                             unsigned int num_points)
 
  203     const unsigned int num_bytes = num_points * 2;
 
  207     int bound = num_bytes >> 1;
 
  209     for (
i = 0; 
i < bound; 
i += 2) {
 
  210         target[
i >> 1] = ((int16_t)(src0[
i] - src0[
i + 1]) > 0) ? src0[
i] : src0[
i + 1];
 
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
 
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
 
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2969
 
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3002
 
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
 
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5976
 
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:6458
 
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
 
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6864
 
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3367
 
int64x2_t __m128i
Definition: sse2neon.h:244
 
static void volk_16i_max_star_horizontal_16i_neon(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:159
 
static void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:55
 
static void volk_16i_max_star_horizontal_16i_generic(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:199
 
#define bit128_p(x)
Definition: volk_common.h:151
 
for i
Definition: volk_config_fixed.tmpl.h:13