Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_16i_permute_and_scalar_add.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

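/*
 * For each of the num_points 16-bit input elements, this kernel computes
 *
 *   target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0])
 *                                        + (cntl1[i] & scalars[1])
 *                                        + (cntl2[i] & scalars[2])
 *                                        + (cntl3[i] & scalars[3]);
 *
 * i.e. a gather from src0 through a permutation table, plus up to four
 * scalars, each gated per element by a bitwise control mask (typically
 * all-ones or all-zeros).
 */
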
#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
#define INCLUDED_volk_16i_permute_and_scalar_add_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
                                                          short* src0,
                                                          short* permute_indexes,
                                                          short* cntl0,
                                                          short* cntl1,
                                                          short* cntl2,
                                                          short* cntl3,
                                                          short* scalars,
                                                          unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 2;

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;

    short* p_permute_indexes = permute_indexes;

    p_target = (__m128i*)target;
    p_cntl0 = (__m128i*)cntl0;
    p_cntl1 = (__m128i*)cntl1;
    p_cntl2 = (__m128i*)cntl2;
    p_cntl3 = (__m128i*)cntl3;
    p_scalars = (__m128i*)scalars;

    int i = 0;

    /* Full 8-element (128-bit) blocks, then the 0..7 trailing elements. */
    int bound = (num_bytes >> 4);
    int leftovers = (num_bytes >> 1) & 7;

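    /* Broadcast scalars[0..3] so every 16-bit lane of xmm1..xmm4 holds one
       scalar: _mm_shufflelo_epi16 replicates the chosen word across the low
       four words, then _mm_shuffle_epi32(..., 0x00) replicates the low dword. */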
    xmm0 = _mm_load_si128(p_scalars);

    xmm1 = _mm_shufflelo_epi16(xmm0, 0);
    xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
    xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
    xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);

    xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
    xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
    xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
    xmm4 = _mm_shuffle_epi32(xmm4, 0x00);

    for (; i < bound; ++i) {
        xmm0 = _mm_setzero_si128();
        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = _mm_setzero_si128();

        /* Gather the eight permuted inputs, one word per slot, spread across
           four registers (the remaining lanes stay zero). */
        xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
        xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
        xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
        xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
        xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
        xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
        xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
        xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);

        /* Merge the four partial registers into one gathered vector. */
        xmm0 = _mm_add_epi16(xmm0, xmm5);
        xmm6 = _mm_add_epi16(xmm6, xmm7);

        p_permute_indexes += 8;

        xmm0 = _mm_add_epi16(xmm0, xmm6);

        /* Mask each broadcast scalar by its control vector and accumulate. */
        xmm5 = _mm_load_si128(p_cntl0);
        xmm6 = _mm_load_si128(p_cntl1);
        xmm7 = _mm_load_si128(p_cntl2);

        xmm5 = _mm_and_si128(xmm5, xmm1);
        xmm6 = _mm_and_si128(xmm6, xmm2);
        xmm7 = _mm_and_si128(xmm7, xmm3);

        xmm0 = _mm_add_epi16(xmm0, xmm5);

        xmm5 = _mm_load_si128(p_cntl3);

        xmm6 = _mm_add_epi16(xmm6, xmm7);

        p_cntl0 += 1;

        xmm5 = _mm_and_si128(xmm5, xmm4);

        xmm0 = _mm_add_epi16(xmm0, xmm6);

        p_cntl1 += 1;
        p_cntl2 += 1;

        xmm0 = _mm_add_epi16(xmm0, xmm5);

        p_cntl3 += 1;

        _mm_store_si128(p_target, xmm0);

        p_target += 1;
    }

    /* Finish the remaining (num_points % 8) elements in scalar code. */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
                    (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
                    (cntl3[i] & scalars[3]);
    }
}
#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC
static inline void volk_16i_permute_and_scalar_add_generic(short* target,
                                                           short* src0,
                                                           short* permute_indexes,
                                                           short* cntl0,
                                                           short* cntl1,
                                                           short* cntl2,
                                                           short* cntl3,
                                                           short* scalars,
                                                           unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1;

    for (i = 0; i < bound; ++i) {
        target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
                    (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
                    (cntl3[i] & scalars[3]);
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
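For reference, here is a minimal sketch of calling the kernel. It assumes the header is used standalone with LV_HAVE_GENERIC defined by hand; in a normal VOLK build these feature macros come from the build system and callers go through the generated volk_16i_permute_and_scalar_add() dispatcher instead. The data values are purely illustrative: the permutation reverses the input, and cntl0 is an all-ones mask for the first four elements, so only those outputs get scalars[0] added.

#include <stdio.h>

#define LV_HAVE_GENERIC /* normally set by the VOLK build system */
#include "volk_16i_permute_and_scalar_add.h"

int main(void)
{
    enum { N = 8 };
    short src0[N]    = { 10, 20, 30, 40, 50, 60, 70, 80 };
    short perm[N]    = { 7, 6, 5, 4, 3, 2, 1, 0 };     /* reverse the input */
    short cntl0[N]   = { -1, -1, -1, -1, 0, 0, 0, 0 }; /* all-ones passes scalars[0] */
    short cntl1[N]   = { 0 }, cntl2[N] = { 0 }, cntl3[N] = { 0 };
    short scalars[4] = { 100, 0, 0, 0 };
    short target[N];

    volk_16i_permute_and_scalar_add_generic(
        target, src0, perm, cntl0, cntl1, cntl2, cntl3, scalars, N);

    for (int i = 0; i < N; ++i)
        printf("%d ", target[i]); /* prints: 180 170 160 150 40 30 20 10 */
    printf("\n");

    return 0;
}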