Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
volk_16i_branch_4_state_8.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
#define INCLUDED_volk_16i_branch_4_state_8_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSSE3

#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_branch_4_state_8_a_ssse3(short* target,
                                                      short* src0,
                                                      char** permuters,
                                                      short* cntl2,
                                                      short* cntl3,
                                                      short* scalars)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
    __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;

    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_cntl2 = (__m128i*)cntl2;
    p_cntl3 = (__m128i*)cntl3;
    p_scalars = (__m128i*)scalars;

    /* Broadcast scalars[0]..scalars[3] to all eight 16-bit lanes of xmm1..xmm4. */
    xmm0 = _mm_load_si128(p_scalars);

    xmm1 = _mm_shufflelo_epi16(xmm0, 0);
    xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
    xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
    xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);

    xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
    xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
    xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
    xmm4 = _mm_shuffle_epi32(xmm4, 0x00);

    /* Load the four 16-byte shuffle masks and apply each one to the input
       vector, producing one permuted copy of src0 per branch. */
    xmm0 = _mm_load_si128((__m128i*)permuters[0]);
    xmm6 = _mm_load_si128((__m128i*)permuters[1]);
    xmm8 = _mm_load_si128((__m128i*)permuters[2]);
    xmm10 = _mm_load_si128((__m128i*)permuters[3]);

    xmm5 = _mm_load_si128(p_src0);
    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);

    /* Scalar terms: branches 0 and 2 add scalars[0], branches 0 and 1 add
       scalars[1], branch 3 adds neither. */
    xmm5 = _mm_add_epi16(xmm1, xmm2);

    xmm6 = _mm_add_epi16(xmm2, xmm6);
    xmm8 = _mm_add_epi16(xmm1, xmm8);

    /* Each block of eight cntl2/cntl3 entries is masked (ANDed) with
       scalars[2]/scalars[3] and added into the corresponding branch. */
    xmm7 = _mm_load_si128(p_cntl2);
    xmm9 = _mm_load_si128(p_cntl3);

    xmm0 = _mm_add_epi16(xmm5, xmm0);

    xmm7 = _mm_and_si128(xmm7, xmm3);
    xmm9 = _mm_and_si128(xmm9, xmm4);

    xmm5 = _mm_load_si128(&p_cntl2[1]);
    xmm11 = _mm_load_si128(&p_cntl3[1]);

    xmm7 = _mm_add_epi16(xmm7, xmm9);

    xmm5 = _mm_and_si128(xmm5, xmm3);
    xmm11 = _mm_and_si128(xmm11, xmm4);

    xmm0 = _mm_add_epi16(xmm0, xmm7);

    xmm7 = _mm_load_si128(&p_cntl2[2]);
    xmm9 = _mm_load_si128(&p_cntl3[2]);

    xmm5 = _mm_add_epi16(xmm5, xmm11);

    xmm7 = _mm_and_si128(xmm7, xmm3);
    xmm9 = _mm_and_si128(xmm9, xmm4);

    xmm6 = _mm_add_epi16(xmm6, xmm5);

    xmm5 = _mm_load_si128(&p_cntl2[3]);
    xmm11 = _mm_load_si128(&p_cntl3[3]);

    xmm7 = _mm_add_epi16(xmm7, xmm9);

    xmm5 = _mm_and_si128(xmm5, xmm3);
    xmm11 = _mm_and_si128(xmm11, xmm4);

    xmm8 = _mm_add_epi16(xmm8, xmm7);

    xmm5 = _mm_add_epi16(xmm5, xmm11);

    /* Write the four result vectors back to back into target. */
    _mm_store_si128(p_target, xmm0);
    _mm_store_si128(&p_target[1], xmm6);

    xmm10 = _mm_add_epi16(xmm5, xmm10);

    _mm_store_si128(&p_target[2], xmm8);

    _mm_store_si128(&p_target[3], xmm10);
}

#endif /*LV_HAVE_SSSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_16i_branch_4_state_8_generic(short* target,
                                                     short* src0,
                                                     char** permuters,
                                                     short* cntl2,
                                                     short* cntl3,
                                                     short* scalars)
{
    int i = 0;

    int bound = 4;

    /* One iteration per branch: each of the eight states reads a permuted
       src0 entry (the permuter holds byte indices, hence the divide by 2),
       adds scalars[0] on even branches and scalars[1] on branches 0 and 1,
       then adds the masked cntl2/cntl3 contributions. */
    for (; i < bound; ++i) {
        target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) +
                        (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) +
                        (cntl3[i * 8] & scalars[3]);
        target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 1] & scalars[2]) +
                            (cntl3[i * 8 + 1] & scalars[3]);
        target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 2] & scalars[2]) +
                            (cntl3[i * 8 + 2] & scalars[3]);
        target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 3] & scalars[2]) +
                            (cntl3[i * 8 + 3] & scalars[3]);
        target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 4] & scalars[2]) +
                            (cntl3[i * 8 + 4] & scalars[3]);
        target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 5] & scalars[2]) +
                            (cntl3[i * 8 + 5] & scalars[3]);
        target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 6] & scalars[2]) +
                            (cntl3[i * 8 + 6] & scalars[3]);
        target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] +
                            ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
                            (cntl2[i * 8 + 7] & scalars[2]) +
                            (cntl3[i * 8 + 7] & scalars[3]);
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/
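
Usage sketch (not part of the header above). The listing documents no calling convention, so the following is a minimal, illustrative driver for the generic kernel under stated assumptions: LV_HAVE_GENERIC defined by hand so the example compiles standalone, identity permuters, all-zero control vectors, and GCC/Clang alignment attributes. The buffer sizes mirror what the code reads and writes: eight input shorts, four 16-byte shuffle masks, 32 entries in each control array, and 32 output shorts.

/* example_branch_4_state_8.c -- illustrative sketch only, not shipped with VOLK */
#define LV_HAVE_GENERIC
#include "volk_16i_branch_4_state_8.h"

#include <stdio.h>

int main(void)
{
    /* 16-byte alignment matches what the SSSE3 path would require;
       the generic path does not strictly need it. */
    short src0[8] __attribute__((aligned(16))) = { 1, 2, 3, 4, 5, 6, 7, 8 };
    short target[32] __attribute__((aligned(16))) = { 0 };
    short cntl2[32] __attribute__((aligned(16))) = { 0 }; /* 0x0000 or 0xFFFF per entry */
    short cntl3[32] __attribute__((aligned(16))) = { 0 };
    short scalars[4] __attribute__((aligned(16))) = { 10, 20, 30, 40 };

    /* Identity byte-shuffle masks: lane j of every branch reads src0[j]. */
    char perm[4][16] __attribute__((aligned(16)));
    char* permuters[4] = { perm[0], perm[1], perm[2], perm[3] };
    for (int i = 0; i < 4; ++i) {
        for (int b = 0; b < 16; ++b) {
            perm[i][b] = (char)b;
        }
    }

    volk_16i_branch_4_state_8_generic(target, src0, permuters, cntl2, cntl3, scalars);

    /* target now holds four blocks of eight shorts, one block per branch. */
    for (int i = 0; i < 32; ++i) {
        printf("%d ", target[i]);
    }
    printf("\n");
    return 0;
}

With all control entries zero, each output reduces to the permuted input plus the branch's scalar terms, which makes the block layout of target easy to verify by hand.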