Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
sse2neon.h
1 /*
2  * Copyright 2022 Free Software Foundation, Inc.
3  *
4  * This file is part of VOLK
5  *
6  * SPDX-License-Identifier: MIT
7  *
8  * This file is from:
9  * https://github.com/DLTcollab/sse2neon
10  */
11 
12 // Turn off Clang formatting, as
13 // this would make diffs a lot more
14 // tricky.
15 // clang-format off
16 #ifndef SSE2NEON_H
17 #define SSE2NEON_H
18 
19 // This header file provides a simple API translation layer
20 // from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
21 //
22 // This header file does not yet translate all of the SSE intrinsics.
23 //
24 // Contributors to this work are:
25 // John W. Ratcliff <jratcliffscarab@gmail.com>
26 // Brandon Rowlett <browlett@nvidia.com>
27 // Ken Fast <kfast@gdeb.com>
28 // Eric van Beurden <evanbeurden@nvidia.com>
29 // Alexander Potylitsin <apotylitsin@nvidia.com>
30 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
31 // Jim Huang <jserv@biilabs.io>
32 // Mark Cheng <marktwtn@biilabs.io>
33 // Malcolm James MacLeod <malcolm@gulden.com>
34 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
35 // Sebastian Pop <spop@amazon.com>
36 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
37 // Danila Kutenin <danilak@google.com>
38 // François Turban (JishinMaster) <francois.turban@gmail.com>
39 // Pei-Hsuan Hung <afcidk@gmail.com>
40 // Yang-Hao Yuan <yanghau@biilabs.io>
41 // Syoyo Fujita <syoyo@lighttransport.com>
42 // Brecht Van Lommel <brecht@blender.org>
43 
44 /*
45  * sse2neon is freely redistributable under the MIT License.
46  *
47  * Permission is hereby granted, free of charge, to any person obtaining a copy
48  * of this software and associated documentation files (the "Software"), to deal
49  * in the Software without restriction, including without limitation the rights
50  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
51  * copies of the Software, and to permit persons to whom the Software is
52  * furnished to do so, subject to the following conditions:
53  *
54  * The above copyright notice and this permission notice shall be included in
55  * all copies or substantial portions of the Software.
56  *
57  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
62  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
63  * SOFTWARE.
64  */
65 
66 /* Tunable configurations */
67 
68 /* Enable precise implementation of math operations
69  * This slows down the computation a bit, but gives results consistent with
70  * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
71  */
72 /* _mm_min|max_ps|ss|pd|sd */
73 #ifndef SSE2NEON_PRECISE_MINMAX
74 #define SSE2NEON_PRECISE_MINMAX (0)
75 #endif
76 /* _mm_rcp_ps and _mm_div_ps */
77 #ifndef SSE2NEON_PRECISE_DIV
78 #define SSE2NEON_PRECISE_DIV (0)
79 #endif
80 /* _mm_sqrt_ps and _mm_rsqrt_ps */
81 #ifndef SSE2NEON_PRECISE_SQRT
82 #define SSE2NEON_PRECISE_SQRT (0)
83 #endif
84 /* _mm_dp_pd */
85 #ifndef SSE2NEON_PRECISE_DP
86 #define SSE2NEON_PRECISE_DP (0)
87 #endif
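// Example (illustrative sketch): because of the #ifndef guards above, a
// translation unit can opt into the precise-but-slower paths by defining the
// macros before this header is included, e.g.
//
//   #define SSE2NEON_PRECISE_MINMAX 1 /* precise _mm_min/_mm_max */
//   #define SSE2NEON_PRECISE_DIV 1    /* precise _mm_rcp_ps and _mm_div_ps */
//   #include "sse2neon.h"
//
// Leaving the macros undefined keeps the faster, less precise defaults (0).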
88 
89 /* compiler specific definitions */
90 #if defined(__GNUC__) || defined(__clang__)
91 #pragma push_macro("FORCE_INLINE")
92 #pragma push_macro("ALIGN_STRUCT")
93 #define FORCE_INLINE static inline __attribute__((always_inline))
94 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
95 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
96 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
97 #else /* non-GNU / non-clang compilers */
98 #warning "Macro name collisions may happen with unsupported compiler."
99 #ifndef FORCE_INLINE
100 #define FORCE_INLINE static inline
101 #endif
102 #ifndef ALIGN_STRUCT
103 #define ALIGN_STRUCT(x) __declspec(align(x))
104 #endif
105 #define _sse2neon_likely(x) (x)
106 #define _sse2neon_unlikely(x) (x)
107 #endif
108 
109 /* The C language does not allow initializing a static variable with a function call. */
110 #ifdef __cplusplus
111 #define _sse2neon_const static const
112 #else
113 #define _sse2neon_const const
114 #endif
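// Example (illustrative sketch, variable name is hypothetical): inside a
// function body, a constant built from an intrinsic call can be declared as
//
//   _sse2neon_const __m128i abs_mask = _mm_set1_epi32(0x7FFFFFFF);
//
// which stays `static const` in C++ but drops `static` in C, where a static
// object may not have a non-constant initializer.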
115 
116 #include <stdint.h>
117 #include <stdlib.h>
118 
119 /* Architecture-specific build options */
120 /* FIXME: #pragma GCC push_options is only available on GCC */
121 #if defined(__GNUC__)
122 #if defined(__arm__) && __ARM_ARCH == 7
123 /* According to the ARM C Language Extensions (ACLE) specification,
124  * __ARM_NEON is defined to a value indicating that the Advanced SIMD (NEON)
125  * architecture is supported.
126  */
127 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
128 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
129 #endif
130 #if !defined(__clang__)
131 #pragma GCC push_options
132 #pragma GCC target("fpu=neon")
133 #endif
134 #elif defined(__aarch64__)
135 #if !defined(__clang__)
136 #pragma GCC push_options
137 #pragma GCC target("+simd")
138 #endif
139 #elif __ARM_ARCH == 8
140 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
141 #error \
142  "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
143 #endif
144 #if !defined(__clang__)
145 #pragma GCC push_options
146 #endif
147 #else
148 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
149 #endif
150 #endif
151 
152 #include <arm_neon.h>
153 #if !defined(__aarch64__) && (__ARM_ARCH == 8)
154 #if defined __has_include && __has_include(<arm_acle.h>)
155 #include <arm_acle.h>
156 #endif
157 #endif
158 
159 /* Rounding functions require either AArch64 instructions or a libm fallback */
160 #if !defined(__aarch64__)
161 #include <math.h>
162 #endif
163 
164 /* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
165  * or even not accessible in user mode.
166  * To write to or access these registers in user mode,
167  * we have to perform a system call instead.
168  */
169 #if !defined(__aarch64__)
170 #include <sys/time.h>
171 #endif
172 
173 /* "__has_builtin" can be used to query support for built-in functions
174  * provided by gcc/clang and other compilers that support it.
175  */
176 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
177 /* Compatibility with gcc <= 9 */
178 #if defined(__GNUC__) && (__GNUC__ <= 9)
179 #define __has_builtin(x) HAS##x
180 #define HAS__builtin_popcount 1
181 #define HAS__builtin_popcountll 1
182 #else
183 #define __has_builtin(x) 0
184 #endif
185 #endif
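// Example (illustrative sketch): with the fallback above, later code can probe
// builtins uniformly regardless of compiler, e.g.
//
//   #if __has_builtin(__builtin_popcount)
//       int bits = __builtin_popcount(0xF0u); /* 4 */
//   #else
//       /* portable bit-counting fallback would go here */
//   #endif
//
// On gcc <= 9 the probe resolves through the HAS__builtin_* defines above.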
186 
195 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
196  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
197 
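// Worked example: _MM_SHUFFLE packs its arguments two bits each, fp0 in bits
// [1:0] up to fp3 in bits [7:6], so
//
//   _MM_SHUFFLE(3, 2, 1, 0) == (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4
//
// which is the identity selector for _mm_shuffle_ps / _mm_shuffle_epi32.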
198 /* Rounding mode macros. */
199 #define _MM_FROUND_TO_NEAREST_INT 0x00
200 #define _MM_FROUND_TO_NEG_INF 0x01
201 #define _MM_FROUND_TO_POS_INF 0x02
202 #define _MM_FROUND_TO_ZERO 0x03
203 #define _MM_FROUND_CUR_DIRECTION 0x04
204 #define _MM_FROUND_NO_EXC 0x08
205 #define _MM_FROUND_RAISE_EXC 0x00
206 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
207 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
208 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
209 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
210 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
211 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
212 #define _MM_ROUND_NEAREST 0x0000
213 #define _MM_ROUND_DOWN 0x2000
214 #define _MM_ROUND_UP 0x4000
215 #define _MM_ROUND_TOWARD_ZERO 0x6000
216 /* Flush zero mode macros. */
217 #define _MM_FLUSH_ZERO_MASK 0x8000
218 #define _MM_FLUSH_ZERO_ON 0x8000
219 #define _MM_FLUSH_ZERO_OFF 0x0000
220 /* Denormals are zeros mode macros. */
221 #define _MM_DENORMALS_ZERO_MASK 0x0040
222 #define _MM_DENORMALS_ZERO_ON 0x0040
223 #define _MM_DENORMALS_ZERO_OFF 0x0000
224 
225 /* indicate immediate constant argument in a given range */
226 #define __constrange(a, b) const
227 
228 /* A few intrinsics accept traditional data types like ints or floats, but
229  * most operate on data types that are specific to SSE.
230  * If a vector type ends in d, it contains doubles, and if it does not have
231  * a suffix, it contains floats. An integer vector type can contain any type
232  * of integer, from chars to shorts to unsigned long longs.
233  */
234 typedef int64x1_t __m64;
235 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
236 // On 32-bit ARM, the float64x2_t type is not supported.
237 // The __m128d data type therefore has to be represented differently for the
238 // related conversion intrinsics.
239 #if defined(__aarch64__)
240 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
241 #else
242 typedef float32x4_t __m128d;
243 #endif
244 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
245 
246 // __int64 is defined in the Intrinsics Guide and maps to a different data
247 // type under each data model
248 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
249 #if (defined(__x86_64__) || defined(__i386__))
250 #define __int64 long long
251 #else
252 #define __int64 int64_t
253 #endif
254 #endif
255 
256 /* type-safe casting between types */
257 
258 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
259 #define vreinterpretq_m128_f32(x) (x)
260 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
261 
262 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
263 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
264 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
265 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
266 
267 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
268 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
269 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
270 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
271 
272 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
273 #define vreinterpretq_f32_m128(x) (x)
274 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
275 
276 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
277 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
278 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
279 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
280 
281 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
282 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
283 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
284 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
285 
286 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
287 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
288 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
289 #define vreinterpretq_m128i_s64(x) (x)
290 
291 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
292 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
293 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
294 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
295 
296 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
297 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
298 
299 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
300 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
301 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
302 #define vreinterpretq_s64_m128i(x) (x)
303 
304 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
305 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
306 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
307 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
308 
309 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
310 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
311 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
312 #define vreinterpret_m64_s64(x) (x)
313 
314 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
315 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
316 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
317 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
318 
319 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
320 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
321 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
322 
323 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
324 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
325 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
326 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
327 
328 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
329 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
330 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
331 #define vreinterpret_s64_m64(x) (x)
332 
333 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
334 
335 #if defined(__aarch64__)
336 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
337 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
338 
339 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
340 
341 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
342 #define vreinterpretq_m128d_f64(x) (x)
343 
344 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
345 
346 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
347 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
348 
349 #define vreinterpretq_f64_m128d(x) (x)
350 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
351 #else
352 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
353 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
354 
355 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
356 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
357 
358 #define vreinterpretq_m128d_f32(x) (x)
359 
360 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
361 
362 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
363 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
364 
365 #define vreinterpretq_f32_m128d(x) (x)
366 #endif
367 
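// Example (illustrative sketch, helper name is hypothetical): these macros are
// how the implementations below wrap NEON results back into the SSE-style
// types without changing any bits, e.g. a bitwise OR over __m128 lanes:
//
//   FORCE_INLINE __m128 example_or_ps(__m128 a, __m128 b)
//   {
//       return vreinterpretq_m128_u32(
//           vorrq_u32(vreinterpretq_u32_m128(a), vreinterpretq_u32_m128(b)));
//   }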
368 // A struct is defined in this header file called 'SIMDVec' which can be used
369 // by applications which attempt to access the contents of an __m128 struct
370 // directly. Note that accessing the __m128 struct directly is considered
371 // bad coding practice by Microsoft: @see:
372 // https://docs.microsoft.com/en-us/cpp/cpp/m128
373 //
374 // However, some legacy source code may try to access the contents of an __m128
375 // struct directly so the developer can use the SIMDVec as an alias for it. Any
376 // casting must be done manually by the developer, as you cannot cast or
377 // otherwise alias the base NEON data type for intrinsic operations.
378 //
379 // This union is intended to allow direct access to an __m128 variable using
380 // the names that the MSVC compiler provides. It should really only be used when
381 // trying to access the members of the vector as integer values. GCC/clang
382 // allow native access to the float members through a simple array access
383 // operator (in C since 4.6, in C++ since 4.8).
384 //
385 // Ideally, direct access to SIMD vectors should be avoided since it can cause
386 // a performance hit. If it really is needed, however, the original __m128
387 // variable can be aliased with a pointer to this union and used to access
388 // individual components. The use of this union should be hidden behind a macro
389 // that is used throughout the codebase to access the members instead of always
390 // declaring this type of variable.
391 typedef union ALIGN_STRUCT(16) SIMDVec {
392  float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
393  int8_t m128_i8[16]; // as signed 8-bit integers.
394  int16_t m128_i16[8]; // as signed 16-bit integers.
395  int32_t m128_i32[4]; // as signed 32-bit integers.
396  int64_t m128_i64[2]; // as signed 64-bit integers.
397  uint8_t m128_u8[16]; // as unsigned 8-bit integers.
398  uint16_t m128_u16[8]; // as unsigned 16-bit integers.
399  uint32_t m128_u32[4]; // as unsigned 32-bit integers.
400  uint64_t m128_u64[2]; // as unsigned 64-bit integers.
401 } SIMDVec;
402 
403 // casting using SIMDVec
404 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
405 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
406 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
407 
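// Example (illustrative sketch, variable names are hypothetical): pulling a
// single 32-bit lane out of an __m128i through the union-based macros above:
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); /* 1 */
//
// Prefer the NEON lane intrinsics (e.g. vgetq_lane_s32) where possible; the
// union access mainly exists for legacy MSVC-style code.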
408 /* SSE macros */
409 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
410 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
411 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
412 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
413 
414 // Function declaration
415 // SSE
416 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
417 FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
418 FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
419 FORCE_INLINE __m128 _mm_set_ps1(float);
420 FORCE_INLINE __m128 _mm_setzero_ps(void);
421 // SSE2
422 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
423 FORCE_INLINE __m128i _mm_castps_si128(__m128);
424 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
425 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
426 FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
427 FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
428 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
429 FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
430 FORCE_INLINE __m128d _mm_set_pd(double, double);
431 FORCE_INLINE __m128i _mm_set1_epi32(int);
432 FORCE_INLINE __m128i _mm_setzero_si128();
433 // SSE4.1
434 FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
435 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
436 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
437 FORCE_INLINE __m128 _mm_floor_ps(__m128);
438 FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
439 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
440 // SSE4.2
441 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
442 
443 /* Backwards compatibility for compilers that lack specific type support */
444 
445 // Older gcc does not provide the vld1q_u8_x4 intrinsic
446 #if defined(__GNUC__) && !defined(__clang__) && \
447  ((__GNUC__ <= 12 && defined(__arm__)) || \
448  (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
449  (__GNUC__ <= 9 && defined(__aarch64__)))
450 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
451 {
452  uint8x16x4_t ret;
453  ret.val[0] = vld1q_u8(p + 0);
454  ret.val[1] = vld1q_u8(p + 16);
455  ret.val[2] = vld1q_u8(p + 32);
456  ret.val[3] = vld1q_u8(p + 48);
457  return ret;
458 }
459 #else
460 // Wraps vld1q_u8_x4
461 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
462 {
463  return vld1q_u8_x4(p);
464 }
465 #endif
466 
467 /* Function Naming Conventions
468  * The naming convention of SSE intrinsics is straightforward. A generic SSE
469  * intrinsic function is given as follows:
470  * _mm_<name>_<data_type>
471  *
472  * The parts of this format are given as follows:
473  * 1. <name> describes the operation performed by the intrinsic
474  * 2. <data_type> identifies the data type of the function's primary arguments
475  *
476  * This last part, <data_type>, is a little complicated. It identifies the
477  * content of the input values, and can be set to any of the following values:
478  * + ps - vectors contain floats (ps stands for packed single-precision)
479  * + pd - vectors contain doubles (pd stands for packed double-precision)
480  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
481  * signed integers
482  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
483  * unsigned integers
484  * + si128 - unspecified 128-bit vector or 256-bit vector
485  * + m128/m128i/m128d - identifies input vector types when they are different
486  * than the type of the returned vector
487  *
488  * For example, _mm_setzero_ps. The _mm implies that the function returns
489  * a 128-bit vector. The _ps at the end implies that the argument vectors
490  * contain floats.
491  *
492  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
493  * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
494  * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
495  * // Set packed 8-bit integers
496  * // 128 bits, 16 chars, 8 bits each
497  * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
498  * 4, 5, 12, 13, 6, 7, 14, 15);
499  * // Shuffle packed 8-bit integers
500  * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
501  *
502  * Data (Number, Binary, Byte Index):
503   +------+------+------+------+------+------+------+------+
504   |      1      |      2      |      3      |      4      | Number
505   +------+------+------+------+------+------+------+------+
506   | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
507   +------+------+------+------+------+------+------+------+
508   |   0  |   1  |   2  |   3  |   4  |   5  |   6  |   7  | Index
509   +------+------+------+------+------+------+------+------+
510 
511   +------+------+------+------+------+------+------+------+
512   |      5      |      6      |      7      |      8      | Number
513   +------+------+------+------+------+------+------+------+
514   | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
515   +------+------+------+------+------+------+------+------+
516   |   8  |   9  |  10  |  11  |  12  |  13  |  14  |  15  | Index
517   +------+------+------+------+------+------+------+------+
518  * Index (Byte Index):
519   +------+------+------+------+------+------+------+------+
520   |   1  |   0  |   2  |   3  |   8  |   9  |  10  |  11  |
521   +------+------+------+------+------+------+------+------+
522 
523   +------+------+------+------+------+------+------+------+
524   |   4  |   5  |  12  |  13  |   6  |   7  |  14  |  15  |
525   +------+------+------+------+------+------+------+------+
526  * Result:
527   +------+------+------+------+------+------+------+------+
528   |   1  |   0  |   2  |   3  |   8  |   9  |  10  |  11  | Index
529   +------+------+------+------+------+------+------+------+
530   | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
531   +------+------+------+------+------+------+------+------+
532   |     256     |      2      |      5      |      6      | Number
533   +------+------+------+------+------+------+------+------+
534 
535   +------+------+------+------+------+------+------+------+
536   |   4  |   5  |  12  |  13  |   6  |   7  |  14  |  15  | Index
537   +------+------+------+------+------+------+------+------+
538   | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
539   +------+------+------+------+------+------+------+------+
540   |      3      |      7      |      4      |      8      | Number
541   +------+------+------+------+------+------+------+------+
542  */
543 
544 /* Constants for use with _mm_prefetch. */
545 enum _mm_hint {
546  _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
547  _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
548  _MM_HINT_T1 = 2, /* load data to L2 cache only */
549  _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
550  _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
551  _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
552  _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
553  _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
554 };
555 
556 // The bit field mapping to the FPCR (floating-point control register)
557 typedef struct {
558  uint16_t res0;
559  uint8_t res1 : 6;
560  uint8_t bit22 : 1;
561  uint8_t bit23 : 1;
562  uint8_t bit24 : 1;
563  uint8_t res2 : 7;
564 #if defined(__aarch64__)
565  uint32_t res3;
566 #endif
567 } fpcr_bitfield;
568 
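// Example (illustrative sketch, AArch64 only): the _MM_GET/SET_* accessors
// below read the control register into a union that overlays this bit field,
// roughly
//
//   union {
//       fpcr_bitfield field;
//       uint64_t value;
//   } r;
//   __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
//   /* r.field.bit24 corresponds to the flush-to-zero (FZ) control */
//
// so individual control bits can be tested or updated without manual masking.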
569 // Takes the upper 64 bits of a and places it in the low end of the result
570 // Takes the lower 64 bits of b and places it into the high end of the result.
571 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
572 {
573  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
574  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
575  return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
576 }
577 
578 // Takes the lower two 32-bit values from a, swaps them, and places them in
579 // the low end of the result; takes the upper two 32-bit values from b, swaps
580 // them, and places them in the high end of the result.
581 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
582 {
583  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
584  float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
585  return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
586 }
587 
588 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
589 {
590  float32x2_t a21 = vget_high_f32(
591  vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
592  float32x2_t b03 = vget_low_f32(
593  vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
594  return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
595 }
596 
597 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
598 {
599  float32x2_t a03 = vget_low_f32(
600  vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
601  float32x2_t b21 = vget_high_f32(
602  vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
603  return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
604 }
605 
607 {
608  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
609  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
610  return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
611 }
612 
613 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
614 {
615  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
616  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
617  return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
618 }
619 
620 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
621 {
622  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
623  float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
624  return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
625 }
626 
627 // Keeps the low 64 bits of a in the low half and puts the high 64 bits of b
628 // in the high half.
629 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
630 {
631  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
632  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
633  return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
634 }
635 
636 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
637 {
638  float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
639  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
640  return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
641 }
642 
643 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
644 {
645  float32x2_t a22 =
646  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
647  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
648  return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
649 }
650 
651 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
652 {
653  float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
654  float32x2_t b22 =
655  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
656  return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
657 }
658 
659 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
660 {
661  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
662  float32x2_t a22 =
663  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
664  float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
665  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
666  return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
667 }
668 
669 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
670 {
671  float32x2_t a33 =
672  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
673  float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
674  return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
675 }
676 
677 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
678 {
679  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
680  float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
681  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
682  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
683  return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
684 }
685 
686 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
687 {
688  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
689  float32_t b2 = vgetq_lane_f32(b, 2);
690  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
691  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
692  return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
693 }
694 
695 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
696 {
697  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
698  float32_t b2 = vgetq_lane_f32(b, 2);
699  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
700  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
701  return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
702 }
703 
704 // Kahan summation for accurate summation of floating-point numbers.
705 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
706 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
707 {
708  y -= *c;
709  float t = *sum + y;
710  *c = (t - *sum) - y;
711  *sum = t;
712 }
713 
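// Example (illustrative sketch): carrying the compensation term keeps small
// addends from being lost to rounding, e.g.
//
//   float sum = 0.0f, c = 0.0f;
//   for (int i = 0; i < 10000; i++)
//       _sse2neon_kadd_f32(&sum, &c, 1e-4f);
//   /* sum is much closer to 1.0f than a plain running total would be */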
714 #if defined(__ARM_FEATURE_CRYPTO) && \
715  (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
716 // Wraps vmull_p64
717 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
718 {
719  poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
720  poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
721  return vreinterpretq_u64_p128(vmull_p64(a, b));
722 }
723 #else // ARMv7 polyfill
724 // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
725 //
726 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
727 // 64-bit->128-bit polynomial multiply.
728 //
729 // It needs some work and is somewhat slow, but it is still faster than all
730 // known scalar methods.
731 //
732 // Algorithm adapted to C from
733 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
734 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
735 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
736 // (https://hal.inria.fr/hal-01506572)
737 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
738 {
739  poly8x8_t a = vreinterpret_p8_u64(_a);
740  poly8x8_t b = vreinterpret_p8_u64(_b);
741 
742  // Masks
743  uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
744  vcreate_u8(0x00000000ffffffff));
745  uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
746  vcreate_u8(0x0000000000000000));
747 
748  // Do the multiplies, rotating with vext to get all combinations
749  uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
750  uint8x16_t e =
751  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
752  uint8x16_t f =
753  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
754  uint8x16_t g =
755  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
756  uint8x16_t h =
757  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
758  uint8x16_t i =
759  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
760  uint8x16_t j =
761  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
762  uint8x16_t k =
763  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
764 
765  // Add cross products
766  uint8x16_t l = veorq_u8(e, f); // L = E + F
767  uint8x16_t m = veorq_u8(g, h); // M = G + H
768  uint8x16_t n = veorq_u8(i, j); // N = I + J
769 
770  // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
771  // instructions.
772 #if defined(__aarch64__)
773  uint8x16_t lm_p0 = vreinterpretq_u8_u64(
774  vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
775  uint8x16_t lm_p1 = vreinterpretq_u8_u64(
776  vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
777  uint8x16_t nk_p0 = vreinterpretq_u8_u64(
778  vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
779  uint8x16_t nk_p1 = vreinterpretq_u8_u64(
780  vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
781 #else
782  uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
783  uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
784  uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
785  uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
786 #endif
787  // t0 = (L) (P0 + P1) << 8
788  // t1 = (M) (P2 + P3) << 16
789  uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
790  uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
791  uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
792 
793  // t2 = (N) (P4 + P5) << 24
794  // t3 = (K) (P6 + P7) << 32
795  uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
796  uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
797  uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
798 
799  // De-interleave
800 #if defined(__aarch64__)
801  uint8x16_t t0 = vreinterpretq_u8_u64(
802  vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
803  uint8x16_t t1 = vreinterpretq_u8_u64(
804  vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
805  uint8x16_t t2 = vreinterpretq_u8_u64(
806  vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
807  uint8x16_t t3 = vreinterpretq_u8_u64(
808  vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
809 #else
810  uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
811  uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
812  uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
813  uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
814 #endif
815  // Shift the cross products
816  uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
817  uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
818  uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
819  uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
820 
821  // Accumulate the products
822  uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
823  uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
824  uint8x16_t mix = veorq_u8(d, cross1);
825  uint8x16_t r = veorq_u8(mix, cross2);
826  return vreinterpretq_u64_u8(r);
827 }
828 #endif // ARMv7 polyfill
829 
830 // C equivalent:
831 // __m128i _mm_shuffle_epi32_default(__m128i a,
832 // __constrange(0, 255) int imm) {
833 // __m128i ret;
834 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
835 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
836 // return ret;
837 // }
838 #define _mm_shuffle_epi32_default(a, imm) \
839  __extension__({ \
840  int32x4_t ret; \
841  ret = vmovq_n_s32( \
842  vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
843  ret = vsetq_lane_s32( \
844  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
845  ret, 1); \
846  ret = vsetq_lane_s32( \
847  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
848  ret, 2); \
849  ret = vsetq_lane_s32( \
850  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
851  ret, 3); \
852  vreinterpretq_m128i_s32(ret); \
853  })
854 
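// Example (illustrative sketch): broadcasting the lowest 32-bit lane with the
// generic path above:
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);
//   __m128i s = _mm_shuffle_epi32_default(v, _MM_SHUFFLE(0, 0, 0, 0));
//   /* every 32-bit lane of s is now 1 */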
855 // Takes the upper 64 bits of a and places it in the low end of the result
856 // Takes the lower 64 bits of a and places it into the high end of the result.
857 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
858 {
859  int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
860  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
861  return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
862 }
863 
864 // Takes the lower two 32-bit values from a, swaps them, and places them in the
865 // low end of the result; takes the higher two 32-bit values from a, swaps them,
866 // and places them in the high end of the result.
867 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
868 {
869  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
870  int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
871  return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
872 }
873 
874 // rotates the least significant 32 bits into the most significant 32 bits, and
875 // shifts the rest down
876 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
877 {
878  return vreinterpretq_m128i_s32(
879  vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
880 }
881 
882 // rotates the most significant 32 bits into the least significant 32 bits, and
883 // shifts the rest up
884 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
885 {
886  return vreinterpretq_m128i_s32(
887  vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
888 }
889 
890 // gets the lower 64 bits of a, and places it in the upper 64 bits
891 // gets the lower 64 bits of a and places it in the lower 64 bits
892 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
893 {
894  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
895  return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
896 }
897 
898 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
899 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
900 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
901 {
902  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
903  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
904  return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
905 }
906 
907 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
908 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
909 // places it in the lower 64 bits
910 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
911 {
912  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
913  return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
914 }
915 
916 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
917 {
918  int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
919  int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
920  return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
921 }
922 
923 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
924 {
925  int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
926  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
927  return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
928 }
929 
930 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
931 {
932  int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
933  int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
934  return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
935 }
936 
937 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
938 // int imm)
939 #if defined(__aarch64__)
940 #define _mm_shuffle_epi32_splat(a, imm) \
941  __extension__({ \
942  vreinterpretq_m128i_s32( \
943  vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
944  })
945 #else
946 #define _mm_shuffle_epi32_splat(a, imm) \
947  __extension__({ \
948  vreinterpretq_m128i_s32( \
949  vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
950  })
951 #endif
952 
953 // NEON does not support a general purpose permute intrinsic
954 // Selects four specific single-precision, floating-point values from a and b,
955 // based on the mask i.
956 //
957 // C equivalent:
958 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
959 // __constrange(0, 255) int imm) {
960 // __m128 ret;
961 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
962 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
963 // return ret;
964 // }
965 //
966 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
967 #define _mm_shuffle_ps_default(a, b, imm) \
968  __extension__({ \
969  float32x4_t ret; \
970  ret = vmovq_n_f32( \
971  vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
972  ret = vsetq_lane_f32( \
973  vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
974  ret, 1); \
975  ret = vsetq_lane_f32( \
976  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
977  ret, 2); \
978  ret = vsetq_lane_f32( \
979  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
980  ret, 3); \
981  vreinterpretq_m128_f32(ret); \
982  })
983 
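// Example (illustrative sketch): reversing the four floats of a vector with
// the generic path above:
//
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r = _mm_shuffle_ps_default(v, v, _MM_SHUFFLE(0, 1, 2, 3));
//   /* r now holds {4.0f, 3.0f, 2.0f, 1.0f} */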
984 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
985 // by imm.
986 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
987 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
988 // __constrange(0,255) int
989 // imm)
990 #define _mm_shufflelo_epi16_function(a, imm) \
991  __extension__({ \
992  int16x8_t ret = vreinterpretq_s16_m128i(a); \
993  int16x4_t lowBits = vget_low_s16(ret); \
994  ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
995  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
996  1); \
997  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
998  2); \
999  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1000  3); \
1001  vreinterpretq_m128i_s16(ret); \
1002  })
1003 
1004 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1005 // by imm.
1006 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1007 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1008 // __constrange(0,255) int
1009 // imm)
1010 #define _mm_shufflehi_epi16_function(a, imm) \
1011  __extension__({ \
1012  int16x8_t ret = vreinterpretq_s16_m128i(a); \
1013  int16x4_t highBits = vget_high_s16(ret); \
1014  ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1015  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1016  5); \
1017  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1018  6); \
1019  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1020  7); \
1021  vreinterpretq_m128i_s16(ret); \
1022  })
1023 
1024 /* MMX */
1025 
1026 // _mm_empty is a no-op on ARM.
1027 FORCE_INLINE void _mm_empty(void) {}
1028 
1029 /* SSE */
1030 
1031 // Adds the four single-precision, floating-point values of a and b.
1032 //
1033 // r0 := a0 + b0
1034 // r1 := a1 + b1
1035 // r2 := a2 + b2
1036 // r3 := a3 + b3
1037 //
1038 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
1039 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1040 {
1041  return vreinterpretq_m128_f32(
1042  vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1043 }
1044 
1045 // adds the scalar single-precision floating point values of a and b.
1046 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1047 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1048 {
1049  float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1050  float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1051  // the upper values in the result must be the remnants of <a>.
1052  return vreinterpretq_m128_f32(vaddq_f32(a, value));
1053 }
1054 
1055 // Computes the bitwise AND of the four single-precision, floating-point values
1056 // of a and b.
1057 //
1058 // r0 := a0 & b0
1059 // r1 := a1 & b1
1060 // r2 := a2 & b2
1061 // r3 := a3 & b3
1062 //
1063 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1064 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1065 {
1066  return vreinterpretq_m128_s32(
1067  vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1068 }
1069 
1070 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1071 // values of a and b.
1072 //
1073 // r0 := ~a0 & b0
1074 // r1 := ~a1 & b1
1075 // r2 := ~a2 & b2
1076 // r3 := ~a3 & b3
1077 //
1078 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1079 FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1080 {
1081  return vreinterpretq_m128_s32(
1082  vbicq_s32(vreinterpretq_s32_m128(b),
1083  vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1084 }
1085 
1086 // Average packed unsigned 16-bit integers in a and b, and store the results in
1087 // dst.
1088 //
1089 // FOR j := 0 to 3
1090 // i := j*16
1091 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1092 // ENDFOR
1093 //
1094 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1095 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1096 {
1097  return vreinterpret_m64_u16(
1098  vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1099 }
1100 
1101 // Average packed unsigned 8-bit integers in a and b, and store the results in
1102 // dst.
1103 //
1104 // FOR j := 0 to 7
1105 // i := j*8
1106 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1107 // ENDFOR
1108 //
1109 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1110 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1111 {
1112  return vreinterpret_m64_u8(
1113  vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1114 }
1115 
1116 // Compares for equality.
1117 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1118 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1119 {
1120  return vreinterpretq_m128_u32(
1121  vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1122 }
1123 
1124 // Compares for equality.
1125 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1126 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1127 {
1128  return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1129 }
1130 
1131 // Compares for greater than or equal.
1132 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1133 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1134 {
1135  return vreinterpretq_m128_u32(
1136  vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1137 }
1138 
1139 // Compares for greater than or equal.
1140 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1141 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1142 {
1143  return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1144 }
1145 
1146 // Compares for greater than.
1147 //
1148 // r0 := (a0 > b0) ? 0xffffffff : 0x0
1149 // r1 := (a1 > b1) ? 0xffffffff : 0x0
1150 // r2 := (a2 > b2) ? 0xffffffff : 0x0
1151 // r3 := (a3 > b3) ? 0xffffffff : 0x0
1152 //
1153 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1154 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1155 {
1156  return vreinterpretq_m128_u32(
1157  vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1158 }
1159 
1160 // Compares for greater than.
1161 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1162 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1163 {
1164  return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1165 }
1166 
1167 // Compares for less than or equal.
1168 //
1169 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
1170 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
1171 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
1172 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
1173 //
1174 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1175 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1176 {
1177  return vreinterpretq_m128_u32(
1178  vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1179 }
1180 
1181 // Compares for less than or equal.
1182 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1183 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1184 {
1185  return _mm_move_ss(a, _mm_cmple_ps(a, b));
1186 }
1187 
1188 // Compares for less than
1189 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1190 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1191 {
1192  return vreinterpretq_m128_u32(
1193  vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1194 }
1195 
1196 // Compares for less than
1197 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1198 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1199 {
1200  return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1201 }
1202 
1203 // Compares for inequality.
1204 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1205 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1206 {
1207  return vreinterpretq_m128_u32(vmvnq_u32(
1208  vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1209 }
1210 
1211 // Compares for inequality.
1212 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1213 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1214 {
1215  return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1216 }
1217 
1218 // Compares for not greater than or equal.
1219 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1220 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1221 {
1222  return vreinterpretq_m128_u32(vmvnq_u32(
1223  vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1224 }
1225 
1226 // Compares for not greater than or equal.
1227 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1228 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1229 {
1230  return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1231 }
1232 
1233 // Compares for not greater than.
1234 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1235 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1236 {
1237  return vreinterpretq_m128_u32(vmvnq_u32(
1238  vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1239 }
1240 
1241 // Compares for not greater than.
1242 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1243 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1244 {
1245  return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1246 }
1247 
1248 // Compares for not less than or equal.
1249 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1250 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1251 {
1252  return vreinterpretq_m128_u32(vmvnq_u32(
1253  vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1254 }
1255 
1256 // Compares for not less than or equal.
1257 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1258 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1259 {
1260  return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1261 }
1262 
1263 // Compares for not less than.
1264 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1265 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1266 {
1267  return vreinterpretq_m128_u32(vmvnq_u32(
1268  vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1269 }
1270 
1271 // Compares for not less than.
1272 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1273 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1274 {
1275  return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1276 }
1277 
1278 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1279 // Ordered compare between each value returns true for "orderable" and false for
1280 // "not orderable" (NaN).
1281 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1282 // also:
1283 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1284 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1285 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1286 {
1287  // Note: NEON does not have ordered compare builtin
1288  // Need to compare a eq a and b eq b to check for NaN
1289  // Do AND of results to get final
1290  uint32x4_t ceqaa =
1291  vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1292  uint32x4_t ceqbb =
1293  vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1294  return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1295 }
1296 
1297 // Compares for ordered.
1298 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1299 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1300 {
1301  return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1302 }
1303 
1304 // Compares for unordered.
1305 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1306 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1307 {
1308  uint32x4_t f32a =
1309  vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1310  uint32x4_t f32b =
1311  vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1312  return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1313 }
1314 
1315 // Compares for unordered.
1316 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1317 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1318 {
1319  return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1320 }
1321 
1322 // Compares the lower single-precision floating point scalar values of a and b
1323 // using an equality operation. :
1324 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1325 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1326 {
1327  uint32x4_t a_eq_b =
1328  vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1329  return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1330 }
1331 
1332 // Compares the lower single-precision floating point scalar values of a and b
1333 // using a greater than or equal operation. :
1334 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1335 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1336 {
1337  uint32x4_t a_ge_b =
1338  vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1339  return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1340 }
1341 
1342 // Compares the lower single-precision floating point scalar values of a and b
1343 // using a greater than operation. :
1344 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1345 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1346 {
1347  uint32x4_t a_gt_b =
1348  vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1349  return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1350 }
1351 
1352 // Compares the lower single-precision floating point scalar values of a and b
1353 // using a less than or equal operation. :
1354 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1355 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1356 {
1357  uint32x4_t a_le_b =
1358  vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1359  return vgetq_lane_u32(a_le_b, 0) & 0x1;
1360 }
1361 
1362 // Compares the lower single-precision floating point scalar values of a and b
1363 // using a less than operation. :
1364 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1365 // note!! The documentation on MSDN is incorrect! If either of the values is a
1366 // NAN the docs say you will get a one, but in fact, it will return a zero!!
1367 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1368 {
1369  uint32x4_t a_lt_b =
1370  vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1371  return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1372 }
1373 
1374 // Compares the lower single-precision floating point scalar values of a and b
1375 // using an inequality operation. :
1376 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1377 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1378 {
1379  return !_mm_comieq_ss(a, b);
1380 }
1381 
1382 // Convert packed signed 32-bit integers in b to packed single-precision
1383 // (32-bit) floating-point elements, store the results in the lower 2 elements
1384 // of dst, and copy the upper 2 packed elements from a to the upper elements of
1385 // dst.
1386 //
1387 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1388 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1389 // dst[95:64] := a[95:64]
1390 // dst[127:96] := a[127:96]
1391 //
1392 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1393 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1394 {
1395  return vreinterpretq_m128_f32(
1396  vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1397  vget_high_f32(vreinterpretq_f32_m128(a))));
1398 }
1399 
1400 // Convert packed single-precision (32-bit) floating-point elements in a to
1401 // packed 32-bit integers, and store the results in dst.
1402 //
1403 // FOR j := 0 to 1
1404 // i := 32*j
1405 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1406 // ENDFOR
1407 //
1408 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
1409 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1410 {
1411 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1412  return vreinterpret_m64_s32(
1413  vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1414 #else
1415  return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1416  vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1417 #endif
1418 }
1419 
1420 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1421 // floating-point element, store the result in the lower element of dst, and
1422 // copy the upper 3 packed elements from a to the upper elements of dst.
1423 //
1424 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1425 // dst[127:32] := a[127:32]
1426 //
1427 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1428 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1429 {
1430  return vreinterpretq_m128_f32(
1431  vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1432 }
1433 
1434 // Convert the lower single-precision (32-bit) floating-point element in a to a
1435 // 32-bit integer, and store the result in dst.
1436 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
1437 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1438 {
1439 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1440  return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1441  0);
1442 #else
1443  float32_t data = vgetq_lane_f32(
1444  vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1445  return (int32_t) data;
1446 #endif
1447 }
1448 
1449 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
1450 // floating-point elements, and store the results in dst.
1451 //
1452 // FOR j := 0 to 3
1453 // i := j*16
1454 // m := j*32
1455 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1456 // ENDFOR
1457 //
1458 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1459 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1460 {
1461  return vreinterpretq_m128_f32(
1462  vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1463 }
1464 
1465 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
1466 // floating-point elements, store the results in the lower 2 elements of dst,
1467 // and copy the upper 2 packed elements from a to the upper elements of dst.
1468 //
1469 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1470 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1471 // dst[95:64] := a[95:64]
1472 // dst[127:96] := a[127:96]
1473 //
1474 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1475 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1476 {
1477  return vreinterpretq_m128_f32(
1478  vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1479  vget_high_f32(vreinterpretq_f32_m128(a))));
1480 }
1481 
1482 // Convert packed signed 32-bit integers in a to packed single-precision
1483 // (32-bit) floating-point elements, store the results in the lower 2 elements
1484 // of dst, then convert the packed signed 32-bit integers in b to
1485 // single-precision (32-bit) floating-point element, and store the results in
1486 // the upper 2 elements of dst.
1487 //
1488 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1489 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1490 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1491 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1492 //
1493 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1494 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1495 {
1496  return vreinterpretq_m128_f32(vcvtq_f32_s32(
1497  vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1498 }
1499 
1500 // Convert the lower packed 8-bit integers in a to packed single-precision
1501 // (32-bit) floating-point elements, and store the results in dst.
1502 //
1503 // FOR j := 0 to 3
1504 // i := j*8
1505 // m := j*32
1506 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1507 // ENDFOR
1508 //
1509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1510 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1511 {
1512  return vreinterpretq_m128_f32(vcvtq_f32_s32(
1513  vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1514 }
1515 
1516 // Convert packed single-precision (32-bit) floating-point elements in a to
1517 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
1518 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1519 // 0x7FFFFFFF.
1520 //
1521 // FOR j := 0 to 3
1522 // i := 16*j
1523 // k := 32*j
1524 // IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1525 // dst[i+15:i] := 0x7FFF
1526 // ELSE
1527 // dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1528 // FI
1529 // ENDFOR
1530 //
1531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
1532 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1533 {
1534  const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
1535  const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
1536  const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1537  const __m128i maxMask = _mm_castps_si128(
1538  _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1539  const __m128i betweenMask = _mm_castps_si128(
1540  _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1541  const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1542  _mm_setzero_si128());
1543  __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1544  __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1545  __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1546  __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1547  return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
1548 }
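// Illustrative usage sketch (not part of the upstream header), showing the
// saturation behaviour described above under the default round-to-nearest
// mode:
//
//   __m64 r = _mm_cvtps_pi16(_mm_set_ps(1e6f, -3.6f, 2.4f, 1.0f));
//   // The four packed 16-bit results are {1, 2, -4, 32767}; the out-of-range
//   // 1e6 input is clamped to 0x7FFF.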
1549 
1550 // Convert packed single-precision (32-bit) floating-point elements in a to
1551 // packed 32-bit integers, and store the results in dst.
1552 //
1553 // FOR j := 0 to 1
1554 // i := 32*j
1555 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1556 // ENDFOR
1557 //
1558 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
1559 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1560 
1561 // Convert packed single-precision (32-bit) floating-point elements in a to
1562 // packed 8-bit integers, and store the results in lower 4 elements of dst.
1563 // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1564 // between 0x7F and 0x7FFFFFFF.
1565 //
1566 // FOR j := 0 to 3
1567 // i := 8*j
1568 // k := 32*j
1569 // IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1570 // dst[i+7:i] := 0x7F
1571 // ELSE
1572 // dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1573 // FI
1574 // ENDFOR
1575 //
1576 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1577 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1578 {
1579  const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
1580  const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
1581  const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1582  const __m128i maxMask = _mm_castps_si128(
1583  _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1584  const __m128i betweenMask = _mm_castps_si128(
1585  _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1586  const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1587  _mm_setzero_si128());
1588  __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1589  __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1590  __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1591  __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1592  int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1593  int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1594  static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1595  int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1596 
1597  return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1598 }
1599 
1600 // Convert packed unsigned 16-bit integers in a to packed single-precision
1601 // (32-bit) floating-point elements, and store the results in dst.
1602 //
1603 // FOR j := 0 to 3
1604 // i := j*16
1605 // m := j*32
1606 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1607 // ENDFOR
1608 //
1609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
1610 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1611 {
1612  return vreinterpretq_m128_f32(
1613  vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1614 }
1615 
1616 // Convert the lower packed unsigned 8-bit integers in a to packed
1617 // single-precision (32-bit) floating-point elements, and store the results in
1618 // dst.
1619 //
1620 // FOR j := 0 to 3
1621 // i := j*8
1622 // m := j*32
1623 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1624 // ENDFOR
1625 //
1626 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
1627 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1628 {
1629  return vreinterpretq_m128_f32(vcvtq_f32_u32(
1630  vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1631 }
1632 
1633 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1634 // floating-point element, store the result in the lower element of dst, and
1635 // copy the upper 3 packed elements from a to the upper elements of dst.
1636 //
1637 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1638 // dst[127:32] := a[127:32]
1639 //
1640 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
1641 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1642 
1643 // Convert the signed 64-bit integer b to a single-precision (32-bit)
1644 // floating-point element, store the result in the lower element of dst, and
1645 // copy the upper 3 packed elements from a to the upper elements of dst.
1646 //
1647 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1648 // dst[127:32] := a[127:32]
1649 //
1650 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
1651 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1652 {
1653  return vreinterpretq_m128_f32(
1654  vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1655 }
1656 
1657 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
1658 //
1659 // dst[31:0] := a[31:0]
1660 //
1661 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
1662 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1663 {
1664  return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1665 }
1666 
1667 // Convert the lower single-precision (32-bit) floating-point element in a to a
1668 // 32-bit integer, and store the result in dst.
1669 //
1670 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1671 //
1672 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
1673 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1674 
1675 // Convert the lower single-precision (32-bit) floating-point element in a to a
1676 // 64-bit integer, and store the result in dst.
1677 //
1678 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1679 //
1680 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
1681 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1682 {
1683 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1684  return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1685 #else
1686  float32_t data = vgetq_lane_f32(
1687  vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1688  return (int64_t) data;
1689 #endif
1690 }
1691 
1692 // Convert packed single-precision (32-bit) floating-point elements in a to
1693 // packed 32-bit integers with truncation, and store the results in dst.
1694 //
1695 // FOR j := 0 to 1
1696 // i := 32*j
1697 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1698 // ENDFOR
1699 //
1700 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
1701 FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1702 {
1703  return vreinterpret_m64_s32(
1704  vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1705 }
1706 
1707 // Convert the lower single-precision (32-bit) floating-point element in a to a
1708 // 32-bit integer with truncation, and store the result in dst.
1709 //
1710 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1711 //
1712 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
1713 FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1714 {
1715  return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1716 }
1717 
1718 // Convert packed single-precision (32-bit) floating-point elements in a to
1719 // packed 32-bit integers with truncation, and store the results in dst.
1720 //
1721 // FOR j := 0 to 1
1722 // i := 32*j
1723 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1724 // ENDFOR
1725 //
1726 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
1727 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1728 
1729 // Convert the lower single-precision (32-bit) floating-point element in a to a
1730 // 32-bit integer with truncation, and store the result in dst.
1731 //
1732 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1733 //
1734 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
1735 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1736 
1737 // Convert the lower single-precision (32-bit) floating-point element in a to a
1738 // 64-bit integer with truncation, and store the result in dst.
1739 //
1740 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1741 //
1742 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
1743 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1744 {
1745  return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1746 }
1747 
1748 // Divides the four single-precision, floating-point values of a and b.
1749 //
1750 // r0 := a0 / b0
1751 // r1 := a1 / b1
1752 // r2 := a2 / b2
1753 // r3 := a3 / b3
1754 //
1755 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1756 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1757 {
1758 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1759  return vreinterpretq_m128_f32(
1760  vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1761 #else
1762  float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1763  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1764 #if SSE2NEON_PRECISE_DIV
1765  // Additional Newton-Raphson iteration for accuracy
1766  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1767 #endif
1768  return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1769 #endif
1770 }
1771 
1772 // Divides the scalar single-precision floating point value of a by b.
1773 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1774 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1775 {
1776  float32_t value =
1777  vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1778  return vreinterpretq_m128_f32(
1779  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1780 }
1781 
1782 // Extract a 16-bit integer from a, selected with imm8, and store the result in
1783 // the lower element of dst.
1784 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
1785 #define _mm_extract_pi16(a, imm) \
1786  (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1787 
1788 // Free aligned memory that was allocated with _mm_malloc.
1789 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
1790 FORCE_INLINE void _mm_free(void *addr)
1791 {
1792  free(addr);
1793 }
1794 
1795 // Macro: Get the flush zero bits from the MXCSR control and status register.
1796 // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1797 // _MM_FLUSH_ZERO_OFF
1798 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
1799 FORCE_INLINE unsigned int _MM_GET_FLUSH_ZERO_MODE()
1800 {
1801  union {
1802  fpcr_bitfield field;
1803 #if defined(__aarch64__)
1804  uint64_t value;
1805 #else
1806  uint32_t value;
1807 #endif
1808  } r;
1809 
1810 #if defined(__aarch64__)
1811  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1812 #else
1813  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1814 #endif
1815 
1816  return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1817 }
1818 
1819 // Macro: Get the rounding mode bits from the MXCSR control and status register.
1820 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1821 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1822 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
1823 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1824 {
1825  union {
1826  fpcr_bitfield field;
1827 #if defined(__aarch64__)
1828  uint64_t value;
1829 #else
1830  uint32_t value;
1831 #endif
1832  } r;
1833 
1834 #if defined(__aarch64__)
1835  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
1836 #else
1837  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1838 #endif
1839 
1840  if (r.field.bit22) {
1841  return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1842  } else {
1843  return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1844  }
1845 }
1846 
1847 // Copy a to dst, and insert the 16-bit integer i into dst at the location
1848 // specified by imm8.
1849 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
1850 #define _mm_insert_pi16(a, b, imm) \
1851  __extension__({ \
1852  vreinterpret_m64_s16( \
1853  vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1854  })
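// Illustrative usage sketch (not part of the upstream header), assuming the
// __m64 helper _mm_set_pi16 defined elsewhere in this file:
//
//   __m64 v = _mm_set_pi16(4, 3, 2, 1);        // lanes {1, 2, 3, 4}
//   int lane2 = _mm_extract_pi16(v, 2);        // 3
//   __m64 w = _mm_insert_pi16(v, 9, 0);        // lanes {9, 2, 3, 4}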
1855 
1856 // Loads four single-precision, floating-point values.
1857 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1858 FORCE_INLINE __m128 _mm_load_ps(const float *p)
1859 {
1860  return vreinterpretq_m128_f32(vld1q_f32(p));
1861 }
1862 
1863 // Load a single-precision (32-bit) floating-point element from memory into all
1864 // elements of dst.
1865 //
1866 // dst[31:0] := MEM[mem_addr+31:mem_addr]
1867 // dst[63:32] := MEM[mem_addr+31:mem_addr]
1868 // dst[95:64] := MEM[mem_addr+31:mem_addr]
1869 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1870 //
1871 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1872 #define _mm_load_ps1 _mm_load1_ps
1873 
1874 // Loads a single-precision, floating-point value into the low word and
1875 // clears the upper three words.
1876 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1877 FORCE_INLINE __m128 _mm_load_ss(const float *p)
1878 {
1879  return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1880 }
1881 
1882 // Loads a single single-precision, floating-point value, copying it into all
1883 // four words
1884 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1885 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1886 {
1887  return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1888 }
1889 
1890 // Sets the upper two single-precision, floating-point values with 64
1891 // bits of data loaded from the address p; the lower two values are passed
1892 // through from a.
1893 //
1894 // r0 := a0
1895 // r1 := a1
1896 // r2 := *p0
1897 // r3 := *p1
1898 //
1899 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
1900 FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1901 {
1902  return vreinterpretq_m128_f32(
1903  vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1904 }
1905 
1906 // Sets the lower two single-precision, floating-point values with 64
1907 // bits of data loaded from the address p; the upper two values are passed
1908 // through from a.
1909 //
1910 // Return Value
1911 // r0 := *p0
1912 // r1 := *p1
1913 // r2 := a2
1914 // r3 := a3
1915 //
1916 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
1917 FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1918 {
1919  return vreinterpretq_m128_f32(
1920  vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1921 }
1922 
1923 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1924 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1925 // general-protection exception may be generated.
1926 //
1927 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1928 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1929 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1930 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1931 //
1932 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
1933 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1934 {
1935  float32x4_t v = vrev64q_f32(vld1q_f32(p));
1936  return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1937 }
1938 
1939 // Loads four single-precision, floating-point values.
1940 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
1941 FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1942 {
1943  // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
1944  // are equivalent
1945  return vreinterpretq_m128_f32(vld1q_f32(p));
1946 }
1947 
1948 // Load unaligned 16-bit integer from memory into the first element of dst.
1949 //
1950 // dst[15:0] := MEM[mem_addr+15:mem_addr]
1951 // dst[MAX:16] := 0
1952 //
1953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
1954 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1955 {
1956  return vreinterpretq_m128i_s16(
1957  vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1958 }
1959 
1960 // Load unaligned 64-bit integer from memory into the first element of dst.
1961 //
1962 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1963 // dst[MAX:64] := 0
1964 //
1965 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
1966 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1967 {
1968  return vreinterpretq_m128i_s64(
1969  vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1970 }
1971 
1972 // Allocate aligned blocks of memory.
1973 // https://software.intel.com/en-us/
1974 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
1975 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1976 {
1977  void *ptr;
1978  if (align == 1)
1979  return malloc(size);
1980  if (align == 2 || (sizeof(void *) == 8 && align == 4))
1981  align = sizeof(void *);
1982  if (!posix_memalign(&ptr, align, size))
1983  return ptr;
1984  return NULL;
1985 }
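// Illustrative usage sketch (not part of the upstream header): pairing
// _mm_malloc with _mm_free to obtain a 16-byte-aligned buffer that is safe
// for the aligned load/store intrinsics in this file.
//
//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//   if (buf) {
//       buf[0] = 1.0f;
//       __m128 v = _mm_load_ps(buf);   // aligned load
//       _mm_store_ps(buf, v);
//       _mm_free(buf);
//   }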
1986 
1987 // Conditionally store 8-bit integer elements from a into memory using mask
1988 // (elements are not stored when the highest bit is not set in the corresponding
1989 // element) and a non-temporal memory hint.
1990 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
1991 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1992 {
1993  int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1994  __m128 b = _mm_load_ps((const float *) mem_addr);
1995  int8x8_t masked =
1996  vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1997  vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1998  vst1_s8((int8_t *) mem_addr, masked);
1999 }
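// Illustrative usage sketch (not part of the upstream header), assuming the
// __m64 helpers _mm_set1_pi8/_mm_set_pi8 defined elsewhere in this file:
//
//   char buf[16] = {0};
//   __m64 src = _mm_set1_pi8(0x55);
//   __m64 msk = _mm_set_pi8(-1, 0, -1, 0, -1, 0, -1, 0);
//   _mm_maskmove_si64(src, msk, buf);
//   // Only the bytes whose mask lane has its top bit set (the odd-numbered
//   // ones here) are written with 0x55; the rest keep their old contents.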
2000 
2001 // Conditionally store 8-bit integer elements from a into memory using mask
2002 // (elements are not stored when the highest bit is not set in the corresponding
2003 // element) and a non-temporal memory hint.
2004 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
2005 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2006 
2007 // Compare packed signed 16-bit integers in a and b, and store packed maximum
2008 // values in dst.
2009 //
2010 // FOR j := 0 to 3
2011 // i := j*16
2012 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
2013 // ENDFOR
2014 //
2015 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
2016 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
2017 {
2018  return vreinterpret_m64_s16(
2019  vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2020 }
2021 
2022 // Computes the maximums of the four single-precision, floating-point values of
2023 // a and b.
2024 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
2025 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
2026 {
2027 #if SSE2NEON_PRECISE_MINMAX
2028  float32x4_t _a = vreinterpretq_f32_m128(a);
2029  float32x4_t _b = vreinterpretq_f32_m128(b);
2030  return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
2031 #else
2032  return vreinterpretq_m128_f32(
2033  vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2034 #endif
2035 }
2036 
2037 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2038 // values in dst.
2039 //
2040 // FOR j := 0 to 7
2041 // i := j*8
2042 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
2043 // ENDFOR
2044 //
2045 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
2046 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2047 {
2048  return vreinterpret_m64_u8(
2049  vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2050 }
2051 
2052 // Computes the maximum of the two lower scalar single-precision floating point
2053 // values of a and b.
2054 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2055 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2056 {
2057  float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2058  return vreinterpretq_m128_f32(
2059  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2060 }
2061 
2062 // Compare packed signed 16-bit integers in a and b, and store packed minimum
2063 // values in dst.
2064 //
2065 // FOR j := 0 to 3
2066 // i := j*16
2067 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2068 // ENDFOR
2069 //
2070 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
2071 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2072 {
2073  return vreinterpret_m64_s16(
2074  vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2075 }
2076 
2077 // Computes the minima of the four single-precision, floating-point values of a
2078 // and b.
2079 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2080 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2081 {
2082 #if SSE2NEON_PRECISE_MINMAX
2083  float32x4_t _a = vreinterpretq_f32_m128(a);
2084  float32x4_t _b = vreinterpretq_f32_m128(b);
2085  return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2086 #else
2087  return vreinterpretq_m128_f32(
2088  vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2089 #endif
2090 }
2091 
2092 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2093 // values in dst.
2094 //
2095 // FOR j := 0 to 7
2096 // i := j*8
2097 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2098 // ENDFOR
2099 //
2100 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
2101 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2102 {
2103  return vreinterpret_m64_u8(
2104  vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2105 }
2106 
2107 // Computes the minimum of the two lower scalar single-precision floating point
2108 // values of a and b.
2109 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2110 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2111 {
2112  float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2113  return vreinterpretq_m128_f32(
2114  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2115 }
2116 
2117 // Sets the low word to the single-precision, floating-point value of b
2118 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2119 FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2120 {
2121  return vreinterpretq_m128_f32(
2122  vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2123  vreinterpretq_f32_m128(a), 0));
2124 }
2125 
2126 // Moves the upper two values of B into the lower two values of A.
2127 //
2128 // r3 := a3
2129 // r2 := a2
2130 // r1 := b3
2131 // r0 := b2
2132 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2133 {
2134  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2135  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2136  return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2137 }
2138 
2139 // Moves the lower two values of B into the upper two values of A.
2140 //
2141 // r3 := b1
2142 // r2 := b0
2143 // r1 := a1
2144 // r0 := a0
2145 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2146 {
2147  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2148  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2149  return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2150 }
2151 
2152 // Create mask from the most significant bit of each 8-bit element in a, and
2153 // store the result in dst.
2154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
2155 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2156 {
2157  uint8x8_t input = vreinterpret_u8_m64(a);
2158 #if defined(__aarch64__)
2159  static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2160  uint8x8_t tmp = vshr_n_u8(input, 7);
2161  return vaddv_u8(vshl_u8(tmp, shift));
2162 #else
2163  // Refer the implementation of `_mm_movemask_epi8`
2164  uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2165  uint32x2_t paired16 =
2166  vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2167  uint8x8_t paired32 =
2168  vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2169  return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2170 #endif
2171 }
2172 
2173 // NEON does not provide this method
2174 // Creates a 4-bit mask from the most significant bits of the four
2175 // single-precision, floating-point values.
2176 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2177 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2178 {
2179  uint32x4_t input = vreinterpretq_u32_m128(a);
2180 #if defined(__aarch64__)
2181  static const int32x4_t shift = {0, 1, 2, 3};
2182  uint32x4_t tmp = vshrq_n_u32(input, 31);
2183  return vaddvq_u32(vshlq_u32(tmp, shift));
2184 #else
2185  // Uses the exact same method as _mm_movemask_epi8, see that for details.
2186  // Shift out everything but the sign bits with a 32-bit unsigned shift
2187  // right.
2188  uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2189  // Merge the two pairs together with a 64-bit unsigned shift right + add.
2190  uint8x16_t paired =
2191  vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2192  // Extract the result.
2193  return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2194 #endif
2195 }
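// Illustrative usage sketch (not part of the upstream header), assuming
// _mm_cmpgt_ps defined elsewhere in this file: testing which lanes of a
// comparison are true via the 4-bit sign mask.
//
//   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lanes {1, 2, 3, 4}
//   int mask = _mm_movemask_ps(_mm_cmpgt_ps(x, _mm_set_ps1(2.5f)));
//   // mask == 0xC: only lanes 2 and 3 (values 3.0 and 4.0) compare greater.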
2196 
2197 // Multiplies the four single-precision, floating-point values of a and b.
2198 //
2199 // r0 := a0 * b0
2200 // r1 := a1 * b1
2201 // r2 := a2 * b2
2202 // r3 := a3 * b3
2203 //
2204 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2205 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2206 {
2207  return vreinterpretq_m128_f32(
2208  vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2209 }
2210 
2211 // Multiply the lower single-precision (32-bit) floating-point element in a and
2212 // b, store the result in the lower element of dst, and copy the upper 3 packed
2213 // elements from a to the upper elements of dst.
2214 //
2215 // dst[31:0] := a[31:0] * b[31:0]
2216 // dst[127:32] := a[127:32]
2217 //
2218 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
2219 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2220 {
2221  return _mm_move_ss(a, _mm_mul_ps(a, b));
2222 }
2223 
2224 // Multiply the packed unsigned 16-bit integers in a and b, producing
2225 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2226 // integers in dst.
2227 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
2228 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2229 {
2230  return vreinterpret_m64_u16(vshrn_n_u32(
2231  vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2232 }
2233 
2234 // Computes the bitwise OR of the four single-precision, floating-point values
2235 // of a and b.
2236 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2237 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2238 {
2239  return vreinterpretq_m128_s32(
2240  vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2241 }
2242 
2243 // Average packed unsigned 8-bit integers in a and b, and store the results in
2244 // dst.
2245 //
2246 // FOR j := 0 to 7
2247 // i := j*8
2248 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2249 // ENDFOR
2250 //
2251 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2252 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2253 
2254 // Average packed unsigned 16-bit integers in a and b, and store the results in
2255 // dst.
2256 //
2257 // FOR j := 0 to 3
2258 // i := j*16
2259 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2260 // ENDFOR
2261 //
2262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2263 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2264 
2265 // Extract a 16-bit integer from a, selected with imm8, and store the result in
2266 // the lower element of dst.
2267 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
2268 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2269 
2270 // Copy a to dst, and insert the 16-bit integer i into dst at the location
2271 // specified by imm8.
2272 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
2273 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2274 
2275 // Compare packed signed 16-bit integers in a and b, and store packed maximum
2276 // values in dst.
2277 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
2278 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2279 
2280 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2281 // values in dst.
2282 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
2283 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2284 
2285 // Compare packed signed 16-bit integers in a and b, and store packed minimum
2286 // values in dst.
2287 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
2288 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
2289 
2290 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2291 // values in dst.
2292 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
2293 #define _m_pminub(a, b) _mm_min_pu8(a, b)
2294 
2295 // Create mask from the most significant bit of each 8-bit element in a, and
2296 // store the result in dst.
2297 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
2298 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
2299 
2300 // Multiply the packed unsigned 16-bit integers in a and b, producing
2301 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2302 // integers in dst.
2303 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2304 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2305 
2306 // Loads one cache line of data from address p to a location closer to the
2307 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2308 FORCE_INLINE void _mm_prefetch(const void *p, int i)
2309 {
2310  (void) i;
2311  __builtin_prefetch(p);
2312 }
2313 
2314 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2315 // b, then horizontally sum each consecutive 8 differences to produce four
2316 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2317 // 16 bits of dst.
2318 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
2319 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2320 
2321 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2322 // in dst.
2323 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
2324 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2325 
2326 // Compute the approximate reciprocal of packed single-precision (32-bit)
2327 // floating-point elements in a, and store the results in dst. The maximum
2328 // relative error for this approximation is less than 1.5*2^-12.
2329 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
2330 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2331 {
2332  float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2333  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2334 #if SSE2NEON_PRECISE_DIV
2335  // Additional Newton-Raphson iteration for accuracy
2336  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2337 #endif
2338  return vreinterpretq_m128_f32(recip);
2339 }
2340 
2341 // Compute the approximate reciprocal of the lower single-precision (32-bit)
2342 // floating-point element in a, store the result in the lower element of dst,
2343 // and copy the upper 3 packed elements from a to the upper elements of dst. The
2344 // maximum relative error for this approximation is less than 1.5*2^-12.
2345 //
2346 // dst[31:0] := (1.0 / a[31:0])
2347 // dst[127:32] := a[127:32]
2348 //
2349 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
2350 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2351 {
2352  return _mm_move_ss(a, _mm_rcp_ps(a));
2353 }
2354 
2355 // Computes the approximations of the reciprocal square roots of the four
2356 // single-precision floating point values of in.
2357 // The current precision is 1% error.
2358 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2359 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2360 {
2361  float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2362 #if SSE2NEON_PRECISE_SQRT
2363  // Additional Newton-Raphson iteration for accuracy
2364  out = vmulq_f32(
2365  out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2366  out = vmulq_f32(
2367  out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2368 #endif
2369  return vreinterpretq_m128_f32(out);
2370 }
2371 
2372 // Compute the approximate reciprocal square root of the lower single-precision
2373 // (32-bit) floating-point element in a, store the result in the lower element
2374 // of dst, and copy the upper 3 packed elements from a to the upper elements of
2375 // dst.
2376 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2377 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2378 {
2379  return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2380 }
2381 
2382 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2383 // b, then horizontally sum each consecutive 8 differences to produce four
2384 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2385 // 16 bits of dst.
2386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
2387 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2388 {
2389  uint64x1_t t = vpaddl_u32(vpaddl_u16(
2390  vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2391  return vreinterpret_m64_u16(
2392  vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2393 }
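// Illustrative usage sketch (not part of the upstream header), assuming the
// __m64 helper _mm_set_pi8 defined elsewhere in this file:
//
//   __m64 a = _mm_set_pi8(9, 8, 7, 6, 5, 4, 3, 2);
//   __m64 b = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
//   __m64 s = _mm_sad_pu8(a, b);
//   // The low 16 bits of s hold |9-1| + |8-1| + ... + |2-1| = 36.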
2394 
2395 // Macro: Set the flush zero bits of the MXCSR control and status register to
2396 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
2397 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
2399 FORCE_INLINE void _MM_SET_FLUSH_ZERO_MODE(unsigned int flag)
2400 {
2401  // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2402  // regardless of the value of the FZ bit.
2403  union {
2404  fpcr_bitfield field;
2405 #if defined(__aarch64__)
2406  uint64_t value;
2407 #else
2408  uint32_t value;
2409 #endif
2410  } r;
2411 
2412 #if defined(__aarch64__)
2413  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2414 #else
2415  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2416 #endif
2417 
2418  r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2419 
2420 #if defined(__aarch64__)
2421  __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2422 #else
2423  __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2424 #endif
2425 }
2426 
2427 // Sets the four single-precision, floating-point values to the four inputs.
2428 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2429 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2430 {
2431  float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2432  return vreinterpretq_m128_f32(vld1q_f32(data));
2433 }
2434 
2435 // Sets the four single-precision, floating-point values to w.
2436 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2437 FORCE_INLINE __m128 _mm_set_ps1(float _w)
2438 {
2439  return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2440 }
2441 
2442 // Macro: Set the rounding mode bits of the MXCSR control and status register to
2443 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
2444 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2445 // _MM_ROUND_TOWARD_ZERO
2446 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
2447 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2448 {
2449  union {
2450  fpcr_bitfield field;
2451 #if defined(__aarch64__)
2452  uint64_t value;
2453 #else
2454  uint32_t value;
2455 #endif
2456  } r;
2457 
2458 #if defined(__aarch64__)
2459  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
2460 #else
2461  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2462 #endif
2463 
2464  switch (rounding) {
2465  case _MM_ROUND_TOWARD_ZERO:
2466  r.field.bit22 = 1;
2467  r.field.bit23 = 1;
2468  break;
2469  case _MM_ROUND_DOWN:
2470  r.field.bit22 = 0;
2471  r.field.bit23 = 1;
2472  break;
2473  case _MM_ROUND_UP:
2474  r.field.bit22 = 1;
2475  r.field.bit23 = 0;
2476  break;
2477  default: //_MM_ROUND_NEAREST
2478  r.field.bit22 = 0;
2479  r.field.bit23 = 0;
2480  }
2481 
2482 #if defined(__aarch64__)
2483  __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
2484 #else
2485  __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2486 #endif
2487 }
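// Illustrative usage sketch (not part of the upstream header): switching to
// truncation for conversions that honour the current rounding mode, then
// restoring the previous mode.
//
//   unsigned int saved = _MM_GET_ROUNDING_MODE();
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//   int i = _mm_cvtss_si32(_mm_set_ss(2.75f));   // 2 under truncation
//   _MM_SET_ROUNDING_MODE(saved);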
2488 
2489 // Copy single-precision (32-bit) floating-point element a to the lower element
2490 // of dst, and zero the upper 3 elements.
2491 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
2492 FORCE_INLINE __m128 _mm_set_ss(float a)
2493 {
2494  float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2495  return vreinterpretq_m128_f32(vld1q_f32(data));
2496 }
2497 
2498 // Sets the four single-precision, floating-point values to w.
2499 //
2500 // r0 := r1 := r2 := r3 := w
2501 //
2502 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2503 FORCE_INLINE __m128 _mm_set1_ps(float _w)
2504 {
2505  return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2506 }
2507 
2508 // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2509 FORCE_INLINE void _mm_setcsr(unsigned int a)
2510 {
2511  _MM_SET_ROUNDING_MODE(a);
2512 }
2513 
2514 // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2515 FORCE_INLINE unsigned int _mm_getcsr()
2516 {
2517  return _MM_GET_ROUNDING_MODE();
2518 }
2519 
2520 // Sets the four single-precision, floating-point values to the four inputs in
2521 // reverse order.
2522 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2523 FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2524 {
2525  float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2526  return vreinterpretq_m128_f32(vld1q_f32(data));
2527 }
2528 
2529 // Clears the four single-precision, floating-point values.
2530 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2531 FORCE_INLINE __m128 _mm_setzero_ps(void)
2532 {
2533  return vreinterpretq_m128_f32(vdupq_n_f32(0));
2534 }
2535 
2536 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2537 // in dst.
2538 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2539 #if __has_builtin(__builtin_shufflevector)
2540 #define _mm_shuffle_pi16(a, imm) \
2541  __extension__({ \
2542  vreinterpret_m64_s16(__builtin_shufflevector( \
2543  vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2544  ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2545  })
2546 #else
2547 #define _mm_shuffle_pi16(a, imm) \
2548  __extension__({ \
2549  int16x4_t ret; \
2550  ret = \
2551  vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2552  ret = vset_lane_s16( \
2553  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2554  1); \
2555  ret = vset_lane_s16( \
2556  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2557  2); \
2558  ret = vset_lane_s16( \
2559  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2560  3); \
2561  vreinterpret_m64_s16(ret); \
2562  })
2563 #endif
2564 
2565 // Guarantees that every preceding store is globally visible before any
2566 // subsequent store.
2567 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
2568 FORCE_INLINE void _mm_sfence(void)
2569 {
2570  __sync_synchronize();
2571 }
2572 
2573 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2574 // int imm)
2575 #if __has_builtin(__builtin_shufflevector)
2576 #define _mm_shuffle_ps(a, b, imm) \
2577  __extension__({ \
2578  float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2579  float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2580  float32x4_t _shuf = __builtin_shufflevector( \
2581  _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2582  (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2583  vreinterpretq_m128_f32(_shuf); \
2584  })
2585 #else // generic
2586 #define _mm_shuffle_ps(a, b, imm) \
2587  __extension__({ \
2588  __m128 ret; \
2589  switch (imm) { \
2590  case _MM_SHUFFLE(1, 0, 3, 2): \
2591  ret = _mm_shuffle_ps_1032((a), (b)); \
2592  break; \
2593  case _MM_SHUFFLE(2, 3, 0, 1): \
2594  ret = _mm_shuffle_ps_2301((a), (b)); \
2595  break; \
2596  case _MM_SHUFFLE(0, 3, 2, 1): \
2597  ret = _mm_shuffle_ps_0321((a), (b)); \
2598  break; \
2599  case _MM_SHUFFLE(2, 1, 0, 3): \
2600  ret = _mm_shuffle_ps_2103((a), (b)); \
2601  break; \
2602  case _MM_SHUFFLE(1, 0, 1, 0): \
2603  ret = _mm_movelh_ps((a), (b)); \
2604  break; \
2605  case _MM_SHUFFLE(1, 0, 0, 1): \
2606  ret = _mm_shuffle_ps_1001((a), (b)); \
2607  break; \
2608  case _MM_SHUFFLE(0, 1, 0, 1): \
2609  ret = _mm_shuffle_ps_0101((a), (b)); \
2610  break; \
2611  case _MM_SHUFFLE(3, 2, 1, 0): \
2612  ret = _mm_shuffle_ps_3210((a), (b)); \
2613  break; \
2614  case _MM_SHUFFLE(0, 0, 1, 1): \
2615  ret = _mm_shuffle_ps_0011((a), (b)); \
2616  break; \
2617  case _MM_SHUFFLE(0, 0, 2, 2): \
2618  ret = _mm_shuffle_ps_0022((a), (b)); \
2619  break; \
2620  case _MM_SHUFFLE(2, 2, 0, 0): \
2621  ret = _mm_shuffle_ps_2200((a), (b)); \
2622  break; \
2623  case _MM_SHUFFLE(3, 2, 0, 2): \
2624  ret = _mm_shuffle_ps_3202((a), (b)); \
2625  break; \
2626  case _MM_SHUFFLE(3, 2, 3, 2): \
2627  ret = _mm_movehl_ps((b), (a)); \
2628  break; \
2629  case _MM_SHUFFLE(1, 1, 3, 3): \
2630  ret = _mm_shuffle_ps_1133((a), (b)); \
2631  break; \
2632  case _MM_SHUFFLE(2, 0, 1, 0): \
2633  ret = _mm_shuffle_ps_2010((a), (b)); \
2634  break; \
2635  case _MM_SHUFFLE(2, 0, 0, 1): \
2636  ret = _mm_shuffle_ps_2001((a), (b)); \
2637  break; \
2638  case _MM_SHUFFLE(2, 0, 3, 2): \
2639  ret = _mm_shuffle_ps_2032((a), (b)); \
2640  break; \
2641  default: \
2642  ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2643  break; \
2644  } \
2645  ret; \
2646  })
2647 #endif
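// Illustrative usage sketch (not part of the upstream header): _MM_SHUFFLE
// picks two lanes from a for the low half of the result and two lanes from b
// for the high half.
//
//   __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);   // lanes {0, 1, 2, 3}
//   __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);   // lanes {4, 5, 6, 7}
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r lanes are {0, 1, 6, 7}: a[0], a[1], b[2], b[3].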
2648 
2649 // Computes the approximations of square roots of the four single-precision,
2650 // floating-point values of a. First computes reciprocal square roots and then
2651 // reciprocals of the four values.
2652 //
2653 // r0 := sqrt(a0)
2654 // r1 := sqrt(a1)
2655 // r2 := sqrt(a2)
2656 // r3 := sqrt(a3)
2657 //
2658 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2659 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2660 {
2661 #if SSE2NEON_PRECISE_SQRT
2662  float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2663 
2664  // Test for vrsqrteq_f32(0) -> positive infinity case.
2665  // Change to zero, so that s * 1/sqrt(s) result is zero too.
2666  const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2667  const uint32x4_t div_by_zero =
2668  vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2669  recip = vreinterpretq_f32_u32(
2670  vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2671 
2672  // Additional Newton-Raphson iteration for accuracy
2673  recip = vmulq_f32(
2674  vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2675  recip);
2676  recip = vmulq_f32(
2677  vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2678  recip);
2679 
2680  // sqrt(s) = s * 1/sqrt(s)
2681  return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2682 #elif defined(__aarch64__)
2683  return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2684 #else
2685  float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2686  float32x4_t sq = vrecpeq_f32(recipsq);
2687  return vreinterpretq_m128_f32(sq);
2688 #endif
2689 }
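// Illustrative usage sketch (not part of the upstream header):
//
//   __m128 r = _mm_sqrt_ps(_mm_set_ps(16.0f, 9.0f, 4.0f, 1.0f));
//   // r is approximately {1, 2, 3, 4}; exactness depends on whether the
//   // __aarch64__ or SSE2NEON_PRECISE_SQRT path above is taken.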
2690 
2691 // Computes the approximation of the square root of the scalar single-precision
2692 // floating point value of in.
2693 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2694 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2695 {
2696  float32_t value =
2697  vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2698  return vreinterpretq_m128_f32(
2699  vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2700 }
2701 
2702 // Stores four single-precision, floating-point values.
2703 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2704 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2705 {
2706  vst1q_f32(p, vreinterpretq_f32_m128(a));
2707 }
2708 
2709 // Store the lower single-precision (32-bit) floating-point element from a into
2710 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2711 // boundary or a general-protection exception may be generated.
2712 //
2713 // MEM[mem_addr+31:mem_addr] := a[31:0]
2714 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2715 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2716 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2717 //
2718 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2719 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2720 {
2721  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2722  vst1q_f32(p, vdupq_n_f32(a0));
2723 }
2724 
2725 // Stores the lower single-precision, floating-point value.
2726 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2727 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2728 {
2729  vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2730 }
2731 
2732 // Store the lower single-precision (32-bit) floating-point element from a into
2733 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2734 // boundary or a general-protection exception may be generated.
2735 //
2736 // MEM[mem_addr+31:mem_addr] := a[31:0]
2737 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2738 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2739 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2740 //
2741 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2742 #define _mm_store1_ps _mm_store_ps1
2743 
2744 // Stores the upper two single-precision, floating-point values of a to the
2745 // address p.
2746 //
2747 // *p0 := a2
2748 // *p1 := a3
2749 //
2750 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2751 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2752 {
2753  *p = vreinterpret_m64_f32(vget_high_f32(a));
2754 }
2755 
2756 // Stores the lower two single-precision floating point values of a to the
2757 // address p.
2758 //
2759 // *p0 := a0
2760 // *p1 := a1
2761 //
2762 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2763 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2764 {
2765  *p = vreinterpret_m64_f32(vget_low_f32(a));
2766 }
2767 
2768 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2769 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2770 // general-protection exception may be generated.
2771 //
2772 // MEM[mem_addr+31:mem_addr] := a[127:96]
2773 // MEM[mem_addr+63:mem_addr+32] := a[95:64]
2774 // MEM[mem_addr+95:mem_addr+64] := a[63:32]
2775 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2776 //
2777 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2778 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2779 {
2780  float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2781  float32x4_t rev = vextq_f32(tmp, tmp, 2);
2782  vst1q_f32(p, rev);
2783 }
2784 
2785 // Stores four single-precision, floating-point values.
2786 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2787 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2788 {
2789  vst1q_f32(p, vreinterpretq_f32_m128(a));
2790 }
2791 
2792 // Stores 16-bits of integer data a at the address p.
2793 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2794 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2795 {
2796  vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2797 }
2798 
2799 // Stores 64-bits of integer data a at the address p.
2800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2801 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2802 {
2803  vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2804 }
2805 
2806 // Store 64-bits of integer data from a into memory using a non-temporal memory
2807 // hint.
2808 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2809 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2810 {
2811  vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2812 }
2813 
2814 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2815 // point elements) from a into memory using a non-temporal memory hint.
2816 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2817 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2818 {
2819 #if __has_builtin(__builtin_nontemporal_store)
2820  __builtin_nontemporal_store(a, (float32x4_t *) p);
2821 #else
2822  vst1q_f32(p, vreinterpretq_f32_m128(a));
2823 #endif
2824 }
2825 
2826 // Subtracts the four single-precision, floating-point values of a and b.
2827 //
2828 // r0 := a0 - b0
2829 // r1 := a1 - b1
2830 // r2 := a2 - b2
2831 // r3 := a3 - b3
2832 //
2833 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2834 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2835 {
2836  return vreinterpretq_m128_f32(
2837  vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2838 }
2839 
2840 // Subtract the lower single-precision (32-bit) floating-point element in b from
2841 // the lower single-precision (32-bit) floating-point element in a, store the
2842 // result in the lower element of dst, and copy the upper 3 packed elements from
2843 // a to the upper elements of dst.
2844 //
2845 // dst[31:0] := a[31:0] - b[31:0]
2846 // dst[127:32] := a[127:32]
2847 //
2848 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2849 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2850 {
2851  return _mm_move_ss(a, _mm_sub_ps(a, b));
2852 }
2853 
2854 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2855 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2856 // transposed matrix in these vectors (row0 now contains column 0, etc.).
2857 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2858 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2859  do { \
2860  float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2861  float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2862  row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2863  vget_low_f32(ROW23.val[0])); \
2864  row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2865  vget_low_f32(ROW23.val[1])); \
2866  row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2867  vget_high_f32(ROW23.val[0])); \
2868  row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2869  vget_high_f32(ROW23.val[1])); \
2870  } while (0)
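// Illustrative usage sketch (not part of the upstream header): transposing a
// 4x4 matrix held in four row vectors in place.
//
//   __m128 r0 = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
//   __m128 r1 = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
//   __m128 r2 = _mm_set_ps(11.0f, 10.0f, 9.0f, 8.0f);
//   __m128 r3 = _mm_set_ps(15.0f, 14.0f, 13.0f, 12.0f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 now holds the first column {0, 4, 8, 12}, r1 the second, and so on.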
2871 
2872 // according to the documentation, these intrinsics behave the same as the
2873 // non-'u' versions. We'll just alias them here.
2874 #define _mm_ucomieq_ss _mm_comieq_ss
2875 #define _mm_ucomige_ss _mm_comige_ss
2876 #define _mm_ucomigt_ss _mm_comigt_ss
2877 #define _mm_ucomile_ss _mm_comile_ss
2878 #define _mm_ucomilt_ss _mm_comilt_ss
2879 #define _mm_ucomineq_ss _mm_comineq_ss
2880 
2881 // Return vector of type __m128i with undefined elements.
2882 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2883 FORCE_INLINE __m128i _mm_undefined_si128(void)
2884 {
2885 #if defined(__GNUC__) || defined(__clang__)
2886 #pragma GCC diagnostic push
2887 #pragma GCC diagnostic ignored "-Wuninitialized"
2888 #endif
2889  __m128i a;
2890  return a;
2891 #if defined(__GNUC__) || defined(__clang__)
2892 #pragma GCC diagnostic pop
2893 #endif
2894 }
2895 
2896 // Return vector of type __m128 with undefined elements.
2897 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2898 FORCE_INLINE __m128 _mm_undefined_ps(void)
2899 {
2900 #if defined(__GNUC__) || defined(__clang__)
2901 #pragma GCC diagnostic push
2902 #pragma GCC diagnostic ignored "-Wuninitialized"
2903 #endif
2904  __m128 a;
2905  return a;
2906 #if defined(__GNUC__) || defined(__clang__)
2907 #pragma GCC diagnostic pop
2908 #endif
2909 }
2910 
2911 // Selects and interleaves the upper two single-precision, floating-point values
2912 // from a and b.
2913 //
2914 // r0 := a2
2915 // r1 := b2
2916 // r2 := a3
2917 // r3 := b3
2918 //
2919 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2920 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2921 {
2922 #if defined(__aarch64__)
2923  return vreinterpretq_m128_f32(
2924  vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2925 #else
2926  float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2927  float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2928  float32x2x2_t result = vzip_f32(a1, b1);
2929  return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2930 #endif
2931 }
2932 
2933 // Selects and interleaves the lower two single-precision, floating-point values
2934 // from a and b.
2935 //
2936 // r0 := a0
2937 // r1 := b0
2938 // r2 := a1
2939 // r3 := b1
2940 //
2941 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2942 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2943 {
2944 #if defined(__aarch64__)
2945  return vreinterpretq_m128_f32(
2946  vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2947 #else
2948  float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2949  float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2950  float32x2x2_t result = vzip_f32(a1, b1);
2951  return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2952 #endif
2953 }
2954 
2955 // Computes the bitwise XOR (exclusive OR) of the four single-precision,
2956 // floating-point values of a and b.
2957 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2958 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2959 {
2960  return vreinterpretq_m128_s32(
2961  veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2962 }
2963 
2964 /* SSE2 */
2965 
2966 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2967 // unsigned 16-bit integers in b.
2968 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2969 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2970 {
2971  return vreinterpretq_m128i_s16(
2972  vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2973 }
2974 
2975 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2976 // unsigned 32-bit integers in b.
2977 //
2978 // r0 := a0 + b0
2979 // r1 := a1 + b1
2980 // r2 := a2 + b2
2981 // r3 := a3 + b3
2982 //
2983 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2984 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2985 {
2986  return vreinterpretq_m128i_s32(
2987  vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2988 }
2989 
2990 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2991 // unsigned 64-bit integers in b.
2992 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2993 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2994 {
2995  return vreinterpretq_m128i_s64(
2996  vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2997 }
2998 
2999 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3000 // unsigned 8-bit integers in b.
3001 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
3002 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3003 {
3004  return vreinterpretq_m128i_s8(
3005  vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3006 }
3007 
3008 // Add packed double-precision (64-bit) floating-point elements in a and b, and
3009 // store the results in dst.
3010 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
3011 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
3012 {
3013 #if defined(__aarch64__)
3014  return vreinterpretq_m128d_f64(
3015  vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3016 #else
3017  double *da = (double *) &a;
3018  double *db = (double *) &b;
3019  double c[2];
3020  c[0] = da[0] + db[0];
3021  c[1] = da[1] + db[1];
3022  return vld1q_f32((float32_t *) c);
3023 #endif
3024 }
3025 
3026 // Add the lower double-precision (64-bit) floating-point element in a and b,
3027 // store the result in the lower element of dst, and copy the upper element from
3028 // a to the upper element of dst.
3029 //
3030 // dst[63:0] := a[63:0] + b[63:0]
3031 // dst[127:64] := a[127:64]
3032 //
3033 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
3034 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
3035 {
3036 #if defined(__aarch64__)
3037  return _mm_move_sd(a, _mm_add_pd(a, b));
3038 #else
3039  double *da = (double *) &a;
3040  double *db = (double *) &b;
3041  double c[2];
3042  c[0] = da[0] + db[0];
3043  c[1] = da[1];
3044  return vld1q_f32((float32_t *) c);
3045 #endif
3046 }
3047 
3048 // Add 64-bit integers a and b, and store the result in dst.
3049 //
3050 // dst[63:0] := a[63:0] + b[63:0]
3051 //
3052 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
3053 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3054 {
3055  return vreinterpret_m64_s64(
3056  vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3057 }
3058 
3059 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3060 // and saturates.
3061 //
3062 // r0 := SignedSaturate(a0 + b0)
3063 // r1 := SignedSaturate(a1 + b1)
3064 // ...
3065 // r7 := SignedSaturate(a7 + b7)
3066 //
3067 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3068 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3069 {
3070  return vreinterpretq_m128i_s16(
3071  vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3072 }
3073 
3074 // Add packed signed 8-bit integers in a and b using saturation, and store the
3075 // results in dst.
3076 //
3077 // FOR j := 0 to 15
3078 // i := j*8
3079 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3080 // ENDFOR
3081 //
3082 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3083 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3084 {
3085  return vreinterpretq_m128i_s8(
3086  vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3087 }
3088 
3089 // Add packed unsigned 16-bit integers in a and b using saturation, and store
3090 // the results in dst.
3091 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
3092 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3093 {
3094  return vreinterpretq_m128i_u16(
3095  vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3096 }
3097 
3098 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3099 // b and saturates.
3100 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3101 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3102 {
3103  return vreinterpretq_m128i_u8(
3104  vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3105 }
3106 
3107 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
3108 // elements in a and b, and store the results in dst.
3109 //
3110 // FOR j := 0 to 1
3111 // i := j*64
3112 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3113 // ENDFOR
3114 //
3115 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3116 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3117 {
3118  return vreinterpretq_m128d_s64(
3119  vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3120 }
3121 
3122 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3123 // b.
3124 //
3125 // r := a & b
3126 //
3127 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3128 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3129 {
3130  return vreinterpretq_m128i_s32(
3131  vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3132 }
3133 
3134 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3135 // elements in a and then AND with b, and store the results in dst.
3136 //
3137 // FOR j := 0 to 1
3138 // i := j*64
3139 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3140 // ENDFOR
3141 //
3142 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3143 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3144 {
3145  // *NOTE* argument swap
3146  return vreinterpretq_m128d_s64(
3147  vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3148 }
3149 
3150 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3151 // 128-bit value in a.
3152 //
3153 // r := (~a) & b
3154 //
3155 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3157 {
3158  return vreinterpretq_m128i_s32(
3159  vbicq_s32(vreinterpretq_s32_m128i(b),
3160  vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3161 }
3162 
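// --- Illustrative usage sketch (not part of the upstream sse2neon header; the
// helper name is made up). _mm_andnot_si128 computes (~a) & b, i.e. "b with
// the bits of a cleared". Note the operand order: the *first* argument is the
// one that gets inverted, which is why the implementation above swaps the
// arguments of vbicq_s32 (vbicq computes first & ~second).
static inline __m128i sse2neon_example_andnot_si128(__m128i value)
{
    __m128i low_byte_mask = vreinterpretq_m128i_s32(vdupq_n_s32(0x000000FF));
    // Clear the low byte of every 32-bit lane: (~mask) & value.
    return _mm_andnot_si128(low_byte_mask, value);
}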
3163 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3164 // unsigned 16-bit integers in b and rounds.
3165 //
3166 // r0 := (a0 + b0) / 2
3167 // r1 := (a1 + b1) / 2
3168 // ...
3169 // r7 := (a7 + b7) / 2
3170 //
3171 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3172 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3173 {
3174  return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3175  vreinterpretq_u16_m128i(b));
3176 }
3177 
3178 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3179 // unsigned 8-bit integers in b and rounds.
3180 //
3181 // r0 := (a0 + b0) / 2
3182 // r1 := (a1 + b1) / 2
3183 // ...
3184 // r15 := (a15 + b15) / 2
3185 //
3186 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3187 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3188 {
3189  return vreinterpretq_m128i_u8(
3190  vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3191 }
3192 
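// --- Illustrative note (not part of the upstream sse2neon header; the helper
// name is made up). Both _mm_avg_epu8 and _mm_avg_epu16 compute a *rounding*
// average, (a + b + 1) >> 1, which is exactly what the NEON vrhaddq_*
// instructions do, so the average of 1 and 2 is 2, not 1.
static inline __m128i sse2neon_example_avg_epu8(void)
{
    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(1));
    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(2));
    return _mm_avg_epu8(a, b); // every lane is (1 + 2 + 1) >> 1 = 2
}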
3193 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
3194 // dst.
3195 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3196 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3197 
3198 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
3199 // dst.
3200 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3201 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3202 
3203 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3204 // compilation and does not generate any instructions, thus it has zero latency.
3205 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3206 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3207 {
3208  return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3209 }
3210 
3211 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3212 // compilation and does not generate any instructions, thus it has zero latency.
3213 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3214 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3215 {
3216  return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3217 }
3218 
3219 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3220 // compilation and does not generate any instructions, thus it has zero latency.
3221 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3222 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3223 {
3224  return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3225 }
3226 
3227 // Applies a type cast to reinterpret four 32-bit floating point values passed
3228 // in as a 128-bit parameter as packed 32-bit integers.
3229 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
3230 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3231 {
3232  return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3233 }
3234 
3235 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3236 // compilation and does not generate any instructions, thus it has zero latency.
3237 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3238 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3239 {
3240 #if defined(__aarch64__)
3241  return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3242 #else
3243  return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3244 #endif
3245 }
3246 
3247 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3248 // 128-bit parameter as packed 32-bit floating point values.
3249 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
3250 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3251 {
3252  return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3253 }
3254 
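// --- Illustrative usage sketch (not part of the upstream sse2neon header; the
// helper name is made up). The _mm_cast* intrinsics only relabel the type; no
// value conversion happens. The integer 1 below keeps its bit pattern
// 0x00000001, which viewed as a float is a tiny denormal, whereas
// _mm_cvtepi32_ps (further down in this file) would produce 1.0f.
static inline __m128 sse2neon_example_castsi128_ps(void)
{
    __m128i ones = vreinterpretq_m128i_s32(vdupq_n_s32(1));
    return _mm_castsi128_ps(ones); // same 128 bits, now typed as floats
}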
3255 // Cache line containing p is flushed and invalidated from all caches in the
3256 // coherency domain. :
3257 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3258 FORCE_INLINE void _mm_clflush(void const *p)
3259 {
3260  (void) p;
3261  // no direct NEON counterpart, so this is a no-op
3262 }
3263 
3264 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3265 // unsigned 16-bit integers in b for equality.
3266 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3267 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3268 {
3269  return vreinterpretq_m128i_u16(
3270  vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3271 }
3272 
3273 // Compare packed 32-bit integers in a and b for equality, and store the results
3274 // in dst
3275 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3276 {
3277  return vreinterpretq_m128i_u32(
3278  vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3279 }
3280 
3281 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3282 // unsigned 8-bit integers in b for equality.
3283 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3284 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3285 {
3286  return vreinterpretq_m128i_u8(
3287  vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3288 }
3289 
3290 // Compare packed double-precision (64-bit) floating-point elements in a and b
3291 // for equality, and store the results in dst.
3292 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3293 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3294 {
3295 #if defined(__aarch64__)
3296  return vreinterpretq_m128d_u64(
3297  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3298 #else
3299  // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3300  uint32x4_t cmp =
3301  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3302  uint32x4_t swapped = vrev64q_u32(cmp);
3303  return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3304 #endif
3305 }
3306 
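// --- Clarifying note (not part of the upstream sse2neon header) ---
// The ARMv7 fallback above has no 64-bit floating-point compare, so it checks
// the two 32-bit halves of each double with vceqq_u32 and then ANDs each half
// with its vrev64q_u32-swapped neighbour: a 64-bit lane becomes all-ones only
// if both of its 32-bit halves compared equal. Because this is a bitwise
// comparison, it treats +0.0 and -0.0 as different and two identical NaN bit
// patterns as equal, unlike a true IEEE comparison.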
3307 // Compare the lower double-precision (64-bit) floating-point elements in a and
3308 // b for equality, store the result in the lower element of dst, and copy the
3309 // upper element from a to the upper element of dst.
3310 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3311 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3312 {
3313  return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3314 }
3315 
3316 // Compare packed double-precision (64-bit) floating-point elements in a and b
3317 // for greater-than-or-equal, and store the results in dst.
3318 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3319 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3320 {
3321 #if defined(__aarch64__)
3322  return vreinterpretq_m128d_u64(
3323  vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3324 #else
3325  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3326  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3327  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3328  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3329  uint64_t d[2];
3330  d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3331  d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3332 
3333  return vreinterpretq_m128d_u64(vld1q_u64(d));
3334 #endif
3335 }
3336 
3337 // Compare the lower double-precision (64-bit) floating-point elements in a and
3338 // b for greater-than-or-equal, store the result in the lower element of dst,
3339 // and copy the upper element from a to the upper element of dst.
3340 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3341 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3342 {
3343 #if defined(__aarch64__)
3344  return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3345 #else
3346  // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3347  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3348  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3349  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3350  uint64_t d[2];
3351  d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3352  d[1] = a1;
3353 
3354  return vreinterpretq_m128d_u64(vld1q_u64(d));
3355 #endif
3356 }
3357 
3358 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3359 // in b for greater than.
3360 //
3361 // r0 := (a0 > b0) ? 0xffff : 0x0
3362 // r1 := (a1 > b1) ? 0xffff : 0x0
3363 // ...
3364 // r7 := (a7 > b7) ? 0xffff : 0x0
3365 //
3366 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3367 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3368 {
3369  return vreinterpretq_m128i_u16(
3370  vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3371 }
3372 
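// --- Illustrative usage sketch (not part of the upstream sse2neon header; the
// helper name is made up). The integer comparisons return all-ones/all-zero
// lane masks, which are typically consumed by bitwise ops for branchless
// selection. This sketch keeps only the lanes of v that are greater than zero.
static inline __m128i sse2neon_example_keep_positive_s16(__m128i v)
{
    __m128i zero = vreinterpretq_m128i_s16(vdupq_n_s16(0));
    __m128i gt_mask = _mm_cmpgt_epi16(v, zero); // 0xFFFF where v > 0
    return _mm_and_si128(gt_mask, v);           // other lanes become 0
}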
3373 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3374 // in b for greater than.
3375 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3376 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3377 {
3378  return vreinterpretq_m128i_u32(
3379  vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3380 }
3381 
3382 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3383 // in b for greater than.
3384 //
3385 // r0 := (a0 > b0) ? 0xff : 0x0
3386 // r1 := (a1 > b1) ? 0xff : 0x0
3387 // ...
3388 // r15 := (a15 > b15) ? 0xff : 0x0
3389 //
3390 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3391 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3392 {
3393  return vreinterpretq_m128i_u8(
3394  vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3395 }
3396 
3397 // Compare packed double-precision (64-bit) floating-point elements in a and b
3398 // for greater-than, and store the results in dst.
3399 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3400 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3401 {
3402 #if defined(__aarch64__)
3403  return vreinterpretq_m128d_u64(
3404  vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3405 #else
3406  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3407  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3408  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3409  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3410  uint64_t d[2];
3411  d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3412  d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3413 
3414  return vreinterpretq_m128d_u64(vld1q_u64(d));
3415 #endif
3416 }
3417 
3418 // Compare the lower double-precision (64-bit) floating-point elements in a and
3419 // b for greater-than, store the result in the lower element of dst, and copy
3420 // the upper element from a to the upper element of dst.
3421 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3422 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3423 {
3424 #if defined(__aarch64__)
3425  return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3426 #else
3427  // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3428  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3429  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3430  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3431  uint64_t d[2];
3432  d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3433  d[1] = a1;
3434 
3435  return vreinterpretq_m128d_u64(vld1q_u64(d));
3436 #endif
3437 }
3438 
3439 // Compare packed double-precision (64-bit) floating-point elements in a and b
3440 // for less-than-or-equal, and store the results in dst.
3441 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3442 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3443 {
3444 #if defined(__aarch64__)
3445  return vreinterpretq_m128d_u64(
3446  vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3447 #else
3448  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3449  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3450  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3451  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3452  uint64_t d[2];
3453  d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3454  d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3455 
3456  return vreinterpretq_m128d_u64(vld1q_u64(d));
3457 #endif
3458 }
3459 
3460 // Compare the lower double-precision (64-bit) floating-point elements in a and
3461 // b for less-than-or-equal, store the result in the lower element of dst, and
3462 // copy the upper element from a to the upper element of dst.
3463 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3464 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3465 {
3466 #if defined(__aarch64__)
3467  return _mm_move_sd(a, _mm_cmple_pd(a, b));
3468 #else
3469  // expand "_mm_cmple_pd()" to reduce unnecessary operations
3470  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3471  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3472  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3473  uint64_t d[2];
3474  d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3475  d[1] = a1;
3476 
3477  return vreinterpretq_m128d_u64(vld1q_u64(d));
3478 #endif
3479 }
3480 
3481 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3482 // in b for less than.
3483 //
3484 // r0 := (a0 < b0) ? 0xffff : 0x0
3485 // r1 := (a1 < b1) ? 0xffff : 0x0
3486 // ...
3487 // r7 := (a7 < b7) ? 0xffff : 0x0
3488 //
3489 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3490 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3491 {
3492  return vreinterpretq_m128i_u16(
3493  vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3494 }
3495 
3496 
3497 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3498 // in b for less than.
3499 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3500 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3501 {
3502  return vreinterpretq_m128i_u32(
3503  vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3504 }
3505 
3506 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3507 // in b for less than.
3508 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3509 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3510 {
3511  return vreinterpretq_m128i_u8(
3512  vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3513 }
3514 
3515 // Compare packed double-precision (64-bit) floating-point elements in a and b
3516 // for less-than, and store the results in dst.
3517 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3518 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3519 {
3520 #if defined(__aarch64__)
3521  return vreinterpretq_m128d_u64(
3522  vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3523 #else
3524  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3525  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3526  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3527  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3528  uint64_t d[2];
3529  d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3530  d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3531 
3532  return vreinterpretq_m128d_u64(vld1q_u64(d));
3533 #endif
3534 }
3535 
3536 // Compare the lower double-precision (64-bit) floating-point elements in a and
3537 // b for less-than, store the result in the lower element of dst, and copy the
3538 // upper element from a to the upper element of dst.
3539 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3540 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3541 {
3542 #if defined(__aarch64__)
3543  return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3544 #else
3545  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3546  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3547  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3548  uint64_t d[2];
3549  d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3550  d[1] = a1;
3551 
3552  return vreinterpretq_m128d_u64(vld1q_u64(d));
3553 #endif
3554 }
3555 
3556 // Compare packed double-precision (64-bit) floating-point elements in a and b
3557 // for not-equal, and store the results in dst.
3558 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3559 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3560 {
3561 #if defined(__aarch64__)
3562  return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3563  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3564 #else
3565  // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3566  uint32x4_t cmp =
3567  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3568  uint32x4_t swapped = vrev64q_u32(cmp);
3569  return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3570 #endif
3571 }
3572 
3573 // Compare the lower double-precision (64-bit) floating-point elements in a and
3574 // b for not-equal, store the result in the lower element of dst, and copy the
3575 // upper element from a to the upper element of dst.
3576 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3577 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3578 {
3579  return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3580 }
3581 
3582 // Compare packed double-precision (64-bit) floating-point elements in a and b
3583 // for not-greater-than-or-equal, and store the results in dst.
3584 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3585 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3586 {
3587 #if defined(__aarch64__)
3588  return vreinterpretq_m128d_u64(veorq_u64(
3589  vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3590  vdupq_n_u64(UINT64_MAX)));
3591 #else
3592  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3593  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3594  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3595  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3596  uint64_t d[2];
3597  d[0] =
3598  !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3599  d[1] =
3600  !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3601 
3602  return vreinterpretq_m128d_u64(vld1q_u64(d));
3603 #endif
3604 }
3605 
3606 // Compare the lower double-precision (64-bit) floating-point elements in a and
3607 // b for not-greater-than-or-equal, store the result in the lower element of
3608 // dst, and copy the upper element from a to the upper element of dst.
3609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3610 FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3611 {
3612  return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3613 }
3614 
3615 // Compare packed double-precision (64-bit) floating-point elements in a and b
3616 // for not-greater-than, and store the results in dst.
3617 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
3618 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3619 {
3620 #if defined(__aarch64__)
3621  return vreinterpretq_m128d_u64(veorq_u64(
3622  vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3623  vdupq_n_u64(UINT64_MAX)));
3624 #else
3625  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3626  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3627  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3628  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3629  uint64_t d[2];
3630  d[0] =
3631  !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3632  d[1] =
3633  !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3634 
3635  return vreinterpretq_m128d_u64(vld1q_u64(d));
3636 #endif
3637 }
3638 
3639 // Compare the lower double-precision (64-bit) floating-point elements in a and
3640 // b for not-greater-than, store the result in the lower element of dst, and
3641 // copy the upper element from a to the upper element of dst.
3642 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3643 FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3644 {
3645  return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3646 }
3647 
3648 // Compare packed double-precision (64-bit) floating-point elements in a and b
3649 // for not-less-than-or-equal, and store the results in dst.
3650 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3651 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3652 {
3653 #if defined(__aarch64__)
3654  return vreinterpretq_m128d_u64(veorq_u64(
3655  vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656  vdupq_n_u64(UINT64_MAX)));
3657 #else
3658  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3659  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3660  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3661  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3662  uint64_t d[2];
3663  d[0] =
3664  !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3665  d[1] =
3666  !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3667 
3668  return vreinterpretq_m128d_u64(vld1q_u64(d));
3669 #endif
3670 }
3671 
3672 // Compare the lower double-precision (64-bit) floating-point elements in a and
3673 // b for not-less-than-or-equal, store the result in the lower element of dst,
3674 // and copy the upper element from a to the upper element of dst.
3675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3676 FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3677 {
3678  return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3679 }
3680 
3681 // Compare packed double-precision (64-bit) floating-point elements in a and b
3682 // for not-less-than, and store the results in dst.
3683 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3684 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3685 {
3686 #if defined(__aarch64__)
3687  return vreinterpretq_m128d_u64(veorq_u64(
3688  vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3689  vdupq_n_u64(UINT64_MAX)));
3690 #else
3691  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3692  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3693  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3694  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3695  uint64_t d[2];
3696  d[0] =
3697  !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3698  d[1] =
3699  !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3700 
3701  return vreinterpretq_m128d_u64(vld1q_u64(d));
3702 #endif
3703 }
3704 
3705 // Compare the lower double-precision (64-bit) floating-point elements in a and
3706 // b for not-less-than, store the result in the lower element of dst, and copy
3707 // the upper element from a to the upper element of dst.
3708 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3709 FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3710 {
3711  return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3712 }
3713 
3714 // Compare packed double-precision (64-bit) floating-point elements in a and b
3715 // to see if neither is NaN, and store the results in dst.
3716 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3717 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3718 {
3719 #if defined(__aarch64__)
3720  // Excluding NaNs, any two floating point numbers can be compared.
3721  uint64x2_t not_nan_a =
3722  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3723  uint64x2_t not_nan_b =
3724  vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3725  return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3726 #else
3727  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3728  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3729  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3730  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3731  uint64_t d[2];
3732  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3733  (*(double *) &b0) == (*(double *) &b0))
3734  ? ~UINT64_C(0)
3735  : UINT64_C(0);
3736  d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3737  (*(double *) &b1) == (*(double *) &b1))
3738  ? ~UINT64_C(0)
3739  : UINT64_C(0);
3740 
3741  return vreinterpretq_m128d_u64(vld1q_u64(d));
3742 #endif
3743 }
3744 
3745 // Compare the lower double-precision (64-bit) floating-point elements in a and
3746 // b to see if neither is NaN, store the result in the lower element of dst, and
3747 // copy the upper element from a to the upper element of dst.
3748 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3749 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3750 {
3751 #if defined(__aarch64__)
3752  return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3753 #else
3754  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3755  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3756  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3757  uint64_t d[2];
3758  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3759  (*(double *) &b0) == (*(double *) &b0))
3760  ? ~UINT64_C(0)
3761  : UINT64_C(0);
3762  d[1] = a1;
3763 
3764  return vreinterpretq_m128d_u64(vld1q_u64(d));
3765 #endif
3766 }
3767 
3768 // Compare packed double-precision (64-bit) floating-point elements in a and b
3769 // to see if either is NaN, and store the results in dst.
3770 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3771 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3772 {
3773 #if defined(__aarch64__)
3774  // A NaN never compares equal, not even to itself.
3775  uint64x2_t not_nan_a =
3776  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3777  uint64x2_t not_nan_b =
3778  vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3779  return vreinterpretq_m128d_s32(
3780  vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3781 #else
3782  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3783  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3784  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3785  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3786  uint64_t d[2];
3787  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3788  (*(double *) &b0) == (*(double *) &b0))
3789  ? UINT64_C(0)
3790  : ~UINT64_C(0);
3791  d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3792  (*(double *) &b1) == (*(double *) &b1))
3793  ? UINT64_C(0)
3794  : ~UINT64_C(0);
3795 
3796  return vreinterpretq_m128d_u64(vld1q_u64(d));
3797 #endif
3798 }
3799 
3800 // Compare the lower double-precision (64-bit) floating-point elements in a and
3801 // b to see if either is NaN, store the result in the lower element of dst, and
3802 // copy the upper element from a to the upper element of dst.
3803 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3804 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3805 {
3806 #if defined(__aarch64__)
3807  return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3808 #else
3809  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3810  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3811  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3812  uint64_t d[2];
3813  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3814  (*(double *) &b0) == (*(double *) &b0))
3815  ? UINT64_C(0)
3816  : ~UINT64_C(0);
3817  d[1] = a1;
3818 
3819  return vreinterpretq_m128d_u64(vld1q_u64(d));
3820 #endif
3821 }
3822 
3823 // Compare the lower double-precision (64-bit) floating-point element in a and b
3824 // for greater-than-or-equal, and return the boolean result (0 or 1).
3825 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3826 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3827 {
3828 #if defined(__aarch64__)
3829  return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3830 #else
3831  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3832  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3833 
3834  return (*(double *) &a0 >= *(double *) &b0);
3835 #endif
3836 }
3837 
3838 // Compare the lower double-precision (64-bit) floating-point element in a and b
3839 // for greater-than, and return the boolean result (0 or 1).
3840 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3841 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3842 {
3843 #if defined(__aarch64__)
3844  return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3845 #else
3846  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3847  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3848 
3849  return (*(double *) &a0 > *(double *) &b0);
3850 #endif
3851 }
3852 
3853 // Compare the lower double-precision (64-bit) floating-point element in a and b
3854 // for less-than-or-equal, and return the boolean result (0 or 1).
3855 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3856 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3857 {
3858 #if defined(__aarch64__)
3859  return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3860 #else
3861  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3862  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3863 
3864  return (*(double *) &a0 <= *(double *) &b0);
3865 #endif
3866 }
3867 
3868 // Compare the lower double-precision (64-bit) floating-point element in a and b
3869 // for less-than, and return the boolean result (0 or 1).
3870 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3871 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3872 {
3873 #if defined(__aarch64__)
3874  return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3875 #else
3876  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3877  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3878 
3879  return (*(double *) &a0 < *(double *) &b0);
3880 #endif
3881 }
3882 
3883 // Compare the lower double-precision (64-bit) floating-point element in a and b
3884 // for equality, and return the boolean result (0 or 1).
3885 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3886 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3887 {
3888 #if defined(__aarch64__)
3889  return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3890 #else
3891  uint32x4_t a_not_nan =
3892  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3893  uint32x4_t b_not_nan =
3894  vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3895  uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3896  uint32x4_t a_eq_b =
3897  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3898  uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3899  vreinterpretq_u64_u32(a_eq_b));
3900  return vgetq_lane_u64(and_results, 0) & 0x1;
3901 #endif
3902 }
3903 
3904 // Compare the lower double-precision (64-bit) floating-point element in a and b
3905 // for not-equal, and return the boolean result (0 or 1).
3906 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3907 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3908 {
3909  return !_mm_comieq_sd(a, b);
3910 }
3911 
3912 // Convert packed signed 32-bit integers in a to packed double-precision
3913 // (64-bit) floating-point elements, and store the results in dst.
3914 //
3915 // FOR j := 0 to 1
3916 // i := j*32
3917 // m := j*64
3918 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3919 // ENDFOR
3920 //
3921 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3922 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3923 {
3924 #if defined(__aarch64__)
3925  return vreinterpretq_m128d_f64(
3926  vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3927 #else
3928  double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3929  double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3930  return _mm_set_pd(a1, a0);
3931 #endif
3932 }
3933 
3934 // Converts the four signed 32-bit integer values of a to single-precision,
3935 // floating-point values
3936 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3937 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3938 {
3939  return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3940 }
3941 
3942 // Convert packed double-precision (64-bit) floating-point elements in a to
3943 // packed 32-bit integers, and store the results in dst.
3944 //
3945 // FOR j := 0 to 1
3946 // i := 32*j
3947 // k := 64*j
3948 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3949 // ENDFOR
3950 //
3951 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3952 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3953 {
3954  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3955  double d0 = ((double *) &rnd)[0];
3956  double d1 = ((double *) &rnd)[1];
3957  return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3958 }
3959 
3960 // Convert packed double-precision (64-bit) floating-point elements in a to
3961 // packed 32-bit integers, and store the results in dst.
3962 //
3963 // FOR j := 0 to 1
3964 // i := 32*j
3965 // k := 64*j
3966 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3967 // ENDFOR
3968 //
3969 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3970 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3971 {
3972  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3973  double d0 = ((double *) &rnd)[0];
3974  double d1 = ((double *) &rnd)[1];
3975  int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3976  return vreinterpret_m64_s32(vld1_s32(data));
3977 }
3978 
3979 // Convert packed double-precision (64-bit) floating-point elements in a to
3980 // packed single-precision (32-bit) floating-point elements, and store the
3981 // results in dst.
3982 //
3983 // FOR j := 0 to 1
3984 // i := 32*j
3985 // k := 64*j
3986 // dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
3987 // ENDFOR
3988 // dst[127:64] := 0
3989 //
3990 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
3991 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3992 {
3993 #if defined(__aarch64__)
3994  float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3995  return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3996 #else
3997  float a0 = (float) ((double *) &a)[0];
3998  float a1 = (float) ((double *) &a)[1];
3999  return _mm_set_ps(0, 0, a1, a0);
4000 #endif
4001 }
4002 
4003 // Convert packed signed 32-bit integers in a to packed double-precision
4004 // (64-bit) floating-point elements, and store the results in dst.
4005 //
4006 // FOR j := 0 to 1
4007 // i := j*32
4008 // m := j*64
4009 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
4010 // ENDFOR
4011 //
4012 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
4013 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
4014 {
4015 #if defined(__aarch64__)
4016  return vreinterpretq_m128d_f64(
4017  vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
4018 #else
4019  double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
4020  double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
4021  return _mm_set_pd(a1, a0);
4022 #endif
4023 }
4024 
4025 // Converts the four single-precision, floating-point values of a to signed
4026 // 32-bit integer values.
4027 //
4028 // r0 := (int) a0
4029 // r1 := (int) a1
4030 // r2 := (int) a2
4031 // r3 := (int) a3
4032 //
4033 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4034 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4035 // does not support! It is supported on ARMv8-A however.
4036 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4037 {
4038 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
4039  switch (_MM_GET_ROUNDING_MODE()) {
4040  case _MM_ROUND_NEAREST:
4041  return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4042  case _MM_ROUND_DOWN:
4043  return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
4044  case _MM_ROUND_UP:
4045  return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
4046  default: // _MM_ROUND_TOWARD_ZERO
4047  return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
4048  }
4049 #else
4050  float *f = (float *) &a;
4051  switch (_MM_GET_ROUNDING_MODE()) {
4052  case _MM_ROUND_NEAREST: {
4053  uint32x4_t signmask = vdupq_n_u32(0x80000000);
4054  float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4055  vdupq_n_f32(0.5f)); /* +/- 0.5 */
4056  int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4057  vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4058  int32x4_t r_trunc = vcvtq_s32_f32(
4059  vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4060  int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4061  vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4062  int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4063  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4064  float32x4_t delta = vsubq_f32(
4065  vreinterpretq_f32_m128(a),
4066  vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4067  uint32x4_t is_delta_half =
4068  vceqq_f32(delta, half); /* delta == +/- 0.5 */
4069  return vreinterpretq_m128i_s32(
4070  vbslq_s32(is_delta_half, r_even, r_normal));
4071  }
4072  case _MM_ROUND_DOWN:
4073  return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4074  floorf(f[0]));
4075  case _MM_ROUND_UP:
4076  return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4077  ceilf(f[0]));
4078  default: // _MM_ROUND_TOWARD_ZERO
4079  return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4080  (int32_t) f[0]);
4081  }
4082 #endif
4083 }
4084 
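// --- Illustrative note (not part of the upstream sse2neon header; the helper
// name is made up). _mm_cvtps_epi32 honours the current SSE rounding mode:
// 2.5f converts to 2 under the default round-to-nearest-even mode, but to 3
// after _MM_SET_ROUNDING_MODE(_MM_ROUND_UP). Only _mm_cvttps_epi32 (further
// down) always truncates. A minimal sketch under the default rounding mode:
static inline __m128i sse2neon_example_cvtps_epi32(void)
{
    __m128 halfway = vreinterpretq_m128_f32(vdupq_n_f32(2.5f));
    return _mm_cvtps_epi32(halfway); // 2 in every lane (ties round to even)
}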
4085 // Convert packed single-precision (32-bit) floating-point elements in a to
4086 // packed double-precision (64-bit) floating-point elements, and store the
4087 // results in dst.
4088 //
4089 // FOR j := 0 to 1
4090 // i := 64*j
4091 // k := 32*j
4092 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4093 // ENDFOR
4094 //
4095 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4096 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4097 {
4098 #if defined(__aarch64__)
4099  return vreinterpretq_m128d_f64(
4100  vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4101 #else
4102  double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4103  double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4104  return _mm_set_pd(a1, a0);
4105 #endif
4106 }
4107 
4108 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4109 //
4110 // dst[63:0] := a[63:0]
4111 //
4112 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4113 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4114 {
4115 #if defined(__aarch64__)
4116  return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4117 #else
4118  return ((double *) &a)[0];
4119 #endif
4120 }
4121 
4122 // Convert the lower double-precision (64-bit) floating-point element in a to a
4123 // 32-bit integer, and store the result in dst.
4124 //
4125 // dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4126 //
4127 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
4128 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4129 {
4130 #if defined(__aarch64__)
4131  return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4132 #else
4133  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4134  double ret = ((double *) &rnd)[0];
4135  return (int32_t) ret;
4136 #endif
4137 }
4138 
4139 // Convert the lower double-precision (64-bit) floating-point element in a to a
4140 // 64-bit integer, and store the result in dst.
4141 //
4142 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4143 //
4144 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
4145 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4146 {
4147 #if defined(__aarch64__)
4148  return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4149 #else
4150  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4151  double ret = ((double *) &rnd)[0];
4152  return (int64_t) ret;
4153 #endif
4154 }
4155 
4156 // Convert the lower double-precision (64-bit) floating-point element in a to a
4157 // 64-bit integer, and store the result in dst.
4158 //
4159 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4160 //
4161 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
4162 #define _mm_cvtsd_si64x _mm_cvtsd_si64
4163 
4164 // Convert the lower double-precision (64-bit) floating-point element in b to a
4165 // single-precision (32-bit) floating-point element, store the result in the
4166 // lower element of dst, and copy the upper 3 packed elements from a to the
4167 // upper elements of dst.
4168 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
4169 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4170 {
4171 #if defined(__aarch64__)
4172  return vreinterpretq_m128_f32(vsetq_lane_f32(
4173  vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4174  vreinterpretq_f32_m128(a), 0));
4175 #else
4176  return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4177  vreinterpretq_f32_m128(a), 0));
4178 #endif
4179 }
4180 
4181 // Copy the lower 32-bit integer in a to dst.
4182 //
4183 // dst[31:0] := a[31:0]
4184 //
4185 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4186 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4187 {
4188  return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4189 }
4190 
4191 // Copy the lower 64-bit integer in a to dst.
4192 //
4193 // dst[63:0] := a[63:0]
4194 //
4195 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4196 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4197 {
4198  return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4199 }
4200 
4201 // Copy the lower 64-bit integer in a to dst.
4202 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4203 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4204 
4205 // Convert the signed 32-bit integer b to a double-precision (64-bit)
4206 // floating-point element, store the result in the lower element of dst, and
4207 // copy the upper element from a to the upper element of dst.
4208 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
4209 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4210 {
4211 #if defined(__aarch64__)
4212  return vreinterpretq_m128d_f64(
4213  vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4214 #else
4215  double bf = (double) b;
4216  return vreinterpretq_m128d_s64(
4217  vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4218 #endif
4219 }
4220 
4221 // Copy the lower 64-bit integer in a to dst.
4222 //
4223 // dst[63:0] := a[63:0]
4224 //
4225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4226 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4227 
4228 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4229 // zero extending the upper bits.
4230 //
4231 // r0 := a
4232 // r1 := 0x0
4233 // r2 := 0x0
4234 // r3 := 0x0
4235 //
4236 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4237 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4238 {
4239  return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4240 }
4241 
4242 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4243 // floating-point element, store the result in the lower element of dst, and
4244 // copy the upper element from a to the upper element of dst.
4245 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
4246 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4247 {
4248 #if defined(__aarch64__)
4249  return vreinterpretq_m128d_f64(
4250  vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4251 #else
4252  double bf = (double) b;
4253  return vreinterpretq_m128d_s64(
4254  vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4255 #endif
4256 }
4257 
4258 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4259 // zero extending the upper bits.
4260 //
4261 // r0 := a
4262 // r1 := 0x0
4263 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4264 {
4265  return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4266 }
4267 
4268 // Copy 64-bit integer a to the lower element of dst, and zero the upper
4269 // element.
4270 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
4271 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4272 
4273 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4274 // floating-point element, store the result in the lower element of dst, and
4275 // copy the upper element from a to the upper element of dst.
4276 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
4277 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4278 
4279 // Convert the lower single-precision (32-bit) floating-point element in b to a
4280 // double-precision (64-bit) floating-point element, store the result in the
4281 // lower element of dst, and copy the upper element from a to the upper element
4282 // of dst.
4283 //
4284 // dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4285 // dst[127:64] := a[127:64]
4286 //
4287 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
4288 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4289 {
4290  double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4291 #if defined(__aarch64__)
4292  return vreinterpretq_m128d_f64(
4293  vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4294 #else
4295  return vreinterpretq_m128d_s64(
4296  vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4297 #endif
4298 }
4299 
4300 // Convert packed double-precision (64-bit) floating-point elements in a to
4301 // packed 32-bit integers with truncation, and store the results in dst.
4302 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
4303 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4304 {
4305  double a0 = ((double *) &a)[0];
4306  double a1 = ((double *) &a)[1];
4307  return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4308 }
4309 
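// --- Clarifying note (not part of the upstream sse2neon header) ---
// The "tt" variants truncate toward zero regardless of the current rounding
// mode (e.g. 2.9 -> 2 and -2.9 -> -2), whereas _mm_cvtpd_epi32 above rounds
// according to the mode selected with _MM_SET_ROUNDING_MODE. The plain C cast
// used in these fallbacks has exactly that truncating behaviour.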
4310 // Convert packed double-precision (64-bit) floating-point elements in a to
4311 // packed 32-bit integers with truncation, and store the results in dst.
4312 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4313 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4314 {
4315  double a0 = ((double *) &a)[0];
4316  double a1 = ((double *) &a)[1];
4317  int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4318  return vreinterpret_m64_s32(vld1_s32(data));
4319 }
4320 
4321 // Converts the four single-precision, floating-point values of a to signed
4322 // 32-bit integer values using truncate.
4323 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4324 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4325 {
4326  return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4327 }
4328 
4329 // Convert the lower double-precision (64-bit) floating-point element in a to a
4330 // 32-bit integer with truncation, and store the result in dst.
4331 //
4332 // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4333 //
4334 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4335 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4336 {
4337  double ret = *((double *) &a);
4338  return (int32_t) ret;
4339 }
4340 
4341 // Convert the lower double-precision (64-bit) floating-point element in a to a
4342 // 64-bit integer with truncation, and store the result in dst.
4343 //
4344 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4345 //
4346 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4347 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4348 {
4349 #if defined(__aarch64__)
4350  return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4351 #else
4352  double ret = *((double *) &a);
4353  return (int64_t) ret;
4354 #endif
4355 }
4356 
4357 // Convert the lower double-precision (64-bit) floating-point element in a to a
4358 // 64-bit integer with truncation, and store the result in dst.
4359 //
4360 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4361 //
4362 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4363 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4364 
4365 // Divide packed double-precision (64-bit) floating-point elements in a by
4366 // packed elements in b, and store the results in dst.
4367 //
4368 // FOR j := 0 to 1
4369 // i := 64*j
4370 // dst[i+63:i] := a[i+63:i] / b[i+63:i]
4371 // ENDFOR
4372 //
4373 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4374 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4375 {
4376 #if defined(__aarch64__)
4377  return vreinterpretq_m128d_f64(
4378  vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4379 #else
4380  double *da = (double *) &a;
4381  double *db = (double *) &b;
4382  double c[2];
4383  c[0] = da[0] / db[0];
4384  c[1] = da[1] / db[1];
4385  return vld1q_f32((float32_t *) c);
4386 #endif
4387 }
4388 
4389 // Divide the lower double-precision (64-bit) floating-point element in a by the
4390 // lower double-precision (64-bit) floating-point element in b, store the result
4391 // in the lower element of dst, and copy the upper element from a to the upper
4392 // element of dst.
4393 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4394 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4395 {
4396 #if defined(__aarch64__)
4397  float64x2_t tmp =
4398  vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4399  return vreinterpretq_m128d_f64(
4400  vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4401 #else
4402  return _mm_move_sd(a, _mm_div_pd(a, b));
4403 #endif
4404 }
4405 
4406 // Extracts the selected signed or unsigned 16-bit integer from a and zero
4407 // extends.
4408 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4409 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4410 #define _mm_extract_epi16(a, imm) \
4411  vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4412 
4413 // Inserts the least significant 16 bits of b into the selected 16-bit integer
4414 // of a.
4415 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4416 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4417 // __constrange(0,8) int imm)
4418 #define _mm_insert_epi16(a, b, imm) \
4419  __extension__({ \
4420  vreinterpretq_m128i_s16( \
4421  vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4422  })
4423 
4424 // Loads two double-precision floating-point values from 16-byte aligned
4425 // memory.
4426 //
4427 // dst[127:0] := MEM[mem_addr+127:mem_addr]
4428 //
4429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4430 FORCE_INLINE __m128d _mm_load_pd(const double *p)
4431 {
4432 #if defined(__aarch64__)
4433  return vreinterpretq_m128d_f64(vld1q_f64(p));
4434 #else
4435  const float *fp = (const float *) p;
4436  float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4437  return vreinterpretq_m128d_f32(vld1q_f32(data));
4438 #endif
4439 }
4440 
4441 // Load a double-precision (64-bit) floating-point element from memory into both
4442 // elements of dst.
4443 //
4444 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4445 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4446 //
4447 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4448 #define _mm_load_pd1 _mm_load1_pd
4449 
4450 // Load a double-precision (64-bit) floating-point element from memory into the
4451 // lower of dst, and zero the upper element. mem_addr does not need to be
4452 // aligned on any particular boundary.
4453 //
4454 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4455 // dst[127:64] := 0
4456 //
4457 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4458 FORCE_INLINE __m128d _mm_load_sd(const double *p)
4459 {
4460 #if defined(__aarch64__)
4461  return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4462 #else
4463  const float *fp = (const float *) p;
4464  float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4465  return vreinterpretq_m128d_f32(vld1q_f32(data));
4466 #endif
4467 }
4468 
4469 // Loads 128-bit value. :
4470 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4471 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4472 {
4473  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4474 }
4475 
4476 // Load a double-precision (64-bit) floating-point element from memory into both
4477 // elements of dst.
4478 //
4479 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4480 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4481 //
4482 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4483 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4484 {
4485 #if defined(__aarch64__)
4486  return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4487 #else
4488  return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4489 #endif
4490 }
4491 
4492 // Load a double-precision (64-bit) floating-point element from memory into the
4493 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4494 // not need to be aligned on any particular boundary.
4495 //
4496 // dst[63:0] := a[63:0]
4497 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4498 //
4499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4500 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4501 {
4502 #if defined(__aarch64__)
4503  return vreinterpretq_m128d_f64(
4504  vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4505 #else
4506  return vreinterpretq_m128d_f32(vcombine_f32(
4507  vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4508 #endif
4509 }
4510 
4511 // Load 64-bit integer from memory into the first element of dst.
4512 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4513 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4514 {
4515  /* Load the lower 64 bits of the value pointed to by p into the
4516  * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4517  */
4518  return vreinterpretq_m128i_s32(
4519  vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4520 }
4521 
4522 // Load a double-precision (64-bit) floating-point element from memory into the
4523 // lower element of dst, and copy the upper element from a to dst. mem_addr does
4524 // not need to be aligned on any particular boundary.
4525 //
4526 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4527 // dst[127:64] := a[127:64]
4528 //
4529 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4530 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4531 {
4532 #if defined(__aarch64__)
4533  return vreinterpretq_m128d_f64(
4534  vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4535 #else
4536  return vreinterpretq_m128d_f32(
4537  vcombine_f32(vld1_f32((const float *) p),
4538  vget_high_f32(vreinterpretq_f32_m128d(a))));
4539 #endif
4540 }
4541 
4542 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
4543 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4544 // general-protection exception may be generated.
4545 //
4546 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4547 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4548 //
4549 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4550 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4551 {
4552 #if defined(__aarch64__)
4553  float64x2_t v = vld1q_f64(p);
4554  return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4555 #else
4556  int64x2_t v = vld1q_s64((const int64_t *) p);
4557  return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4558 #endif
4559 }
4560 
4561 // Loads two double-precision floating-point values from unaligned memory.
4562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4563 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4564 {
4565  return _mm_load_pd(p);
4566 }
4567 
4568 // Loads 128-bit value. :
4569 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4570 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4571 {
4572  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4573 }
4574 
4575 // Load unaligned 32-bit integer from memory into the first element of dst.
4576 //
4577 // dst[31:0] := MEM[mem_addr+31:mem_addr]
4578 // dst[MAX:32] := 0
4579 //
4580 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4581 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4582 {
4583  return vreinterpretq_m128i_s32(
4584  vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4585 }
4586 
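// --- Illustrative usage sketch (not part of the upstream sse2neon header; the
// helper name and constant are made up). _mm_loadl_epi64 and _mm_loadu_si32
// are zero-extending loads: only the low 64 (respectively 32) bits come from
// memory, the rest of the register is cleared.
static inline __m128i sse2neon_example_loadu_si32(void)
{
    static const int32_t value = 42;
    return _mm_loadu_si32(&value); // dst = { 42, 0, 0, 0 }
}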
4587 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4588 // integers from b.
4589 //
4590 // r0 := (a0 * b0) + (a1 * b1)
4591 // r1 := (a2 * b2) + (a3 * b3)
4592 // r2 := (a4 * b4) + (a5 * b5)
4593 // r3 := (a6 * b6) + (a7 * b7)
4594 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4595 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4596 {
4597  int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4598  vget_low_s16(vreinterpretq_s16_m128i(b)));
4599  int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4600  vget_high_s16(vreinterpretq_s16_m128i(b)));
4601 
4602  int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4603  int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4604 
4605  return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4606 }
4607 
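// --- Illustrative usage sketch (not part of the upstream sse2neon header; the
// helper name is made up). _mm_madd_epi16 is the usual building block for
// 16-bit fixed-point dot products: each 32-bit output lane is the sum of one
// pair of adjacent 16-bit products, so the widening multiply and the first
// level of the reduction happen in a single step.
static inline __m128i sse2neon_example_madd_epi16(void)
{
    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(3));
    __m128i b = vreinterpretq_m128i_s16(vdupq_n_s16(4));
    return _mm_madd_epi16(a, b); // every 32-bit lane is 3*4 + 3*4 = 24
}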
4608 // Conditionally store 8-bit integer elements from a into memory using mask
4609 // (elements are not stored when the highest bit is not set in the corresponding
4610 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
4611 // on any particular boundary.
4612 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4613 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4614 {
4615  int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4616  __m128 b = _mm_load_ps((const float *) mem_addr);
4617  int8x16_t masked =
4618  vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4619  vreinterpretq_s8_m128(b));
4620  vst1q_s8((int8_t *) mem_addr, masked);
4621 }
4622 
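// --- Clarifying note (not part of the upstream sse2neon header) ---
// Unlike the x86 instruction, which writes only the bytes whose mask bit is
// set, the emulation above reads the full 16-byte destination, blends in the
// selected bytes of a with vbslq_s8, and writes all 16 bytes back. The final
// memory contents are the same, but the whole 16-byte region must be readable
// and writable, and the non-temporal hint is not preserved.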
4623 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4624 // signed 16-bit integers from b.
4625 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4626 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4627 {
4628  return vreinterpretq_m128i_s16(
4629  vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4630 }
4631 
4632 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4633 // 16 unsigned 8-bit integers from b.
4634 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4635 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4636 {
4637  return vreinterpretq_m128i_u8(
4638  vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4639 }
4640 
4641 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4642 // and store packed maximum values in dst.
4643 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4644 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4645 {
4646 #if defined(__aarch64__)
4647 #if SSE2NEON_PRECISE_MINMAX
4648  float64x2_t _a = vreinterpretq_f64_m128d(a);
4649  float64x2_t _b = vreinterpretq_f64_m128d(b);
4650  return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4651 #else
4652  return vreinterpretq_m128d_f64(
4653  vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4654 #endif
4655 #else
4656  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4657  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4658  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4659  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4660  uint64_t d[2];
4661  d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4662  d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4663 
4664  return vreinterpretq_m128d_u64(vld1q_u64(d));
4665 #endif
4666 }
4667 
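// --- Clarifying note (not part of the upstream sse2neon header) ---
// The SSE2NEON_PRECISE_MINMAX branch above matters mainly for NaN handling:
// x86 maxpd/minpd always return the second operand when either input is NaN,
// which the compare-and-select (vbslq_f64 on vcgtq_f64/vcltq_f64) reproduces,
// whereas plain vmaxq_f64/vminq_f64 return a NaN whenever either operand is
// NaN. The same applies to _mm_min_pd below. Enable the option (see the
// tunable block at the top of this file) when bit-exact agreement with SSE is
// required.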
4668 // Compare the lower double-precision (64-bit) floating-point elements in a and
4669 // b, store the maximum value in the lower element of dst, and copy the upper
4670 // element from a to the upper element of dst.
4671 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4672 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4673 {
4674 #if defined(__aarch64__)
4675  return _mm_move_sd(a, _mm_max_pd(a, b));
4676 #else
4677  double *da = (double *) &a;
4678  double *db = (double *) &b;
4679  double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4680  return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4681 #endif
4682 }
4683 
4684 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4685 // signed 16-bit integers from b.
4686 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4687 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4688 {
4689  return vreinterpretq_m128i_s16(
4690  vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4691 }
4692 
4693 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4694 // 16 unsigned 8-bit integers from b.
4695 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
4696 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4697 {
4698  return vreinterpretq_m128i_u8(
4699  vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4700 }
4701 
4702 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4703 // and store packed minimum values in dst.
4704 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4705 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4706 {
4707 #if defined(__aarch64__)
4708 #if SSE2NEON_PRECISE_MINMAX
4709  float64x2_t _a = vreinterpretq_f64_m128d(a);
4710  float64x2_t _b = vreinterpretq_f64_m128d(b);
4711  return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4712 #else
4713  return vreinterpretq_m128d_f64(
4714  vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4715 #endif
4716 #else
4717  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4718  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4719  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4720  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4721  uint64_t d[2];
4722  d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4723  d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4724  return vreinterpretq_m128d_u64(vld1q_u64(d));
4725 #endif
4726 }
4727 
4728 // Compare the lower double-precision (64-bit) floating-point elements in a and
4729 // b, store the minimum value in the lower element of dst, and copy the upper
4730 // element from a to the upper element of dst.
4731 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4732 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4733 {
4734 #if defined(__aarch64__)
4735  return _mm_move_sd(a, _mm_min_pd(a, b));
4736 #else
4737  double *da = (double *) &a;
4738  double *db = (double *) &b;
4739  double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4740  return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4741 #endif
4742 }
4743 
4744 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4745 // upper element.
4746 //
4747 // dst[63:0] := a[63:0]
4748 // dst[127:64] := 0
4749 //
4750 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4751 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4752 {
4753  return vreinterpretq_m128i_s64(
4754  vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4755 }
4756 
4757 // Move the lower double-precision (64-bit) floating-point element from b to the
4758 // lower element of dst, and copy the upper element from a to the upper element
4759 // of dst.
4760 //
4761 // dst[63:0] := b[63:0]
4762 // dst[127:64] := a[127:64]
4763 //
4764 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4765 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4766 {
4767  return vreinterpretq_m128d_f32(
4768  vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4769  vget_high_f32(vreinterpretq_f32_m128d(a))));
4770 }
4771 
4772 // NEON does not provide a version of this function.
4773 // Creates a 16-bit mask from the most significant bits of the 16 signed or
4774 // unsigned 8-bit integers in a and zero extends the upper bits.
4775 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4776 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4777 {
4778  // Use increasingly wide shifts+adds to collect the sign bits
4779  // together.
4780  // Since the widening shifts would be rather confusing to follow in little
4781  // endian, everything will be illustrated in big endian order instead. This
4782  // has a different result - the bits would actually be reversed on a big
4783  // endian machine.
4784 
4785  // Starting input (only half the elements are shown):
4786  // 89 ff 1d c0 00 10 99 33
4787  uint8x16_t input = vreinterpretq_u8_m128i(a);
4788 
4789  // Shift out everything but the sign bits with an unsigned shift right.
4790  //
4791  // Bytes of the vector:
4792  // 89 ff 1d c0 00 10 99 33
4793  // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4794  // | | | | | | | |
4795  // 01 01 00 01 00 00 01 00
4796  //
4797  // Bits of first important lane(s):
4798  // 10001001 (89)
4799  // \______
4800  // |
4801  // 00000001 (01)
4802  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4803 
4804  // Merge the even lanes together with a 16-bit unsigned shift right + add.
4805  // 'xx' represents garbage data which will be ignored in the final result.
4806  // In the important bytes, the add functions like a binary OR.
4807  //
4808  // 01 01 00 01 00 00 01 00
4809  // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4810  // \| \| \| \|
4811  // xx 03 xx 01 xx 00 xx 02
4812  //
4813  // 00000001 00000001 (01 01)
4814  // \_______ |
4815  // \|
4816  // xxxxxxxx xxxxxx11 (xx 03)
4817  uint32x4_t paired16 =
4818  vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4819 
4820  // Repeat with a wider 32-bit shift + add.
4821  // xx 03 xx 01 xx 00 xx 02
4822  // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4823  // 14))
4824  // \| \|
4825  // xx xx xx 0d xx xx xx 02
4826  //
4827  // 00000011 00000001 (03 01)
4828  // \\_____ ||
4829  // '----.\||
4830  // xxxxxxxx xxxx1101 (xx 0d)
4831  uint64x2_t paired32 =
4832  vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4833 
4834  // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4835  // lanes. xx xx xx 0d xx xx xx 02
4836  // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4837  // 28))
4838  // \|
4839  // xx xx xx xx xx xx xx d2
4840  //
4841  // 00001101 00000010 (0d 02)
4842  // \ \___ | |
4843  // '---. \| |
4844  // xxxxxxxx 11010010 (xx d2)
4845  uint8x16_t paired64 =
4846  vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4847 
4848  // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4849  // xx xx xx xx xx xx xx d2
4850  // || return paired64[0]
4851  // d2
4852  // Note: Little endian would return the correct value 4b (01001011) instead.
4853  return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4854 }
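// Illustrative usage sketch (not from the upstream sse2neon source): byte i of
// the input contributes its sign bit as bit i of the returned mask:
//
//   __m128i v = _mm_set_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
//                            -1, 0, -1, 0, -1, 0, -1, 0);
//   int m = _mm_movemask_epi8(v); // m == 0xAAAA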
4855 
4856 // Set each bit of mask dst based on the most significant bit of the
4857 // corresponding packed double-precision (64-bit) floating-point element in a.
4858 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4859 FORCE_INLINE int _mm_movemask_pd(__m128d a)
4860 {
4861  uint64x2_t input = vreinterpretq_u64_m128d(a);
4862  uint64x2_t high_bits = vshrq_n_u64(input, 63);
4863  return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4864 }
4865 
4866 // Copy the lower 64-bit integer in a to dst.
4867 //
4868 // dst[63:0] := a[63:0]
4869 //
4870 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4871 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4872 {
4873  return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4874 }
4875 
4876 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
4877 // element.
4878 //
4879 // dst[63:0] := a[63:0]
4880 // dst[127:64] := 0
4881 //
4882 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4883 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4884 {
4885  return vreinterpretq_m128i_s64(
4886  vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4887 }
4888 
4889 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4890 // a and b, and store the unsigned 64-bit results in dst.
4891 //
4892 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4893 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4894 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4895 {
4896  // vmull_u32 upcasts instead of masking, so we downcast.
4897  uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4898  uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4899  return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4900 }
4901 
4902 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
4903 // and store the results in dst.
4904 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4905 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4906 {
4907 #if defined(__aarch64__)
4908  return vreinterpretq_m128d_f64(
4909  vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4910 #else
4911  double *da = (double *) &a;
4912  double *db = (double *) &b;
4913  double c[2];
4914  c[0] = da[0] * db[0];
4915  c[1] = da[1] * db[1];
4916  return vld1q_f32((float32_t *) c);
4917 #endif
4918 }
4919 
4920 // Multiply the lower double-precision (64-bit) floating-point element in a and
4921 // b, store the result in the lower element of dst, and copy the upper element
4922 // from a to the upper element of dst.
4923 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4924 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4925 {
4926  return _mm_move_sd(a, _mm_mul_pd(a, b));
4927 }
4928 
4929 // Multiply the low unsigned 32-bit integers from a and b, and store the
4930 // unsigned 64-bit result in dst.
4931 //
4932 // dst[63:0] := a[31:0] * b[31:0]
4933 //
4934 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4935 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4936 {
4937  return vreinterpret_m64_u64(vget_low_u64(
4938  vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4939 }
4940 
4941 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4942 // integers from b.
4943 //
4944 // r0 := (a0 * b0)[31:16]
4945 // r1 := (a1 * b1)[31:16]
4946 // ...
4947 // r7 := (a7 * b7)[31:16]
4948 //
4949 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4950 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4951 {
4952  /* FIXME: issue with large values because of result saturation */
4953  // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4954  // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4955  // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4956  int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4957  int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4958  int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4959  int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4960  int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4961  int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4962  uint16x8x2_t r =
4963  vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4964  return vreinterpretq_m128i_u16(r.val[1]);
4965 }
4966 
4967 // Multiply the packed unsigned 16-bit integers in a and b, producing
4968 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4969 // integers in dst.
4970 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4971 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4972 {
4973  uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4974  uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4975  uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4976 #if defined(__aarch64__)
4977  uint32x4_t ab7654 =
4978  vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4979  uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4980  vreinterpretq_u16_u32(ab7654));
4981  return vreinterpretq_m128i_u16(r);
4982 #else
4983  uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4984  uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4985  uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4986  uint16x8x2_t r =
4987  vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4988  return vreinterpretq_m128i_u16(r.val[1]);
4989 #endif
4990 }
4991 
4992 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4993 // unsigned 16-bit integers from b.
4994 //
4995 // r0 := (a0 * b0)[15:0]
4996 // r1 := (a1 * b1)[15:0]
4997 // ...
4998 // r7 := (a7 * b7)[15:0]
4999 //
5000 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
5001 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
5002 {
5003  return vreinterpretq_m128i_s16(
5004  vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5005 }
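// Illustrative usage sketch (not from the upstream sse2neon source):
// _mm_mulhi_epi16 and _mm_mullo_epi16 together recover the full 32-bit product
// of each pair of 16-bit lanes:
//
//   __m128i a = _mm_set1_epi16(300), b = _mm_set1_epi16(400);
//   __m128i hi = _mm_mulhi_epi16(a, b); // 300 * 400 = 0x0001D4C0 -> 0x0001
//   __m128i lo = _mm_mullo_epi16(a, b); //                        -> 0xD4C0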
5006 
5007 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
5008 // elements in a and b, and store the results in dst.
5009 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
5010 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
5011 {
5012  return vreinterpretq_m128d_s64(
5013  vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5014 }
5015 
5016 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
5017 //
5018 // r := a | b
5019 //
5020 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
5021 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
5022 {
5023  return vreinterpretq_m128i_s32(
5024  vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5025 }
5026 
5027 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5028 // saturates.
5029 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5030 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5031 {
5032  return vreinterpretq_m128i_s8(
5033  vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5034  vqmovn_s16(vreinterpretq_s16_m128i(b))));
5035 }
5036 
5037 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5038 // and saturates.
5039 //
5040 // r0 := SignedSaturate(a0)
5041 // r1 := SignedSaturate(a1)
5042 // r2 := SignedSaturate(a2)
5043 // r3 := SignedSaturate(a3)
5044 // r4 := SignedSaturate(b0)
5045 // r5 := SignedSaturate(b1)
5046 // r6 := SignedSaturate(b2)
5047 // r7 := SignedSaturate(b3)
5048 //
5049 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5050 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5051 {
5052  return vreinterpretq_m128i_s16(
5053  vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5054  vqmovn_s32(vreinterpretq_s32_m128i(b))));
5055 }
5056 
5057 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5058 // integers and saturates.
5059 //
5060 // r0 := UnsignedSaturate(a0)
5061 // r1 := UnsignedSaturate(a1)
5062 // ...
5063 // r7 := UnsignedSaturate(a7)
5064 // r8 := UnsignedSaturate(b0)
5065 // r9 := UnsignedSaturate(b1)
5066 // ...
5067 // r15 := UnsignedSaturate(b7)
5068 //
5069 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5070 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5071 {
5072  return vreinterpretq_m128i_u8(
5073  vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5074  vqmovun_s16(vreinterpretq_s16_m128i(b))));
5075 }
5076 
5077 // Pause the processor. This is typically used in spin-wait loops and, depending
5078 // on the x86 processor, typical values are in the 40-100 cycle range. The
5079 // 'yield' instruction isn't a good fit because it's effectively a nop on most
5080 // Arm cores. Experience with several databases has shown that an 'isb' is
5081 // a reasonable approximation.
5082 FORCE_INLINE void _mm_pause()
5083 {
5084  __asm__ __volatile__("isb\n");
5085 }
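// Illustrative usage sketch (not from the upstream sse2neon source, and
// assuming a C11 atomic flag named `ready`): a typical spin-wait loop backs
// off with _mm_pause while polling:
//
//   while (!atomic_load_explicit(&ready, memory_order_acquire))
//       _mm_pause();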
5086 
5087 // Compute the absolute differences of packed unsigned 8-bit integers in a and
5088 // b, then horizontally sum each consecutive 8 differences to produce two
5089 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
5090 // 16 bits of 64-bit elements in dst.
5091 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
5092 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
5093 {
5094  uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5095  return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
5096 }
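// Illustrative usage sketch (not from the upstream sse2neon source): summing
// absolute byte differences is the kernel of L1 distances and motion
// estimation:
//
//   __m128i x = _mm_set1_epi8(9), y = _mm_set1_epi8(5);
//   __m128i d = _mm_sad_epu8(x, y); // each 64-bit lane: 8 * |9 - 5| = 32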
5097 
5098 // Sets the 8 signed 16-bit integer values.
5099 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
5100 FORCE_INLINE __m128i _mm_set_epi16(short i7,
5101  short i6,
5102  short i5,
5103  short i4,
5104  short i3,
5105  short i2,
5106  short i1,
5107  short i0)
5108 {
5109  int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5110  return vreinterpretq_m128i_s16(vld1q_s16(data));
5111 }
5112 
5113 // Sets the 4 signed 32-bit integer values.
5114 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
5115 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
5116 {
5117  int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
5118  return vreinterpretq_m128i_s32(vld1q_s32(data));
5119 }
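// Illustrative note (not from the upstream sse2neon source): as with the x86
// intrinsic, arguments run from the highest element down, so
// _mm_set_epi32(3, 2, 1, 0) places 0 in the lowest 32-bit lane and 3 in the
// highest.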
5120 
5121 // Returns the __m128i structure with its two 64-bit integer values
5122 // initialized to the values of the two 64-bit integers passed in.
5123 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5124 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
5125 {
5126  return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
5127 }
5128 
5129 // Returns the __m128i structure with its two 64-bit integer values
5130 // initialized to the values of the two 64-bit integers passed in.
5131 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5132 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
5133 {
5134  return vreinterpretq_m128i_s64(
5135  vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5136 }
5137 
5138 // Sets the 16 signed 8-bit integer values.
5139 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
5140 FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
5141  signed char b14,
5142  signed char b13,
5143  signed char b12,
5144  signed char b11,
5145  signed char b10,
5146  signed char b9,
5147  signed char b8,
5148  signed char b7,
5149  signed char b6,
5150  signed char b5,
5151  signed char b4,
5152  signed char b3,
5153  signed char b2,
5154  signed char b1,
5155  signed char b0)
5156 {
5157  int8_t ALIGN_STRUCT(16)
5158  data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5159  (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5160  (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5161  (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5162  return (__m128i) vld1q_s8(data);
5163 }
5164 
5165 // Set packed double-precision (64-bit) floating-point elements in dst with the
5166 // supplied values.
5167 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
5168 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5169 {
5170  double ALIGN_STRUCT(16) data[2] = {e0, e1};
5171 #if defined(__aarch64__)
5172  return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5173 #else
5174  return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5175 #endif
5176 }
5177 
5178 // Broadcast double-precision (64-bit) floating-point value a to all elements of
5179 // dst.
5180 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
5181 #define _mm_set_pd1 _mm_set1_pd
5182 
5183 // Copy double-precision (64-bit) floating-point element a to the lower element
5184 // of dst, and zero the upper element.
5185 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
5186 FORCE_INLINE __m128d _mm_set_sd(double a)
5187 {
5188  return _mm_set_pd(0, a);
5189 }
5190 
5191 // Sets the 8 signed 16-bit integer values to w.
5192 //
5193 // r0 := w
5194 // r1 := w
5195 // ...
5196 // r7 := w
5197 //
5198 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5199 FORCE_INLINE __m128i _mm_set1_epi16(short w)
5200 {
5201  return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5202 }
5203 
5204 // Sets the 4 signed 32-bit integer values to i.
5205 //
5206 // r0 := i
5207 // r1 := i
5208 // r2 := i
5209 // r3 := i
5210 //
5211 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5212 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5213 {
5214  return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5215 }
5216 
5217 // Sets the 2 signed 64-bit integer values to i.
5218 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5219 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5220 {
5221  return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5222 }
5223 
5224 // Sets the 2 signed 64-bit integer values to i.
5225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
5226 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5227 {
5228  return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5229 }
5230 
5231 // Sets the 16 signed 8-bit integer values to b.
5232 //
5233 // r0 := b
5234 // r1 := b
5235 // ...
5236 // r15 := b
5237 //
5238 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5239 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5240 {
5241  return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5242 }
5243 
5244 // Broadcast double-precision (64-bit) floating-point value a to all elements of
5245 // dst.
5246 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
5247 FORCE_INLINE __m128d _mm_set1_pd(double d)
5248 {
5249 #if defined(__aarch64__)
5250  return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5251 #else
5252  return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5253 #endif
5254 }
5255 
5256 // Sets the 8 signed 16-bit integer values in reverse order.
5257 //
5258 // Return Value
5259 // r0 := w0
5260 // r1 := w1
5261 // ...
5262 // r7 := w7
5263 FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5264  short w1,
5265  short w2,
5266  short w3,
5267  short w4,
5268  short w5,
5269  short w6,
5270  short w7)
5271 {
5272  int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5273  return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5274 }
5275 
5276 // Sets the 4 signed 32-bit integer values in reverse order
5277 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5278 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5279 {
5280  int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5281  return vreinterpretq_m128i_s32(vld1q_s32(data));
5282 }
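// Illustrative note (not from the upstream sse2neon source): the "r" variants
// take arguments from the lowest element up, so
// _mm_setr_epi32(0, 1, 2, 3) produces the same vector as
// _mm_set_epi32(3, 2, 1, 0).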
5283 
5284 // Set packed 64-bit integers in dst with the supplied values in reverse order.
5285 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5286 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5287 {
5288  return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5289 }
5290 
5291 // Sets the 16 signed 8-bit integer values in reverse order.
5292 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5293 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5294  signed char b1,
5295  signed char b2,
5296  signed char b3,
5297  signed char b4,
5298  signed char b5,
5299  signed char b6,
5300  signed char b7,
5301  signed char b8,
5302  signed char b9,
5303  signed char b10,
5304  signed char b11,
5305  signed char b12,
5306  signed char b13,
5307  signed char b14,
5308  signed char b15)
5309 {
5310  int8_t ALIGN_STRUCT(16)
5311  data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5312  (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5313  (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5314  (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5315  return (__m128i) vld1q_s8(data);
5316 }
5317 
5318 // Set packed double-precision (64-bit) floating-point elements in dst with the
5319 // supplied values in reverse order.
5320 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5321 FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5322 {
5323  return _mm_set_pd(e0, e1);
5324 }
5325 
5326 // Return vector of type __m128d with all elements set to zero.
5327 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5328 FORCE_INLINE __m128d _mm_setzero_pd(void)
5329 {
5330 #if defined(__aarch64__)
5331  return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5332 #else
5333  return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5334 #endif
5335 }
5336 
5337 // Sets the 128-bit value to zero
5338 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5339 FORCE_INLINE __m128i _mm_setzero_si128(void)
5340 {
5341  return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5342 }
5343 
5344 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5345 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5346 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5347 // __constrange(0,255) int imm)
5348 #if __has_builtin(__builtin_shufflevector)
5349 #define _mm_shuffle_epi32(a, imm) \
5350  __extension__({ \
5351  int32x4_t _input = vreinterpretq_s32_m128i(a); \
5352  int32x4_t _shuf = __builtin_shufflevector( \
5353  _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5354  ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5355  vreinterpretq_m128i_s32(_shuf); \
5356  })
5357 #else // generic
5358 #define _mm_shuffle_epi32(a, imm) \
5359  __extension__({ \
5360  __m128i ret; \
5361  switch (imm) { \
5362  case _MM_SHUFFLE(1, 0, 3, 2): \
5363  ret = _mm_shuffle_epi_1032((a)); \
5364  break; \
5365  case _MM_SHUFFLE(2, 3, 0, 1): \
5366  ret = _mm_shuffle_epi_2301((a)); \
5367  break; \
5368  case _MM_SHUFFLE(0, 3, 2, 1): \
5369  ret = _mm_shuffle_epi_0321((a)); \
5370  break; \
5371  case _MM_SHUFFLE(2, 1, 0, 3): \
5372  ret = _mm_shuffle_epi_2103((a)); \
5373  break; \
5374  case _MM_SHUFFLE(1, 0, 1, 0): \
5375  ret = _mm_shuffle_epi_1010((a)); \
5376  break; \
5377  case _MM_SHUFFLE(1, 0, 0, 1): \
5378  ret = _mm_shuffle_epi_1001((a)); \
5379  break; \
5380  case _MM_SHUFFLE(0, 1, 0, 1): \
5381  ret = _mm_shuffle_epi_0101((a)); \
5382  break; \
5383  case _MM_SHUFFLE(2, 2, 1, 1): \
5384  ret = _mm_shuffle_epi_2211((a)); \
5385  break; \
5386  case _MM_SHUFFLE(0, 1, 2, 2): \
5387  ret = _mm_shuffle_epi_0122((a)); \
5388  break; \
5389  case _MM_SHUFFLE(3, 3, 3, 2): \
5390  ret = _mm_shuffle_epi_3332((a)); \
5391  break; \
5392  case _MM_SHUFFLE(0, 0, 0, 0): \
5393  ret = _mm_shuffle_epi32_splat((a), 0); \
5394  break; \
5395  case _MM_SHUFFLE(1, 1, 1, 1): \
5396  ret = _mm_shuffle_epi32_splat((a), 1); \
5397  break; \
5398  case _MM_SHUFFLE(2, 2, 2, 2): \
5399  ret = _mm_shuffle_epi32_splat((a), 2); \
5400  break; \
5401  case _MM_SHUFFLE(3, 3, 3, 3): \
5402  ret = _mm_shuffle_epi32_splat((a), 3); \
5403  break; \
5404  default: \
5405  ret = _mm_shuffle_epi32_default((a), (imm)); \
5406  break; \
5407  } \
5408  ret; \
5409  })
5410 #endif
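// Illustrative usage sketch (not from the upstream sse2neon source): the
// immediate is usually built with _MM_SHUFFLE(z, y, x, w), which selects the
// source lanes for result lanes 3..0:
//
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   __m128i rev = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes reversed
//   __m128i bcast = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 everywhere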
5411 
5412 // Shuffle double-precision (64-bit) floating-point elements using the control
5413 // in imm8, and store the results in dst.
5414 //
5415 // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5416 // dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5417 //
5418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
5419 #if __has_builtin(__builtin_shufflevector)
5420 #define _mm_shuffle_pd(a, b, imm8) \
5421  vreinterpretq_m128d_s64(__builtin_shufflevector( \
5422  vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5423  ((imm8 & 0x2) >> 1) + 2))
5424 #else
5425 #define _mm_shuffle_pd(a, b, imm8) \
5426  _mm_castsi128_pd(_mm_set_epi64x( \
5427  vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5428  vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5429 #endif
5430 
5431 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5432 // __constrange(0,255) int imm)
5433 #if __has_builtin(__builtin_shufflevector)
5434 #define _mm_shufflehi_epi16(a, imm) \
5435  __extension__({ \
5436  int16x8_t _input = vreinterpretq_s16_m128i(a); \
5437  int16x8_t _shuf = __builtin_shufflevector( \
5438  _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5439  (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5440  (((imm) >> 6) & 0x3) + 4); \
5441  vreinterpretq_m128i_s16(_shuf); \
5442  })
5443 #else // generic
5444 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5445 #endif
5446 
5447 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5448 // __constrange(0,255) int imm)
5449 #if __has_builtin(__builtin_shufflevector)
5450 #define _mm_shufflelo_epi16(a, imm) \
5451  __extension__({ \
5452  int16x8_t _input = vreinterpretq_s16_m128i(a); \
5453  int16x8_t _shuf = __builtin_shufflevector( \
5454  _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5455  (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5456  vreinterpretq_m128i_s16(_shuf); \
5457  })
5458 #else // generic
5459 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5460 #endif
5461 
5462 // Shift packed 16-bit integers in a left by count while shifting in zeros, and
5463 // store the results in dst.
5464 //
5465 // FOR j := 0 to 7
5466 // i := j*16
5467 // IF count[63:0] > 15
5468 // dst[i+15:i] := 0
5469 // ELSE
5470 // dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
5471 // FI
5472 // ENDFOR
5473 //
5474 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
5475 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5476 {
5477  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5478  if (_sse2neon_unlikely(c & ~15))
5479  return _mm_setzero_si128();
5480 
5481  int16x8_t vc = vdupq_n_s16((int16_t) c);
5482  return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5483 }
5484 
5485 // Shift packed 32-bit integers in a left by count while shifting in zeros, and
5486 // store the results in dst.
5487 //
5488 // FOR j := 0 to 3
5489 // i := j*32
5490 // IF count[63:0] > 31
5491 // dst[i+31:i] := 0
5492 // ELSE
5493 // dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
5494 // FI
5495 // ENDFOR
5496 //
5497 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
5498 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5499 {
5500  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5501  if (_sse2neon_unlikely(c & ~31))
5502  return _mm_setzero_si128();
5503 
5504  int32x4_t vc = vdupq_n_s32((int32_t) c);
5505  return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5506 }
5507 
5508 // Shift packed 64-bit integers in a left by count while shifting in zeros, and
5509 // store the results in dst.
5510 //
5511 // FOR j := 0 to 1
5512 // i := j*64
5513 // IF count[63:0] > 63
5514 // dst[i+63:i] := 0
5515 // ELSE
5516 // dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
5517 // FI
5518 // ENDFOR
5519 //
5520 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
5521 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5522 {
5523  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5524  if (_sse2neon_unlikely(c & ~63))
5525  return _mm_setzero_si128();
5526 
5527  int64x2_t vc = vdupq_n_s64((int64_t) c);
5528  return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5529 }
5530 
5531 // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5532 // store the results in dst.
5533 //
5534 // FOR j := 0 to 7
5535 // i := j*16
5536 // IF imm8[7:0] > 15
5537 // dst[i+15:i] := 0
5538 // ELSE
5539 // dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
5540 // FI
5541 // ENDFOR
5542 //
5543 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
5544 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5545 {
5546  if (_sse2neon_unlikely(imm & ~15))
5547  return _mm_setzero_si128();
5548  return vreinterpretq_m128i_s16(
5549  vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5550 }
5551 
5552 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5553 // store the results in dst.
5554 //
5555 // FOR j := 0 to 3
5556 // i := j*32
5557 // IF imm8[7:0] > 31
5558 // dst[i+31:i] := 0
5559 // ELSE
5560 // dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
5561 // FI
5562 // ENDFOR
5563 //
5564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
5565 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5566 {
5567  if (_sse2neon_unlikely(imm & ~31))
5568  return _mm_setzero_si128();
5569  return vreinterpretq_m128i_s32(
5570  vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5571 }
5572 
5573 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5574 // store the results in dst.
5575 //
5576 // FOR j := 0 to 1
5577 // i := j*64
5578 // IF imm8[7:0] > 63
5579 // dst[i+63:i] := 0
5580 // ELSE
5581 // dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
5582 // FI
5583 // ENDFOR
5584 //
5585 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
5586 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5587 {
5588  if (_sse2neon_unlikely(imm & ~63))
5589  return _mm_setzero_si128();
5590  return vreinterpretq_m128i_s64(
5591  vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5592 }
5593 
5594 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
5595 // dst.
5596 //
5597 // tmp := imm8[7:0]
5598 // IF tmp > 15
5599 // tmp := 16
5600 // FI
5601 // dst[127:0] := a[127:0] << (tmp*8)
5602 //
5603 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
5604 FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
5605 {
5606  if (_sse2neon_unlikely(imm & ~15))
5607  return _mm_setzero_si128();
5608  uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
5609  return vreinterpretq_m128i_u8(
5610  vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
5611 }
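// Illustrative usage sketch (not from the upstream sse2neon source): the
// byte-wise shift moves whole lanes, so shifting left by 4 bytes moves every
// 32-bit lane up one position and zeroes the lowest lane:
//
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   __m128i s = _mm_slli_si128(v, 4); // lanes from low to high: {0, 0, 1, 2}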
5612 
5613 // Compute the square root of packed double-precision (64-bit) floating-point
5614 // elements in a, and store the results in dst.
5615 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
5616 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5617 {
5618 #if defined(__aarch64__)
5619  return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5620 #else
5621  double a0 = sqrt(((double *) &a)[0]);
5622  double a1 = sqrt(((double *) &a)[1]);
5623  return _mm_set_pd(a1, a0);
5624 #endif
5625 }
5626 
5627 // Compute the square root of the lower double-precision (64-bit) floating-point
5628 // element in b, store the result in the lower element of dst, and copy the
5629 // upper element from a to the upper element of dst.
5630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
5631 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5632 {
5633 #if defined(__aarch64__)
5634  return _mm_move_sd(a, _mm_sqrt_pd(b));
5635 #else
5636  return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5637 #endif
5638 }
5639 
5640 // Shift packed 16-bit integers in a right by count while shifting in sign bits,
5641 // and store the results in dst.
5642 //
5643 // FOR j := 0 to 7
5644 // i := j*16
5645 // IF count[63:0] > 15
5646 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5647 // ELSE
5648 // dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
5649 // FI
5650 // ENDFOR
5651 //
5652 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
5653 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5654 {
5655  int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5656  if (_sse2neon_unlikely(c & ~15))
5657  return _mm_cmplt_epi16(a, _mm_setzero_si128());
5658  return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5659 }
5660 
5661 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
5662 // and store the results in dst.
5663 //
5664 // FOR j := 0 to 3
5665 // i := j*32
5666 // IF count[63:0] > 31
5667 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5668 // ELSE
5669 // dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
5670 // FI
5671 // ENDFOR
5672 //
5673 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
5674 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5675 {
5676  int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5677  if (_sse2neon_unlikely(c & ~31))
5678  return _mm_cmplt_epi32(a, _mm_setzero_si128());
5679  return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5680 }
5681 
5682 // Shift packed 16-bit integers in a right by imm8 while shifting in sign
5683 // bits, and store the results in dst.
5684 //
5685 // FOR j := 0 to 7
5686 // i := j*16
5687 // IF imm8[7:0] > 15
5688 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5689 // ELSE
5690 // dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
5691 // FI
5692 // ENDFOR
5693 //
5694 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5695 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5696 {
5697  const int count = (imm & ~15) ? 15 : imm;
5698  return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5699 }
5700 
5701 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5702 // and store the results in dst.
5703 //
5704 // FOR j := 0 to 3
5705 // i := j*32
5706 // IF imm8[7:0] > 31
5707 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5708 // ELSE
5709 // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5710 // FI
5711 // ENDFOR
5712 //
5713 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
5714 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5715 #define _mm_srai_epi32(a, imm) \
5716  __extension__({ \
5717  __m128i ret; \
5718  if (_sse2neon_unlikely((imm) == 0)) { \
5719  ret = a; \
5720  } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5721  ret = vreinterpretq_m128i_s32( \
5722  vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5723  } else { \
5724  ret = vreinterpretq_m128i_s32( \
5725  vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5726  } \
5727  ret; \
5728  })
5729 
5730 // Shift packed 16-bit integers in a right by count while shifting in zeros, and
5731 // store the results in dst.
5732 //
5733 // FOR j := 0 to 7
5734 // i := j*16
5735 // IF count[63:0] > 15
5736 // dst[i+15:i] := 0
5737 // ELSE
5738 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
5739 // FI
5740 // ENDFOR
5741 //
5742 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
5743 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5744 {
5745  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5746  if (_sse2neon_unlikely(c & ~15))
5747  return _mm_setzero_si128();
5748 
5749  int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5750  return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5751 }
5752 
5753 // Shift packed 32-bit integers in a right by count while shifting in zeros, and
5754 // store the results in dst.
5755 //
5756 // FOR j := 0 to 3
5757 // i := j*32
5758 // IF count[63:0] > 31
5759 // dst[i+31:i] := 0
5760 // ELSE
5761 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
5762 // FI
5763 // ENDFOR
5764 //
5765 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
5766 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5767 {
5768  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5769  if (_sse2neon_unlikely(c & ~31))
5770  return _mm_setzero_si128();
5771 
5772  int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5773  return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5774 }
5775 
5776 // Shift packed 64-bit integers in a right by count while shifting in zeros, and
5777 // store the results in dst.
5778 //
5779 // FOR j := 0 to 1
5780 // i := j*64
5781 // IF count[63:0] > 63
5782 // dst[i+63:i] := 0
5783 // ELSE
5784 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
5785 // FI
5786 // ENDFOR
5787 //
5788 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
5789 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5790 {
5791  uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5792  if (_sse2neon_unlikely(c & ~63))
5793  return _mm_setzero_si128();
5794 
5795  int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5796  return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5797 }
5798 
5799 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5800 // store the results in dst.
5801 //
5802 // FOR j := 0 to 7
5803 // i := j*16
5804 // IF imm8[7:0] > 15
5805 // dst[i+15:i] := 0
5806 // ELSE
5807 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5808 // FI
5809 // ENDFOR
5810 //
5811 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
5812 #define _mm_srli_epi16(a, imm) \
5813  __extension__({ \
5814  __m128i ret; \
5815  if (_sse2neon_unlikely((imm) & ~15)) { \
5816  ret = _mm_setzero_si128(); \
5817  } else { \
5818  ret = vreinterpretq_m128i_u16( \
5819  vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5820  } \
5821  ret; \
5822  })
5823 
5824 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5825 // store the results in dst.
5826 //
5827 // FOR j := 0 to 3
5828 // i := j*32
5829 // IF imm8[7:0] > 31
5830 // dst[i+31:i] := 0
5831 // ELSE
5832 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
5833 // FI
5834 // ENDFOR
5835 //
5836 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
5837 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5838 #define _mm_srli_epi32(a, imm) \
5839  __extension__({ \
5840  __m128i ret; \
5841  if (_sse2neon_unlikely((imm) & ~31)) { \
5842  ret = _mm_setzero_si128(); \
5843  } else { \
5844  ret = vreinterpretq_m128i_u32( \
5845  vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5846  } \
5847  ret; \
5848  })
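// Illustrative usage sketch (not from the upstream sse2neon source): the
// arithmetic right shifts (_mm_srai_*) replicate the sign bit, while the
// logical right shifts (_mm_srli_*) shift in zeros:
//
//   __m128i v = _mm_set1_epi32(-16);    // 0xFFFFFFF0 per lane
//   __m128i sa = _mm_srai_epi32(v, 2);  // 0xFFFFFFFC (-4) per lane
//   __m128i sl = _mm_srli_epi32(v, 2);  // 0x3FFFFFFC per lane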
5849 
5850 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5851 // store the results in dst.
5852 //
5853 // FOR j := 0 to 1
5854 // i := j*64
5855 // IF imm8[7:0] > 63
5856 // dst[i+63:i] := 0
5857 // ELSE
5858 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
5859 // FI
5860 // ENDFOR
5861 //
5862 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
5863 #define _mm_srli_epi64(a, imm) \
5864  __extension__({ \
5865  __m128i ret; \
5866  if (_sse2neon_unlikely((imm) & ~63)) { \
5867  ret = _mm_setzero_si128(); \
5868  } else { \
5869  ret = vreinterpretq_m128i_u64( \
5870  vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5871  } \
5872  ret; \
5873  })
5874 
5875 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
5876 // dst.
5877 //
5878 // tmp := imm8[7:0]
5879 // IF tmp > 15
5880 // tmp := 16
5881 // FI
5882 // dst[127:0] := a[127:0] >> (tmp*8)
5883 //
5884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
5885 FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
5886 {
5887  if (_sse2neon_unlikely(imm & ~15))
5888  return _mm_setzero_si128();
5889  uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
5890  return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
5891 }
5892 
5893 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5894 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5895 // or a general-protection exception may be generated.
5896 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
5897 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5898 {
5899 #if defined(__aarch64__)
5900  vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5901 #else
5902  vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5903 #endif
5904 }
5905 
5906 // Store the lower double-precision (64-bit) floating-point element from a into
5907 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5908 // boundary or a general-protection exception may be generated.
5909 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
5910 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5911 {
5912 #if defined(__aarch64__)
5913  float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5914  vst1q_f64((float64_t *) mem_addr,
5915  vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5916 #else
5917  float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5918  vst1q_f32((float32_t *) mem_addr,
5919  vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5920 #endif
5921 }
5922 
5923 // Store the lower double-precision (64-bit) floating-point element from a into
5924 // memory. mem_addr does not need to be aligned on any particular boundary.
5925 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
5926 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5927 {
5928 #if defined(__aarch64__)
5929  vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5930 #else
5931  vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5932 #endif
5933 }
5934 
5935 // Stores four 32-bit integer values (as a __m128i value) at the address p.
5936 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5937 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5938 {
5939  vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5940 }
5941 
5942 // Store the lower double-precision (64-bit) floating-point element from a into
5943 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5944 // boundary or a general-protection exception may be generated.
5945 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
5946 #define _mm_store1_pd _mm_store_pd1
5947 
5948 // Store the upper double-precision (64-bit) floating-point element from a into
5949 // memory.
5950 //
5951 // MEM[mem_addr+63:mem_addr] := a[127:64]
5952 //
5953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
5954 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5955 {
5956 #if defined(__aarch64__)
5957  vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5958 #else
5959  vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5960 #endif
5961 }
5962 
5963 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5964 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5965 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5966 {
5967  vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5968 }
5969 
5970 // Store the lower double-precision (64-bit) floating-point element from a into
5971 // memory.
5972 //
5973 // MEM[mem_addr+63:mem_addr] := a[63:0]
5974 //
5975 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
5976 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5977 {
5978 #if defined(__aarch64__)
5979  vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5980 #else
5981  vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5982 #endif
5983 }
5984 
5985 // Store 2 double-precision (64-bit) floating-point elements from a into memory
5986 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5987 // general-protection exception may be generated.
5988 //
5989 // MEM[mem_addr+63:mem_addr] := a[127:64]
5990 // MEM[mem_addr+127:mem_addr+64] := a[63:0]
5991 //
5992 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
5993 FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5994 {
5995  float32x4_t f = vreinterpretq_f32_m128d(a);
5996  _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5997 }
5998 
5999 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6000 // elements) from a into memory. mem_addr does not need to be aligned on any
6001 // particular boundary.
6002 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
6003 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
6004 {
6005  _mm_store_pd(mem_addr, a);
6006 }
6007 
6008 // Stores 128-bits of integer data a at the address p.
6009 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
6010 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
6011 {
6012  vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
6013 }
6014 
6015 // Stores 32-bits of integer data a at the address p.
6016 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
6017 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
6018 {
6019  vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
6020 }
6021 
6022 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
6023 // elements) from a into memory using a non-temporal memory hint. mem_addr must
6024 // be aligned on a 16-byte boundary or a general-protection exception may be
6025 // generated.
6026 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
6027 FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
6028 {
6029 #if __has_builtin(__builtin_nontemporal_store)
6030  __builtin_nontemporal_store(a, (float32x4_t *) p);
6031 #elif defined(__aarch64__)
6032  vst1q_f64(p, vreinterpretq_f64_m128d(a));
6033 #else
6034  vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
6035 #endif
6036 }
6037 
6038 // Stores the data in a to the address p without polluting the caches. If the
6039 // cache line containing address p is already in the cache, the cache will be
6040 // updated.
6041 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6042 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6043 {
6044 #if __has_builtin(__builtin_nontemporal_store)
6045  __builtin_nontemporal_store(a, p);
6046 #else
6047  vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6048 #endif
6049 }
6050 
6051 // Store 32-bit integer a into memory using a non-temporal hint to minimize
6052 // cache pollution. If the cache line containing address mem_addr is already in
6053 // the cache, the cache will be updated.
6054 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
6055 FORCE_INLINE void _mm_stream_si32(int *p, int a)
6056 {
6057  vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6058 }
6059 
6060 // Store 64-bit integer a into memory using a non-temporal hint to minimize
6061 // cache pollution. If the cache line containing address mem_addr is already in
6062 // the cache, the cache will be updated.
6063 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
6064 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
6065 {
6066  vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6067 }
6068 
6069 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
6070 // store the results in dst.
6071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
6072 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
6073 {
6074  return vreinterpretq_m128i_s16(
6075  vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6076 }
6077 
6078 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
6079 // unsigned 32-bit integers of a.
6080 //
6081 // r0 := a0 - b0
6082 // r1 := a1 - b1
6083 // r2 := a2 - b2
6084 // r3 := a3 - b3
6085 //
6086 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
6087 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
6088 {
6089  return vreinterpretq_m128i_s32(
6090  vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6091 }
6092 
6093 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
6094 // and store the results in dst.
6095 // r0 := a0 - b0
6096 // r1 := a1 - b1
6097 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
6098 {
6099  return vreinterpretq_m128i_s64(
6100  vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
6101 }
6102 
6103 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
6104 // store the results in dst.
6105 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
6106 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
6107 {
6108  return vreinterpretq_m128i_s8(
6109  vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6110 }
6111 
6112 // Subtract packed double-precision (64-bit) floating-point elements in b from
6113 // packed double-precision (64-bit) floating-point elements in a, and store the
6114 // results in dst.
6115 //
6116 // FOR j := 0 to 1
6117 // i := j*64
6118 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
6119 // ENDFOR
6120 //
6121 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
6122 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
6123 {
6124 #if defined(__aarch64__)
6125  return vreinterpretq_m128d_f64(
6126  vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6127 #else
6128  double *da = (double *) &a;
6129  double *db = (double *) &b;
6130  double c[2];
6131  c[0] = da[0] - db[0];
6132  c[1] = da[1] - db[1];
6133  return vld1q_f32((float32_t *) c);
6134 #endif
6135 }
6136 
6137 // Subtract the lower double-precision (64-bit) floating-point element in b from
6138 // the lower double-precision (64-bit) floating-point element in a, store the
6139 // result in the lower element of dst, and copy the upper element from a to the
6140 // upper element of dst.
6141 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
6142 FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
6143 {
6144  return _mm_move_sd(a, _mm_sub_pd(a, b));
6145 }
6146 
6147 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
6148 //
6149 // dst[63:0] := a[63:0] - b[63:0]
6150 //
6151 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
6152 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
6153 {
6154  return vreinterpret_m64_s64(
6155  vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
6156 }
6157 
6158 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
6159 // of a and saturates.
6160 //
6161 // r0 := SignedSaturate(a0 - b0)
6162 // r1 := SignedSaturate(a1 - b1)
6163 // ...
6164 // r7 := SignedSaturate(a7 - b7)
6165 //
6166 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
6167 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
6168 {
6169  return vreinterpretq_m128i_s16(
6170  vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6171 }
6172 
6173 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
6174 // of a and saturates.
6175 //
6176 // r0 := SignedSaturate(a0 - b0)
6177 // r1 := SignedSaturate(a1 - b1)
6178 // ...
6179 // r15 := SignedSaturate(a15 - b15)
6180 //
6181 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
6182 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
6183 {
6184  return vreinterpretq_m128i_s8(
6185  vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6186 }
6187 
6188 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
6189 // integers of a and saturates.
6190 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
6191 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
6192 {
6193  return vreinterpretq_m128i_u16(
6194  vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
6195 }
6196 
6197 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
6198 // integers of a and saturates.
6199 //
6200 // r0 := UnsignedSaturate(a0 - b0)
6201 // r1 := UnsignedSaturate(a1 - b1)
6202 // ...
6203 // r15 := UnsignedSaturate(a15 - b15)
6204 //
6205 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
6206 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
6207 {
6208  return vreinterpretq_m128i_u8(
6209  vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
6210 }
6211 
6212 #define _mm_ucomieq_sd _mm_comieq_sd
6213 #define _mm_ucomige_sd _mm_comige_sd
6214 #define _mm_ucomigt_sd _mm_comigt_sd
6215 #define _mm_ucomile_sd _mm_comile_sd
6216 #define _mm_ucomilt_sd _mm_comilt_sd
6217 #define _mm_ucomineq_sd _mm_comineq_sd
6218 
6219 // Return vector of type __m128d with undefined elements.
6220 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
6221 FORCE_INLINE __m128d _mm_undefined_pd(void)
6222 {
6223 #if defined(__GNUC__) || defined(__clang__)
6224 #pragma GCC diagnostic push
6225 #pragma GCC diagnostic ignored "-Wuninitialized"
6226 #endif
6227  __m128d a;
6228  return a;
6229 #if defined(__GNUC__) || defined(__clang__)
6230 #pragma GCC diagnostic pop
6231 #endif
6232 }
6233 
6234 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6235 // upper 4 signed or unsigned 16-bit integers in b.
6236 //
6237 // r0 := a4
6238 // r1 := b4
6239 // r2 := a5
6240 // r3 := b5
6241 // r4 := a6
6242 // r5 := b6
6243 // r6 := a7
6244 // r7 := b7
6245 //
6246 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6247 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6248 {
6249 #if defined(__aarch64__)
6250  return vreinterpretq_m128i_s16(
6251  vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6252 #else
6253  int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6254  int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6255  int16x4x2_t result = vzip_s16(a1, b1);
6256  return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6257 #endif
6258 }
6259 
6260 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6261 // upper 2 signed or unsigned 32-bit integers in b.
6262 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6263 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6264 {
6265 #if defined(__aarch64__)
6266  return vreinterpretq_m128i_s32(
6267  vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6268 #else
6269  int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6270  int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6271  int32x2x2_t result = vzip_s32(a1, b1);
6272  return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6273 #endif
6274 }
6275 
6276 // Interleaves the upper signed or unsigned 64-bit integer in a with the
6277 // upper signed or unsigned 64-bit integer in b.
6278 //
6279 // r0 := a1
6280 // r1 := b1
6281 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6282 {
6283  int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6284  int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6285  return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6286 }
6287 
6288 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6289 // 8 signed or unsigned 8-bit integers in b.
6290 //
6291 // r0 := a8
6292 // r1 := b8
6293 // r2 := a9
6294 // r3 := b9
6295 // ...
6296 // r14 := a15
6297 // r15 := b15
6298 //
6299 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6300 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6301 {
6302 #if defined(__aarch64__)
6303  return vreinterpretq_m128i_s8(
6304  vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6305 #else
6306  int8x8_t a1 =
6307  vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6308  int8x8_t b1 =
6309  vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6310  int8x8x2_t result = vzip_s8(a1, b1);
6311  return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6312 #endif
6313 }
6314 
6315 // Unpack and interleave double-precision (64-bit) floating-point elements from
6316 // the high half of a and b, and store the results in dst.
6317 //
6318 // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6319 // dst[63:0] := src1[127:64]
6320 // dst[127:64] := src2[127:64]
6321 // RETURN dst[127:0]
6322 // }
6323 // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6324 //
6325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6326 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6327 {
6328 #if defined(__aarch64__)
6329  return vreinterpretq_m128d_f64(
6330  vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6331 #else
6332  return vreinterpretq_m128d_s64(
6333  vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6334  vget_high_s64(vreinterpretq_s64_m128d(b))));
6335 #endif
6336 }
6337 
6338 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6339 // lower 4 signed or unsigned 16-bit integers in b.
6340 //
6341 // r0 := a0
6342 // r1 := b0
6343 // r2 := a1
6344 // r3 := b1
6345 // r4 := a2
6346 // r5 := b2
6347 // r6 := a3
6348 // r7 := b3
6349 //
6350 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6351 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6352 {
6353 #if defined(__aarch64__)
6354  return vreinterpretq_m128i_s16(
6355  vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6356 #else
6357  int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6358  int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6359  int16x4x2_t result = vzip_s16(a1, b1);
6360  return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6361 #endif
6362 }
6363 
6364 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
6365 // lower 2 signed or unsigned 32-bit integers in b.
6366 //
6367 // r0 := a0
6368 // r1 := b0
6369 // r2 := a1
6370 // r3 := b1
6371 //
6372 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6374 {
6375 #if defined(__aarch64__)
6376  return vreinterpretq_m128i_s32(
6377  vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6378 #else
6379  int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6380  int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6381  int32x2x2_t result = vzip_s32(a1, b1);
6382  return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6383 #endif
6384 }
6385 
6386 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6387 {
6388  int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6389  int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6390  return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6391 }
6392 
6393 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6394 // 8 signed or unsigned 8-bit integers in b.
6395 //
6396 // r0 := a0
6397 // r1 := b0
6398 // r2 := a1
6399 // r3 := b1
6400 // ...
6401 // r14 := a7
6402 // r15 := b7
6403 //
6404 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6406 {
6407 #if defined(__aarch64__)
6408  return vreinterpretq_m128i_s8(
6409  vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6410 #else
6411  int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6412  int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6413  int8x8x2_t result = vzip_s8(a1, b1);
6414  return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6415 #endif
6416 }
6417 
6418 // Unpack and interleave double-precision (64-bit) floating-point elements from
6419 // the low half of a and b, and store the results in dst.
6420 //
6421 // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6422 // dst[63:0] := src1[63:0]
6423 // dst[127:64] := src2[63:0]
6424 // RETURN dst[127:0]
6425 // }
6426 // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6427 //
6428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6430 {
6431 #if defined(__aarch64__)
6432  return vreinterpretq_m128d_f64(
6433  vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6434 #else
6435  return vreinterpretq_m128d_s64(
6436  vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6437  vget_low_s64(vreinterpretq_s64_m128d(b))));
6438 #endif
6439 }
6440 
6441 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6442 // elements in a and b, and store the results in dst.
6443 //
6444 // FOR j := 0 to 1
6445 // i := j*64
6446 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6447 // ENDFOR
6448 //
6449 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6450 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6451 {
6452  return vreinterpretq_m128d_s64(
6453  veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6454 }
6455 
6456 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6457 // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6458 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6459 {
6460  return vreinterpretq_m128i_s32(
6461  veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6462 }
6463 
6464 /* SSE3 */
6465 
6466 // Alternatively add and subtract packed double-precision (64-bit)
6467 // floating-point elements in a to/from packed elements in b, and store the
6468 // results in dst.
6469 //
6470 // FOR j := 0 to 1
6471 // i := j*64
6472 // IF ((j & 1) == 0)
6473 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
6474 // ELSE
6475 // dst[i+63:i] := a[i+63:i] + b[i+63:i]
6476 // FI
6477 // ENDFOR
6478 //
6479 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6481 {
6482  _sse2neon_const __m128d mask = _mm_set_pd(1.0, -1.0);
6483 #if defined(__aarch64__)
6484  return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6485  vreinterpretq_f64_m128d(b),
6486  vreinterpretq_f64_m128d(mask)));
6487 #else
6488  return _mm_add_pd(_mm_mul_pd(b, mask), a);
6489 #endif
6490 }
6491 
6492 // Alternatively add and subtract packed single-precision (32-bit)
6493 // floating-point elements in a to/from packed elements in b, and store the
6494 // results in dst.
6495 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
6497 {
6498  _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6499 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6500  return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6501  vreinterpretq_f32_m128(mask),
6502  vreinterpretq_f32_m128(b)));
6503 #else
6504  return _mm_add_ps(_mm_mul_ps(b, mask), a);
6505 #endif
6506 }
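
// Illustrative usage sketch (not part of the upstream file); assumes the
// _mm_setr_ps helper defined earlier in this header:
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
//   __m128 r = _mm_addsub_ps(a, b);  // r = {-9.0f, 22.0f, -27.0f, 44.0f}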
6507 
6508 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6509 // elements in a and b, and pack the results in dst.
6510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
6512 {
6513 #if defined(__aarch64__)
6514  return vreinterpretq_m128d_f64(
6515  vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6516 #else
6517  double *da = (double *) &a;
6518  double *db = (double *) &b;
6519  double c[] = {da[0] + da[1], db[0] + db[1]};
6520  return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6521 #endif
6522 }
6523 
6524 // Computes the pairwise (horizontal) add of the single-precision
6525 // floating-point values in a and b.
6526 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6528 {
6529 #if defined(__aarch64__)
6530  return vreinterpretq_m128_f32(
6531  vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6532 #else
6533  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6534  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6535  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6536  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6537  return vreinterpretq_m128_f32(
6538  vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6539 #endif
6540 }
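
// Illustrative usage sketch (not from the upstream sources):
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r = _mm_hadd_ps(v, v);  // r = {3.0f, 7.0f, 3.0f, 7.0f}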
6541 
6542 // Horizontally subtract adjacent pairs of double-precision (64-bit)
6543 // floating-point elements in a and b, and pack the results in dst.
6544 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
6546 {
6547 #if defined(__aarch64__)
6548  float64x2_t a = vreinterpretq_f64_m128d(_a);
6549  float64x2_t b = vreinterpretq_f64_m128d(_b);
6550  return vreinterpretq_m128d_f64(
6551  vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6552 #else
6553  double *da = (double *) &_a;
6554  double *db = (double *) &_b;
6555  double c[] = {da[0] - da[1], db[0] - db[1]};
6556  return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6557 #endif
6558 }
6559 
6560 // Horizontally subtract adjacent pairs of single-precision (32-bit)
6561 // floating-point elements in a and b, and pack the results in dst.
6562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
6564 {
6565  float32x4_t a = vreinterpretq_f32_m128(_a);
6566  float32x4_t b = vreinterpretq_f32_m128(_b);
6567 #if defined(__aarch64__)
6568  return vreinterpretq_m128_f32(
6569  vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6570 #else
6571  float32x4x2_t c = vuzpq_f32(a, b);
6572  return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6573 #endif
6574 }
6575 
6576 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6577 // may perform better than _mm_loadu_si128 when the data crosses a cache line
6578 // boundary.
6579 //
6580 // dst[127:0] := MEM[mem_addr+127:mem_addr]
6581 //
6582 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
6583 #define _mm_lddqu_si128 _mm_loadu_si128
6584 
6585 // Load a double-precision (64-bit) floating-point element from memory into both
6586 // elements of dst.
6587 //
6588 // dst[63:0] := MEM[mem_addr+63:mem_addr]
6589 // dst[127:64] := MEM[mem_addr+63:mem_addr]
6590 //
6591 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
6592 #define _mm_loaddup_pd _mm_load1_pd
6593 
6594 // Duplicate the low double-precision (64-bit) floating-point element from a,
6595 // and store the results in dst.
6596 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6598 {
6599 #if defined(__aarch64__)
6600  return vreinterpretq_m128d_f64(
6601  vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6602 #else
6603  return vreinterpretq_m128d_u64(
6604  vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6605 #endif
6606 }
6607 
6608 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
6609 // from a, and store the results in dst.
6610 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
6612 {
6613 #if __has_builtin(__builtin_shufflevector)
6614  return vreinterpretq_m128_f32(__builtin_shufflevector(
6615  vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6616 #else
6617  float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6618  float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6619  float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6620  return vreinterpretq_m128_f32(vld1q_f32(data));
6621 #endif
6622 }
6623 
6624 // Duplicate even-indexed single-precision (32-bit) floating-point elements
6625 // from a, and store the results in dst.
6626 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
6628 {
6629 #if __has_builtin(__builtin_shufflevector)
6630  return vreinterpretq_m128_f32(__builtin_shufflevector(
6631  vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6632 #else
6633  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6634  float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6635  float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6636  return vreinterpretq_m128_f32(vld1q_f32(data));
6637 #endif
6638 }
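
// Illustrative usage sketch (not from the upstream sources) covering the two
// duplication intrinsics above:
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 h = _mm_movehdup_ps(v);  // h = {2.0f, 2.0f, 4.0f, 4.0f}
//   __m128 l = _mm_moveldup_ps(v);  // l = {1.0f, 1.0f, 3.0f, 3.0f}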
6639 
6640 /* SSSE3 */
6641 
6642 // Compute the absolute value of packed signed 16-bit integers in a, and store
6643 // the unsigned results in dst.
6644 //
6645 // FOR j := 0 to 7
6646 // i := j*16
6647 // dst[i+15:i] := ABS(a[i+15:i])
6648 // ENDFOR
6649 //
6650 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6652 {
6653  return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6654 }
6655 
6656 // Compute the absolute value of packed signed 32-bit integers in a, and store
6657 // the unsigned results in dst.
6658 //
6659 // FOR j := 0 to 3
6660 // i := j*32
6661 // dst[i+31:i] := ABS(a[i+31:i])
6662 // ENDFOR
6663 //
6664 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6666 {
6667  return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6668 }
6669 
6670 // Compute the absolute value of packed signed 8-bit integers in a, and store
6671 // the unsigned results in dst.
6672 //
6673 // FOR j := 0 to 15
6674 // i := j*8
6675 // dst[i+7:i] := ABS(a[i+7:i])
6676 // ENDFOR
6677 //
6678 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6680 {
6681  return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6682 }
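
// Illustrative usage sketch (not from the upstream sources); assumes the
// _mm_set1_epi8 helper defined earlier in this header:
//   __m128i v = _mm_set1_epi8(-5);
//   __m128i r = _mm_abs_epi8(v);  // every byte of r is 5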
6683 
6684 // Compute the absolute value of packed signed 16-bit integers in a, and store
6685 // the unsigned results in dst.
6686 //
6687 // FOR j := 0 to 3
6688 // i := j*16
6689 // dst[i+15:i] := ABS(a[i+15:i])
6690 // ENDFOR
6691 //
6692 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6694 {
6695  return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6696 }
6697 
6698 // Compute the absolute value of packed signed 32-bit integers in a, and store
6699 // the unsigned results in dst.
6700 //
6701 // FOR j := 0 to 1
6702 // i := j*32
6703 // dst[i+31:i] := ABS(a[i+31:i])
6704 // ENDFOR
6705 //
6706 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6708 {
6709  return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6710 }
6711 
6712 // Compute the absolute value of packed signed 8-bit integers in a, and store
6713 // the unsigned results in dst.
6714 //
6715 // FOR j := 0 to 7
6716 // i := j*8
6717 // dst[i+7:i] := ABS(a[i+7:i])
6718 // ENDFOR
6719 //
6720 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6722 {
6723  return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6724 }
6725 
6726 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6727 // the result right by imm8 bytes, and store the low 16 bytes in dst.
6728 //
6729 // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6730 // dst[127:0] := tmp[127:0]
6731 //
6732 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
6733 FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
6734 {
6735  if (_sse2neon_unlikely(imm & ~31))
6736  return _mm_setzero_si128();
6737  int idx;
6738  uint8x16_t tmp[2];
6739  if (imm >= 16) {
6740  idx = imm - 16;
6741  tmp[0] = vreinterpretq_u8_m128i(a);
6742  tmp[1] = vdupq_n_u8(0);
6743  } else {
6744  idx = imm;
6745  tmp[0] = vreinterpretq_u8_m128i(b);
6746  tmp[1] = vreinterpretq_u8_m128i(a);
6747  }
6748  return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
6749 }
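
// Illustrative usage sketch (not from the upstream sources): with b holding
// bytes {0, 1, ..., 15} and a holding bytes {16, 17, ..., 31},
//   __m128i r = _mm_alignr_epi8(a, b, 4);
// leaves r holding bytes {4, 5, ..., 15, 16, 17, 18, 19}.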
6750 
6751 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6752 // the result right by imm8 bytes, and store the low 8 bytes in dst.
6753 //
6754 // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6755 // dst[63:0] := tmp[63:0]
6756 //
6757 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
6758 #define _mm_alignr_pi8(a, b, imm) \
6759  __extension__({ \
6760  __m64 ret; \
6761  if (_sse2neon_unlikely((imm) >= 16)) { \
6762  ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6763  } else { \
6764  uint8x8_t tmp_low, tmp_high; \
6765  if ((imm) >= 8) { \
6766  const int idx = (imm) -8; \
6767  tmp_low = vreinterpret_u8_m64(a); \
6768  tmp_high = vdup_n_u8(0); \
6769  ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6770  } else { \
6771  const int idx = (imm); \
6772  tmp_low = vreinterpret_u8_m64(b); \
6773  tmp_high = vreinterpret_u8_m64(a); \
6774  ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6775  } \
6776  } \
6777  ret; \
6778  })
6779 
6780 // Computes the pairwise (horizontal) add of the 16-bit signed or unsigned
6781 // integer values in a and b.
6783 {
6784  int16x8_t a = vreinterpretq_s16_m128i(_a);
6785  int16x8_t b = vreinterpretq_s16_m128i(_b);
6786 #if defined(__aarch64__)
6787  return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6788 #else
6789  return vreinterpretq_m128i_s16(
6790  vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6791  vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6792 #endif
6793 }
6794 
6795 // Computes the pairwise (horizontal) add of the 32-bit signed or unsigned
6796 // integer values in a and b.
6798 {
6799  int32x4_t a = vreinterpretq_s32_m128i(_a);
6800  int32x4_t b = vreinterpretq_s32_m128i(_b);
6801  return vreinterpretq_m128i_s32(
6802  vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6803  vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6804 }
6805 
6806 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6807 // signed 16-bit results in dst.
6808 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6810 {
6811  return vreinterpret_m64_s16(
6812  vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6813 }
6814 
6815 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6816 // signed 32-bit results in dst.
6817 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6819 {
6820  return vreinterpret_m64_s32(
6821  vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6822 }
6823 
6824 // Computes the saturated pairwise (horizontal) add of the 16-bit signed
6825 // integer values in a and b.
6827 {
6828 #if defined(__aarch64__)
6829  int16x8_t a = vreinterpretq_s16_m128i(_a);
6830  int16x8_t b = vreinterpretq_s16_m128i(_b);
6831  return vreinterpretq_s64_s16(
6832  vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6833 #else
6834  int32x4_t a = vreinterpretq_s32_m128i(_a);
6835  int32x4_t b = vreinterpretq_s32_m128i(_b);
6836  // Interleave using vshrn/vmovn
6837  // [a0|a2|a4|a6|b0|b2|b4|b6]
6838  // [a1|a3|a5|a7|b1|b3|b5|b7]
6839  int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6840  int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6841  // Saturated add
6842  return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6843 #endif
6844 }
6845 
6846 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6847 // saturation, and pack the signed 16-bit results in dst.
6848 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6850 {
6851  int16x4_t a = vreinterpret_s16_m64(_a);
6852  int16x4_t b = vreinterpret_s16_m64(_b);
6853 #if defined(__aarch64__)
6854  return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6855 #else
6856  int16x4x2_t res = vuzp_s16(a, b);
6857  return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6858 #endif
6859 }
6860 
6861 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6862 // the signed 16-bit results in dst.
6863 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
6865 {
6866  int16x8_t a = vreinterpretq_s16_m128i(_a);
6867  int16x8_t b = vreinterpretq_s16_m128i(_b);
6868 #if defined(__aarch64__)
6869  return vreinterpretq_m128i_s16(
6870  vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6871 #else
6872  int16x8x2_t c = vuzpq_s16(a, b);
6873  return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6874 #endif
6875 }
6876 
6877 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6878 // the signed 32-bit results in dst.
6879 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
6881 {
6882  int32x4_t a = vreinterpretq_s32_m128i(_a);
6883  int32x4_t b = vreinterpretq_s32_m128i(_b);
6884 #if defined(__aarch64__)
6885  return vreinterpretq_m128i_s32(
6886  vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6887 #else
6888  int32x4x2_t c = vuzpq_s32(a, b);
6889  return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6890 #endif
6891 }
6892 
6893 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6894 // the signed 16-bit results in dst.
6895 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6897 {
6898  int16x4_t a = vreinterpret_s16_m64(_a);
6899  int16x4_t b = vreinterpret_s16_m64(_b);
6900 #if defined(__aarch64__)
6901  return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6902 #else
6903  int16x4x2_t c = vuzp_s16(a, b);
6904  return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6905 #endif
6906 }
6907 
6908 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6909 // the signed 32-bit results in dst.
6910 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
6912 {
6913  int32x2_t a = vreinterpret_s32_m64(_a);
6914  int32x2_t b = vreinterpret_s32_m64(_b);
6915 #if defined(__aarch64__)
6916  return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6917 #else
6918  int32x2x2_t c = vuzp_s32(a, b);
6919  return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6920 #endif
6921 }
6922 
6923 // Computes the saturated pairwise (horizontal) difference of the 16-bit
6924 // signed integer values in a and b.
6925 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6927 {
6928  int16x8_t a = vreinterpretq_s16_m128i(_a);
6929  int16x8_t b = vreinterpretq_s16_m128i(_b);
6930 #if defined(__aarch64__)
6931  return vreinterpretq_m128i_s16(
6932  vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6933 #else
6934  int16x8x2_t c = vuzpq_s16(a, b);
6935  return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6936 #endif
6937 }
6938 
6939 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6940 // using saturation, and pack the signed 16-bit results in dst.
6941 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6943 {
6944  int16x4_t a = vreinterpret_s16_m64(_a);
6945  int16x4_t b = vreinterpret_s16_m64(_b);
6946 #if defined(__aarch64__)
6947  return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6948 #else
6949  int16x4x2_t c = vuzp_s16(a, b);
6950  return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6951 #endif
6952 }
6953 
6954 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6955 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6956 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6957 // and pack the saturated results in dst.
6958 //
6959 // FOR j := 0 to 7
6960 // i := j*16
6961 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6962 // a[i+7:i]*b[i+7:i] )
6963 // ENDFOR
6964 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6965 {
6966 #if defined(__aarch64__)
6967  uint8x16_t a = vreinterpretq_u8_m128i(_a);
6968  int8x16_t b = vreinterpretq_s8_m128i(_b);
6969  int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6970  vmovl_s8(vget_low_s8(b)));
6971  int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6972  vmovl_s8(vget_high_s8(b)));
6973  return vreinterpretq_m128i_s16(
6974  vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6975 #else
6976  // This would be much simpler if x86 would choose to zero extend OR sign
6977  // extend, not both. This could probably be optimized better.
6978  uint16x8_t a = vreinterpretq_u16_m128i(_a);
6979  int16x8_t b = vreinterpretq_s16_m128i(_b);
6980 
6981  // Zero extend a
6982  int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6983  int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6984 
6985  // Sign extend by shifting left then shifting right.
6986  int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6987  int16x8_t b_odd = vshrq_n_s16(b, 8);
6988 
6989  // multiply
6990  int16x8_t prod1 = vmulq_s16(a_even, b_even);
6991  int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6992 
6993  // saturated add
6994  return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6995 #endif
6996 }
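
// Illustrative usage sketch (not from the upstream sources):
//   __m128i a = _mm_set1_epi8(2);         // lanes treated as unsigned bytes
//   __m128i b = _mm_set1_epi8(3);         // lanes treated as signed bytes
//   __m128i r = _mm_maddubs_epi16(a, b);  // each 16-bit lane = 2*3 + 2*3 = 12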
6997 
6998 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6999 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
7000 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
7001 // pack the saturated results in dst.
7002 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
7004 {
7005  uint16x4_t a = vreinterpret_u16_m64(_a);
7006  int16x4_t b = vreinterpret_s16_m64(_b);
7007 
7008  // Zero extend a
7009  int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
7010  int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
7011 
7012  // Sign extend by shifting left then shifting right.
7013  int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
7014  int16x4_t b_odd = vshr_n_s16(b, 8);
7015 
7016  // multiply
7017  int16x4_t prod1 = vmul_s16(a_even, b_even);
7018  int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7019 
7020  // saturated add
7021  return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
7022 }
7023 
7024 // Multiply packed signed 16-bit integers in a and b, producing intermediate
7025 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
7026 // the packed 16-bit integers in dst.
7027 //
7028 // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
7029 // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
7030 // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
7031 // ...
7032 // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
7034 {
7035  // Has issues due to saturation
7036  // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
7037 
7038  // Multiply
7039  int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
7040  vget_low_s16(vreinterpretq_s16_m128i(b)));
7041  int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
7042  vget_high_s16(vreinterpretq_s16_m128i(b)));
7043 
7044  // Rounding narrowing shift right
7045  // narrow = (int16_t)((mul + 16384) >> 15);
7046  int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7047  int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7048 
7049  // Join together
7050  return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
7051 }
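
// Illustrative usage sketch (not from the upstream sources), treating the
// 16-bit lanes as Q15 fixed-point values:
//   __m128i a = _mm_set1_epi16(0x4000);  // 0.5 in Q15
//   __m128i b = _mm_set1_epi16(0x2000);  // 0.25 in Q15
//   __m128i r = _mm_mulhrs_epi16(a, b);  // each lane = 0x1000 (0.125 in Q15)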
7052 
7053 // Multiply packed signed 16-bit integers in a and b, producing intermediate
7054 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
7055 // significant bits, round by adding 1, and store bits [16:1] to dst.
7056 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
7058 {
7059  int32x4_t mul_extend =
7060  vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
7061 
7062  // Rounding narrowing shift right
7063  return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
7064 }
7065 
7066 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7067 // corresponding 8-bit element of b, and store the results in dst.
7068 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
7069 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
7070 {
7071  int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
7072  uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
7073  uint8x16_t idx_masked =
7074  vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
7075 #if defined(__aarch64__)
7076  return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
7077 #elif defined(__GNUC__)
7078  int8x16_t ret;
7079  // %e and %f represent the even and odd D registers
7080  // respectively.
7081  __asm__ __volatile__(
7082  "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7083  "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7084  : [ret] "=&w"(ret)
7085  : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7086  return vreinterpretq_m128i_s8(ret);
7087 #else
7088  // use this line if testing on aarch64
7089  int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7090  return vreinterpretq_m128i_s8(
7091  vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7092  vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7093 #endif
7094 }
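
// Illustrative usage sketch (not from the upstream sources): reversing the
// byte order of an arbitrary __m128i a with a constant control mask:
//   __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                               7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i r = _mm_shuffle_epi8(a, rev);  // bytes of a in reverse order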
7095 
7096 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7097 // corresponding 8-bit element of b, and store the results in dst.
7098 //
7099 // FOR j := 0 to 7
7100 // i := j*8
7101 // IF b[i+7] == 1
7102 // dst[i+7:i] := 0
7103 // ELSE
7104 // index[2:0] := b[i+2:i]
7105 // dst[i+7:i] := a[index*8+7:index*8]
7106 // FI
7107 // ENDFOR
7108 //
7109 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
7111 {
7112  const int8x8_t controlMask =
7113  vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
7114  int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
7115  return vreinterpret_m64_s8(res);
7116 }
7117 
7118 // Negate packed 16-bit integers in a when the corresponding signed
7119 // 16-bit integer in b is negative, and store the results in dst.
7120 // Elements in dst are zeroed out when the corresponding element
7121 // in b is zero.
7122 //
7123 // for i in 0..7
7124 // if b[i] < 0
7125 // r[i] := -a[i]
7126 // else if b[i] == 0
7127 // r[i] := 0
7128 // else
7129 // r[i] := a[i]
7130 // fi
7131 // done
7133 {
7134  int16x8_t a = vreinterpretq_s16_m128i(_a);
7135  int16x8_t b = vreinterpretq_s16_m128i(_b);
7136 
7137  // signed shift right: faster than vclt
7138  // (b < 0) ? 0xFFFF : 0
7139  uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7140  // (b == 0) ? 0xFFFF : 0
7141 #if defined(__aarch64__)
7142  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7143 #else
7144  int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7145 #endif
7146 
7147  // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
7148  // 'a') based on ltMask
7149  int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7150  // res = masked & (~zeroMask)
7151  int16x8_t res = vbicq_s16(masked, zeroMask);
7152  return vreinterpretq_m128i_s16(res);
7153 }
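
// Illustrative usage sketch (not from the upstream sources); assumes the
// _mm_setr_epi16 helper defined earlier in this header:
//   __m128i a = _mm_set1_epi16(5);
//   __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 1, 0, -2, 4);
//   __m128i r = _mm_sign_epi16(a, b);  // r = {-5, 0, 5, -5, 5, 0, -5, 5}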
7154 
7155 // Negate packed 32-bit integers in a when the corresponding signed
7156 // 32-bit integer in b is negative, and store the results in dst.
7157 // Elements in dst are zeroed out when the corresponding element
7158 // in b is zero.
7159 //
7160 // for i in 0..3
7161 // if b[i] < 0
7162 // r[i] := -a[i]
7163 // else if b[i] == 0
7164 // r[i] := 0
7165 // else
7166 // r[i] := a[i]
7167 // fi
7168 // done
7170 {
7171  int32x4_t a = vreinterpretq_s32_m128i(_a);
7172  int32x4_t b = vreinterpretq_s32_m128i(_b);
7173 
7174  // signed shift right: faster than vclt
7175  // (b < 0) ? 0xFFFFFFFF : 0
7176  uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7177 
7178  // (b == 0) ? 0xFFFFFFFF : 0
7179 #if defined(__aarch64__)
7180  int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7181 #else
7182  int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7183 #endif
7184 
7185  // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
7186  // 'a') based on ltMask
7187  int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7188  // res = masked & (~zeroMask)
7189  int32x4_t res = vbicq_s32(masked, zeroMask);
7190  return vreinterpretq_m128i_s32(res);
7191 }
7192 
7193 // Negate packed 8-bit integers in a when the corresponding signed
7194 // 8-bit integer in b is negative, and store the results in dst.
7195 // Elements in dst are zeroed out when the corresponding element
7196 // in b is zero.
7197 //
7198 // for i in 0..15
7199 // if b[i] < 0
7200 // r[i] := -a[i]
7201 // else if b[i] == 0
7202 // r[i] := 0
7203 // else
7204 // r[i] := a[i]
7205 // fi
7206 // done
7208 {
7209  int8x16_t a = vreinterpretq_s8_m128i(_a);
7210  int8x16_t b = vreinterpretq_s8_m128i(_b);
7211 
7212  // signed shift right: faster than vclt
7213  // (b < 0) ? 0xFF : 0
7214  uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7215 
7216  // (b == 0) ? 0xFF : 0
7217 #if defined(__aarch64__)
7218  int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7219 #else
7220  int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7221 #endif
7222 
7223  // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
7224  // based on ltMask
7225  int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7226  // res = masked & (~zeroMask)
7227  int8x16_t res = vbicq_s8(masked, zeroMask);
7228 
7229  return vreinterpretq_m128i_s8(res);
7230 }
7231 
7232 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
7233 // integer in b is negative, and store the results in dst. Elements in dst are
7234 // zeroed out when the corresponding element in b is zero.
7235 //
7236 // FOR j := 0 to 3
7237 // i := j*16
7238 // IF b[i+15:i] < 0
7239 // dst[i+15:i] := -(a[i+15:i])
7240 // ELSE IF b[i+15:i] == 0
7241 // dst[i+15:i] := 0
7242 // ELSE
7243 // dst[i+15:i] := a[i+15:i]
7244 // FI
7245 // ENDFOR
7246 //
7247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
7249 {
7250  int16x4_t a = vreinterpret_s16_m64(_a);
7251  int16x4_t b = vreinterpret_s16_m64(_b);
7252 
7253  // signed shift right: faster than vclt
7254  // (b < 0) ? 0xFFFF : 0
7255  uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7256 
7257  // (b == 0) ? 0xFFFF : 0
7258 #if defined(__aarch64__)
7259  int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7260 #else
7261  int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7262 #endif
7263 
7264  // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
7265  // based on ltMask
7266  int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7267  // res = masked & (~zeroMask)
7268  int16x4_t res = vbic_s16(masked, zeroMask);
7269 
7270  return vreinterpret_m64_s16(res);
7271 }
7272 
7273 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
7274 // integer in b is negative, and store the results in dst. Elements in dst are
7275 // zeroed out when the corresponding element in b is zero.
7276 //
7277 // FOR j := 0 to 1
7278 // i := j*32
7279 // IF b[i+31:i] < 0
7280 // dst[i+31:i] := -(a[i+31:i])
7281 // ELSE IF b[i+31:i] == 0
7282 // dst[i+31:i] := 0
7283 // ELSE
7284 // dst[i+31:i] := a[i+31:i]
7285 // FI
7286 // ENDFOR
7287 //
7288 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
7290 {
7291  int32x2_t a = vreinterpret_s32_m64(_a);
7292  int32x2_t b = vreinterpret_s32_m64(_b);
7293 
7294  // signed shift right: faster than vclt
7295  // (b < 0) ? 0xFFFFFFFF : 0
7296  uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7297 
7298  // (b == 0) ? 0xFFFFFFFF : 0
7299 #if defined(__aarch64__)
7300  int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7301 #else
7302  int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7303 #endif
7304 
7305  // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
7306  // based on ltMask
7307  int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7308  // res = masked & (~zeroMask)
7309  int32x2_t res = vbic_s32(masked, zeroMask);
7310 
7311  return vreinterpret_m64_s32(res);
7312 }
7313 
7314 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
7315 // in b is negative, and store the results in dst. Elements in dst are zeroed out
7316 // when the corresponding element in b is zero.
7317 //
7318 // FOR j := 0 to 7
7319 // i := j*8
7320 // IF b[i+7:i] < 0
7321 // dst[i+7:i] := -(a[i+7:i])
7322 // ELSE IF b[i+7:i] == 0
7323 // dst[i+7:i] := 0
7324 // ELSE
7325 // dst[i+7:i] := a[i+7:i]
7326 // FI
7327 // ENDFOR
7328 //
7329 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
7331 {
7332  int8x8_t a = vreinterpret_s8_m64(_a);
7333  int8x8_t b = vreinterpret_s8_m64(_b);
7334 
7335  // signed shift right: faster than vclt
7336  // (b < 0) ? 0xFF : 0
7337  uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7338 
7339  // (b == 0) ? 0xFF : 0
7340 #if defined(__aarch64__)
7341  int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7342 #else
7343  int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7344 #endif
7345 
7346  // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
7347  // based on ltMask
7348  int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7349  // res = masked & (~zeroMask)
7350  int8x8_t res = vbic_s8(masked, zeroMask);
7351 
7352  return vreinterpret_m64_s8(res);
7353 }
7354 
7355 /* SSE4.1 */
7356 
7357 // Blend packed 16-bit integers from a and b using control mask imm8, and store
7358 // the results in dst.
7359 //
7360 // FOR j := 0 to 7
7361 // i := j*16
7362 // IF imm8[j]
7363 // dst[i+15:i] := b[i+15:i]
7364 // ELSE
7365 // dst[i+15:i] := a[i+15:i]
7366 // FI
7367 // ENDFOR
7368 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7369 // __constrange(0,255) int imm)
7370 #define _mm_blend_epi16(a, b, imm) \
7371  __extension__({ \
7372  const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7373  ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7374  ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7375  ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7376  ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7377  ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7378  ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7379  ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7380  uint16x8_t _mask_vec = vld1q_u16(_mask); \
7381  uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7382  uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7383  vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7384  })
7385 
7386 // Blend packed double-precision (64-bit) floating-point elements from a and b
7387 // using control mask imm8, and store the results in dst.
7388 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
7389 #define _mm_blend_pd(a, b, imm) \
7390  __extension__({ \
7391  const uint64_t _mask[2] = { \
7392  ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7393  ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7394  uint64x2_t _mask_vec = vld1q_u64(_mask); \
7395  uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7396  uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7397  vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7398  })
7399 
7400 // Blend packed single-precision (32-bit) floating-point elements from a and b
7401 // using mask, and store the results in dst.
7402 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7403 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7404 {
7405  const uint32_t ALIGN_STRUCT(16)
7406  data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7407  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7408  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7409  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7410  uint32x4_t mask = vld1q_u32(data);
7411  float32x4_t a = vreinterpretq_f32_m128(_a);
7412  float32x4_t b = vreinterpretq_f32_m128(_b);
7413  return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7414 }
7415 
7416 // Blend packed 8-bit integers from a and b using mask, and store the results in
7417 // dst.
7418 //
7419 // FOR j := 0 to 15
7420 // i := j*8
7421 // IF mask[i+7]
7422 // dst[i+7:i] := b[i+7:i]
7423 // ELSE
7424 // dst[i+7:i] := a[i+7:i]
7425 // FI
7426 // ENDFOR
7427 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7428 {
7429  // Use a signed shift right to create a mask with the sign bit
7430  uint8x16_t mask =
7431  vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7432  uint8x16_t a = vreinterpretq_u8_m128i(_a);
7433  uint8x16_t b = vreinterpretq_u8_m128i(_b);
7434  return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7435 }
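
// Illustrative usage sketch (not from the upstream sources): for any __m128i
// a and b, the blend is driven by the sign bit of each mask byte:
//   __m128i mask = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
//                                0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i r = _mm_blendv_epi8(a, b, mask);
//   // low 8 bytes of r come from b, high 8 bytes come from a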
7436 
7437 // Blend packed double-precision (64-bit) floating-point elements from a and b
7438 // using mask, and store the results in dst.
7439 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7441 {
7442  uint64x2_t mask =
7443  vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7444 #if defined(__aarch64__)
7445  float64x2_t a = vreinterpretq_f64_m128d(_a);
7446  float64x2_t b = vreinterpretq_f64_m128d(_b);
7447  return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7448 #else
7449  uint64x2_t a = vreinterpretq_u64_m128d(_a);
7450  uint64x2_t b = vreinterpretq_u64_m128d(_b);
7451  return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7452 #endif
7453 }
7454 
7455 // Blend packed single-precision (32-bit) floating-point elements from a and b
7456 // using mask, and store the results in dst.
7457 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7459 {
7460  // Use a signed shift right to create a mask with the sign bit
7461  uint32x4_t mask =
7462  vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7463  float32x4_t a = vreinterpretq_f32_m128(_a);
7464  float32x4_t b = vreinterpretq_f32_m128(_b);
7465  return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7466 }
7467 
7468 // Round the packed double-precision (64-bit) floating-point elements in a up
7469 // to an integer value, and store the results as packed double-precision
7470 // floating-point elements in dst.
7471 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7473 {
7474 #if defined(__aarch64__)
7475  return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7476 #else
7477  double *f = (double *) &a;
7478  return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7479 #endif
7480 }
7481 
7482 // Round the packed single-precision (32-bit) floating-point elements in a up to
7483 // an integer value, and store the results as packed single-precision
7484 // floating-point elements in dst.
7485 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7487 {
7488 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7489  return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7490 #else
7491  float *f = (float *) &a;
7492  return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7493 #endif
7494 }
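
// Illustrative usage sketch (not from the upstream sources):
//   __m128 v = _mm_setr_ps(1.1f, -1.1f, 2.5f, -2.5f);
//   __m128 r = _mm_ceil_ps(v);  // r = {2.0f, -1.0f, 3.0f, -2.0f}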
7495 
7496 // Round the lower double-precision (64-bit) floating-point element in b up to
7497 // an integer value, store the result as a double-precision floating-point
7498 // element in the lower element of dst, and copy the upper element from a to the
7499 // upper element of dst.
7500 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
7502 {
7503  return _mm_move_sd(a, _mm_ceil_pd(b));
7504 }
7505 
7506 // Round the lower single-precision (32-bit) floating-point element in b up to
7507 // an integer value, store the result as a single-precision floating-point
7508 // element in the lower element of dst, and copy the upper 3 packed elements
7509 // from a to the upper elements of dst.
7510 //
7511 // dst[31:0] := CEIL(b[31:0])
7512 // dst[127:32] := a[127:32]
7513 //
7514 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
7516 {
7517  return _mm_move_ss(a, _mm_ceil_ps(b));
7518 }
7519 
7520 // Compare packed 64-bit integers in a and b for equality, and store the results
7521 // in dst
7522 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7523 {
7524 #if defined(__aarch64__)
7525  return vreinterpretq_m128i_u64(
7526  vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7527 #else
7528  // ARMv7 lacks vceqq_u64
7529  // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7530  uint32x4_t cmp =
7531  vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7532  uint32x4_t swapped = vrev64q_u32(cmp);
7533  return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7534 #endif
7535 }
7536 
7537 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
7538 // 32-bit integers.
7540 {
7541  return vreinterpretq_m128i_s32(
7542  vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7543 }
7544 
7545 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
7546 // 64-bit integers.
7548 {
7549  int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7550  int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7551  int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7552  return vreinterpretq_m128i_s64(s64x2);
7553 }
7554 
7555 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
7556 // 64-bit integers.
7558 {
7559  return vreinterpretq_m128i_s64(
7560  vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7561 }
7562 
7563 // Converts the eight signed 8-bit integers in the lower 64 bits to eight
7564 // signed 16-bit integers.
7566 {
7567  int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7568  int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7569  return vreinterpretq_m128i_s16(s16x8);
7570 }
7571 
7572 // Converts the four signed 8-bit integers in the lower 32 bits to four
7573 // signed 32-bit integers.
7575 {
7576  int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7577  int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7578  int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7579  return vreinterpretq_m128i_s32(s32x4);
7580 }
7581 
7582 // Converts the two signed 8-bit integers in the lower 16 bits to two
7583 // signed 64-bit integers.
7585 {
7586  int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7587  int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7588  int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7589  int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7590  return vreinterpretq_m128i_s64(s64x2);
7591 }
7592 
7593 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
7594 // unsigned 32-bit integers.
7596 {
7597  return vreinterpretq_m128i_u32(
7598  vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7599 }
7600 
7601 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
7602 // unsigned 64-bit integers.
7604 {
7605  uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7606  uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7607  uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7608  return vreinterpretq_m128i_u64(u64x2);
7609 }
7610 
7611 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
7612 // unsigned 64-bit integers.
7614 {
7615  return vreinterpretq_m128i_u64(
7616  vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7617 }
7618 
7619 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7620 // and store the results in dst.
7621 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7623 {
7624  uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7625  uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7626  return vreinterpretq_m128i_u16(u16x8);
7627 }
7628 
7629 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7630 // unsigned 32-bit integers.
7631 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7633 {
7634  uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7635  uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7636  uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7637  return vreinterpretq_m128i_u32(u32x4);
7638 }
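
// Illustrative usage sketch (not from the upstream sources): zero extension
// keeps values such as 0xFF positive, unlike the sign-extending epi8 forms:
//   __m128i v = _mm_setr_epi8((char) 0xFF, 1, 2, 3, 0, 0, 0, 0,
//                             0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i r = _mm_cvtepu8_epi32(v);  // r = {255, 1, 2, 3} as 32-bit lanes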
7639 
7640 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
7641 // unsigned 64-bit integers.
7643 {
7644  uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7645  uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7646  uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7647  uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7648  return vreinterpretq_m128i_u64(u64x2);
7649 }
7650 
7651 // Conditionally multiply the packed double-precision (64-bit) floating-point
7652 // elements in a and b using the high 4 bits in imm8, sum the two products, and
7653 // conditionally store the sum in dst using the low 4 bits of imm8.
7654 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
7655 FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
7656 {
7657  // Generate mask value from constant immediate bit value
7658  const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7659  const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7660 #if !SSE2NEON_PRECISE_DP
7661  const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7662  const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7663 #endif
7664  // Conditional multiplication
7665 #if !SSE2NEON_PRECISE_DP
7666  __m128d mul = _mm_mul_pd(a, b);
7667  const __m128d mulMask =
7668  _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7669  __m128d tmp = _mm_and_pd(mul, mulMask);
7670 #else
7671 #if defined(__aarch64__)
7672  double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7673  vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7674  : 0;
7675  double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7676  vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7677  : 0;
7678 #else
7679  double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7680  double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7681 #endif
7682  __m128d tmp = _mm_set_pd(d1, d0);
7683 #endif
7684  // Sum the products
7685 #if defined(__aarch64__)
7686  double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7687 #else
7688  double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
7689 #endif
7690  // Conditionally store the sum
7691  const __m128d sumMask =
7692  _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7693  __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7694  return res;
7695 }
7696 
7697 // Conditionally multiply the packed single-precision (32-bit) floating-point
7698 // elements in a and b using the high 4 bits in imm8, sum the four products,
7699 // and conditionally store the sum in dst using the low 4 bits of imm.
7700 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
7701 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7702 {
7703 #if defined(__aarch64__)
7704  /* shortcuts */
7705  if (imm == 0xFF) {
7706  return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7707  }
7708  if (imm == 0x7F) {
7709  float32x4_t m = _mm_mul_ps(a, b);
7710  m[3] = 0;
7711  return _mm_set1_ps(vaddvq_f32(m));
7712  }
7713 #endif
7714 
7715  float s = 0, c = 0;
7716  float32x4_t f32a = vreinterpretq_f32_m128(a);
7717  float32x4_t f32b = vreinterpretq_f32_m128(b);
7718 
7719  /* To improve the accuracy of floating-point summation, Kahan algorithm
7720  * is used for each operation.
7721  */
7722  if (imm & (1 << 4))
7723  _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7724  if (imm & (1 << 5))
7725  _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7726  if (imm & (1 << 6))
7727  _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7728  if (imm & (1 << 7))
7729  _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7730  s += c;
7731 
7732  float32x4_t res = {
7733  (imm & 0x1) ? s : 0,
7734  (imm & 0x2) ? s : 0,
7735  (imm & 0x4) ? s : 0,
7736  (imm & 0x8) ? s : 0,
7737  };
7738  return vreinterpretq_m128_f32(res);
7739 }
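
// Illustrative usage sketch (not from the upstream sources): a full dot
// product broadcast to all four lanes uses imm == 0xFF:
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(1.0f, 1.0f, 1.0f, 1.0f);
//   __m128 r = _mm_dp_ps(a, b, 0xFF);  // every lane of r is 10.0f
//   __m128 s = _mm_dp_ps(a, b, 0x71);  // s = {6.0f, 0, 0, 0} (lanes 0..2 only)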
7740 
7741 // Extracts the selected signed or unsigned 32-bit integer from a and zero
7742 // extends.
7743 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7744 #define _mm_extract_epi32(a, imm) \
7745  vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7746 
7747 // Extracts the selected signed or unsigned 64-bit integer from a and zero
7748 // extends.
7749 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7750 #define _mm_extract_epi64(a, imm) \
7751  vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7752 
7753 // Extracts the selected signed or unsigned 8-bit integer from a and zero
7754 // extends.
7755 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7756 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
7757 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7758 
7759 // Extracts the selected single-precision (32-bit) floating-point element from a.
7760 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7761 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7762 
7763 // Round the packed double-precision (64-bit) floating-point elements in a down
7764 // to an integer value, and store the results as packed double-precision
7765 // floating-point elements in dst.
7766 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
7768 {
7769 #if defined(__aarch64__)
7770  return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7771 #else
7772  double *f = (double *) &a;
7773  return _mm_set_pd(floor(f[1]), floor(f[0]));
7774 #endif
7775 }
7776 
7777 // Round the packed single-precision (32-bit) floating-point elements in a down
7778 // to an integer value, and store the results as packed single-precision
7779 // floating-point elements in dst.
7780 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
7782 {
7783 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7784  return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7785 #else
7786  float *f = (float *) &a;
7787  return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7788 #endif
7789 }
7790 
7791 // Round the lower double-precision (64-bit) floating-point element in b down to
7792 // an integer value, store the result as a double-precision floating-point
7793 // element in the lower element of dst, and copy the upper element from a to the
7794 // upper element of dst.
7795 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7797 {
7798  return _mm_move_sd(a, _mm_floor_pd(b));
7799 }
7800 
7801 // Round the lower single-precision (32-bit) floating-point element in b down to
7802 // an integer value, store the result as a single-precision floating-point
7803 // element in the lower element of dst, and copy the upper 3 packed elements
7804 // from a to the upper elements of dst.
7805 //
7806 // dst[31:0] := FLOOR(b[31:0])
7807 // dst[127:32] := a[127:32]
7808 //
7809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7811 {
7812  return _mm_move_ss(a, _mm_floor_ps(b));
7813 }
7814 
7815 // Inserts the least significant 32 bits of b into the selected 32-bit integer
7816 // of a.
7817 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7818 // __constrange(0,4) int imm)
7819 #define _mm_insert_epi32(a, b, imm) \
7820  __extension__({ \
7821  vreinterpretq_m128i_s32( \
7822  vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7823  })
7824 
7825 // Inserts the least significant 64 bits of b into the selected 64-bit integer
7826 // of a.
7827 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7828 // __constrange(0,2) int imm)
7829 #define _mm_insert_epi64(a, b, imm) \
7830  __extension__({ \
7831  vreinterpretq_m128i_s64( \
7832  vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7833  })
7834 
7835 // Inserts the least significant 8 bits of b into the selected 8-bit integer
7836 // of a.
7837 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7838 // __constrange(0,16) int imm)
7839 #define _mm_insert_epi8(a, b, imm) \
7840  __extension__({ \
7841  vreinterpretq_m128i_s8( \
7842  vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7843  })
7844 
7845 // Copy a to tmp, then insert a single-precision (32-bit) floating-point
7846 // element from b into tmp using the control in imm8. Store tmp to dst using
7847 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7848 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
7849 #define _mm_insert_ps(a, b, imm8) \
7850  __extension__({ \
7851  float32x4_t tmp1 = \
7852  vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7853  vreinterpretq_f32_m128(a), 0); \
7854  float32x4_t tmp2 = \
7855  vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7856  ((imm8 >> 4) & 0x3)); \
7857  const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7858  ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7859  ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7860  ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7861  uint32x4_t mask = vld1q_u32(data); \
7862  float32x4_t all_zeros = vdupq_n_f32(0); \
7863  \
7864  vreinterpretq_m128_f32( \
7865  vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
7866  })
7867 
7868 // epi versions of min/max
7869 // Computes the pairwise maximums of the four signed 32-bit integer values of a
7870 // and b.
7871 //
7872 // A 128-bit parameter that can be defined with the following equations:
7873 // r0 := (a0 > b0) ? a0 : b0
7874 // r1 := (a1 > b1) ? a1 : b1
7875 // r2 := (a2 > b2) ? a2 : b2
7876 // r3 := (a3 > b3) ? a3 : b3
7877 //
7878 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7879 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7880 {
7881  return vreinterpretq_m128i_s32(
7882  vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7883 }
7884 
7885 // Compare packed signed 8-bit integers in a and b, and store packed maximum
7886 // values in dst.
7887 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7888 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7889 {
7890  return vreinterpretq_m128i_s8(
7891  vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7892 }
7893 
7894 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7895 // values in dst.
7896 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7897 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7898 {
7899  return vreinterpretq_m128i_u16(
7900  vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7901 }
7902 
7903 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7904 // values in dst.
7905 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7906 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7907 {
7908  return vreinterpretq_m128i_u32(
7909  vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7910 }
7911 
7912 // Computes the pairwise minima of the four signed 32-bit integer values of a
7913 // and b.
7914 //
7915 // A 128-bit parameter that can be defined with the following equations:
7916 // r0 := (a0 < b0) ? a0 : b0
7917 // r1 := (a1 < b1) ? a1 : b1
7918 // r2 := (a2 < b2) ? a2 : b2
7919 // r3 := (a3 < b3) ? a3 : b3
7920 //
7921 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7922 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7923 {
7924  return vreinterpretq_m128i_s32(
7925  vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7926 }
7927 
7928 // Compare packed signed 8-bit integers in a and b, and store packed minimum
7929 // values in dst.
7930 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7931 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7932 {
7933  return vreinterpretq_m128i_s8(
7934  vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7935 }
7936 
7937 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7938 // values in dst.
7939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7940 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7941 {
7942  return vreinterpretq_m128i_u16(
7943  vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7944 }
7945 
7946 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7947 // values in dst.
7948 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
7949 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7950 {
7951  return vreinterpretq_m128i_u32(
7952  vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7953 }
7954 
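// Usage sketch (illustrative, not from the upstream source): the SSE4.1
// element-wise min/max intrinsics above all follow the same shape; on NEON
// they map directly onto vmaxq_*/vminq_* of the matching element type, so the
// signed and unsigned variants differ only in their reinterpret casts.
//
//   __m128i a  = _mm_set_epi32(10, -3, 7,  0);
//   __m128i b  = _mm_set_epi32( 2,  5, 7, -1);
//   __m128i hi = _mm_max_epi32(a, b); // 10,  5, 7,  0 (same argument order)
//   __m128i lo = _mm_min_epi32(a, b); //  2, -3, 7, -1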
7955 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7956 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
7957 //
7958 // index[2:0] := 0
7959 // min[15:0] := a[15:0]
7960 // FOR j := 0 to 7
7961 // i := j*16
7962 // IF a[i+15:i] < min[15:0]
7963 // index[2:0] := j
7964 // min[15:0] := a[i+15:i]
7965 // FI
7966 // ENDFOR
7967 // dst[15:0] := min[15:0]
7968 // dst[18:16] := index[2:0]
7969 // dst[127:19] := 0
7970 //
7971 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
7972 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7973 {
7974  __m128i dst;
7975  uint16_t min, idx = 0;
7976  // Find the minimum value
7977 #if defined(__aarch64__)
7978  min = vminvq_u16(vreinterpretq_u16_m128i(a));
7979 #else
7980  __m64 tmp;
7981  tmp = vreinterpret_m64_u16(
7982  vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7983  vget_high_u16(vreinterpretq_u16_m128i(a))));
7984  tmp = vreinterpret_m64_u16(
7985  vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7986  tmp = vreinterpret_m64_u16(
7987  vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7988  min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7989 #endif
7990  // Get the index of the minimum value
7991  int i;
7992  for (i = 0; i < 8; i++) {
7993  if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7994  idx = (uint16_t) i;
7995  break;
7996  }
7997  a = _mm_srli_si128(a, 2);
7998  }
7999  // Generate result
8000  dst = _mm_setzero_si128();
8001  dst = vreinterpretq_m128i_u16(
8002  vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
8003  dst = vreinterpretq_m128i_u16(
8004  vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
8005  return dst;
8006 }
8007 
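// Usage sketch (illustrative, not from the upstream source): the minimum
// lands in lane 0 and the index of its first occurrence in lane 1.
//
//   __m128i v = _mm_setr_epi16(9, 4, 7, 4, 12, 30, 6, 8);
//   __m128i r = _mm_minpos_epu16(v);
//   int min = _mm_extract_epi16(r, 0); // 4
//   int idx = _mm_extract_epi16(r, 1); // 1 (first of the two 4s)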
8008 // Compute the sum of absolute differences (SADs) of quadruplets of unsigned
8009 // 8-bit integers in a compared to those in b, and store the 16-bit results in
8010 // dst. Eight SADs are performed using one quadruplet from b and eight
8011 // quadruplets from a. One quadruplet is selected from b starting at the
8012 // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
8013 // integers selected from a starting at the offset specified in imm8.
8014 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
8015 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
8016 {
8017  uint8x16_t _a, _b;
8018 
8019  switch (imm & 0x4) {
8020  case 0:
8021  // do nothing
8022  _a = vreinterpretq_u8_m128i(a);
8023  break;
8024  case 4:
8025  _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
8026  vreinterpretq_u32_m128i(a), 1));
8027  break;
8028  default:
8029 #if defined(__GNUC__) || defined(__clang__)
8030  __builtin_unreachable();
8031 #endif
8032  break;
8033  }
8034 
8035  switch (imm & 0x3) {
8036  case 0:
8037  _b = vreinterpretq_u8_u32(
8038  vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
8039  break;
8040  case 1:
8041  _b = vreinterpretq_u8_u32(
8042  vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
8043  break;
8044  case 2:
8045  _b = vreinterpretq_u8_u32(
8046  vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
8047  break;
8048  case 3:
8049  _b = vreinterpretq_u8_u32(
8050  vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
8051  break;
8052  default:
8053 #if defined(__GNUC__) || defined(__clang__)
8054  __builtin_unreachable();
8055 #endif
8056  break;
8057  }
8058 
8059  int16x8_t c04, c15, c26, c37;
8060  uint8x8_t low_b = vget_low_u8(_b);
8061  c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8062  _a = vextq_u8(_a, _a, 1);
8063  c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8064  _a = vextq_u8(_a, _a, 1);
8065  c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8066  _a = vextq_u8(_a, _a, 1);
8067  c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8068 #if defined(__aarch64__)
8069  // |0|4|2|6|
8070  c04 = vpaddq_s16(c04, c26);
8071  // |1|5|3|7|
8072  c15 = vpaddq_s16(c15, c37);
8073 
8074  int32x4_t trn1_c =
8075  vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8076  int32x4_t trn2_c =
8077  vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8078  return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
8079  vreinterpretq_s16_s32(trn2_c)));
8080 #else
8081  int16x4_t c01, c23, c45, c67;
8082  c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8083  c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8084  c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8085  c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8086 
8087  return vreinterpretq_m128i_s16(
8088  vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
8089 #endif
8090 }
8091 
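// Usage sketch (illustrative, not from the upstream source; `src` and `ref`
// are assumed to point at 16 readable bytes each): _mm_mpsadbw_epu8 is the
// usual building block for 4-byte block matching -- imm bits [1:0] choose the
// reference quadruplet of b and bit [2] the starting offset in a, and each of
// the eight u16 results is the SAD against one sliding 4-byte window of a.
//
//   __m128i a   = _mm_loadu_si128((const __m128i *) src);
//   __m128i b   = _mm_loadu_si128((const __m128i *) ref);
//   __m128i sad = _mm_mpsadbw_epu8(a, b, 0); // windows a[0..3] ... a[7..10]
//                                            // vs. reference bytes b[0..3]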
8092 // Multiply the low signed 32-bit integers from each packed 64-bit element in
8093 // a and b, and store the signed 64-bit results in dst.
8094 //
8095 // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
8096 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
8097 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
8098 {
8099  // vmull_s32 upcasts instead of masking, so we downcast.
8100  int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
8101  int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
8102  return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
8103 }
8104 
8105 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
8106 // unsigned 32-bit integers from b.
8107 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
8108 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
8109 {
8110  return vreinterpretq_m128i_s32(
8111  vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8112 }
8113 
8114 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
8115 // integers and saturates.
8116 //
8117 // r0 := UnsignedSaturate(a0)
8118 // r1 := UnsignedSaturate(a1)
8119 // r2 := UnsignedSaturate(a2)
8120 // r3 := UnsignedSaturate(a3)
8121 // r4 := UnsignedSaturate(b0)
8122 // r5 := UnsignedSaturate(b1)
8123 // r6 := UnsignedSaturate(b2)
8124 // r7 := UnsignedSaturate(b3)
8125 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
8126 {
8127  return vreinterpretq_m128i_u16(
8128  vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
8129  vqmovun_s32(vreinterpretq_s32_m128i(b))));
8130 }
8131 
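// Usage sketch (illustrative, not from the upstream source): unsigned
// saturation clamps each signed 32-bit input to [0, 65535] before narrowing.
//
//   __m128i a = _mm_set_epi32(70000, -5, 65535, 42);
//   __m128i p = _mm_packus_epi32(a, _mm_setzero_si128());
//   // u16 lanes 0..3 of p: 42, 65535, 0, 65535 (upper four lanes are 0)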
8132 // Round the packed double-precision (64-bit) floating-point elements in a using
8133 // the rounding parameter, and store the results as packed double-precision
8134 // floating-point elements in dst.
8135 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
8136 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
8137 {
8138 #if defined(__aarch64__)
8139  switch (rounding) {
8140  case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8141  return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8142  case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8143  return _mm_floor_pd(a);
8144  case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8145  return _mm_ceil_pd(a);
8146  case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8147  return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8148  default: //_MM_FROUND_CUR_DIRECTION
8149  return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8150  }
8151 #else
8152  double *v_double = (double *) &a;
8153 
8154  if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8155  (rounding == _MM_FROUND_CUR_DIRECTION &&
8156  _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8157  double res[2], tmp;
8158  for (int i = 0; i < 2; i++) {
8159  tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8160  double roundDown = floor(tmp); // Round down value
8161  double roundUp = ceil(tmp); // Round up value
8162  double diffDown = tmp - roundDown;
8163  double diffUp = roundUp - tmp;
8164  if (diffDown < diffUp) {
8165  /* If it's closer to the round down value, then use it */
8166  res[i] = roundDown;
8167  } else if (diffDown > diffUp) {
8168  /* If it's closer to the round up value, then use it */
8169  res[i] = roundUp;
8170  } else {
8171  /* If it's equidistant between round up and round down value,
8172  * pick the one which is an even number */
8173  double half = roundDown / 2;
8174  if (half != floor(half)) {
8175  /* If the round down value is odd, return the round up value
8176  */
8177  res[i] = roundUp;
8178  } else {
8179  /* If the round up value is odd, return the round down value
8180  */
8181  res[i] = roundDown;
8182  }
8183  }
8184  res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8185  }
8186  return _mm_set_pd(res[1], res[0]);
8187  } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8188  (rounding == _MM_FROUND_CUR_DIRECTION &&
8189  _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8190  return _mm_floor_pd(a);
8191  } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8192  (rounding == _MM_FROUND_CUR_DIRECTION &&
8193  _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8194  return _mm_ceil_pd(a);
8195  }
8196  return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8197  v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
8198 #endif
8199 }
8200 
8201 // Round the packed single-precision (32-bit) floating-point elements in a using
8202 // the rounding parameter, and store the results as packed single-precision
8203 // floating-point elements in dst.
8204 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
8205 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
8206 {
8207 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
8208  switch (rounding) {
8209  case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8210  return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
8211  case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8212  return _mm_floor_ps(a);
8213  case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8214  return _mm_ceil_ps(a);
8215  case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8216  return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
8217  default: //_MM_FROUND_CUR_DIRECTION
8218  return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
8219  }
8220 #else
8221  float *v_float = (float *) &a;
8222 
8223  if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8224  (rounding == _MM_FROUND_CUR_DIRECTION &&
8225  _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8226  uint32x4_t signmask = vdupq_n_u32(0x80000000);
8227  float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
8228  vdupq_n_f32(0.5f)); /* +/- 0.5 */
8229  int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8230  vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
8231  int32x4_t r_trunc = vcvtq_s32_f32(
8232  vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
8233  int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8234  vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
8235  int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8236  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
8237  float32x4_t delta = vsubq_f32(
8238  vreinterpretq_f32_m128(a),
8239  vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
8240  uint32x4_t is_delta_half =
8241  vceqq_f32(delta, half); /* delta == +/- 0.5 */
8242  return vreinterpretq_m128_f32(
8243  vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8244  } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8245  (rounding == _MM_FROUND_CUR_DIRECTION &&
8246  _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8247  return _mm_floor_ps(a);
8248  } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8249  (rounding == _MM_FROUND_CUR_DIRECTION &&
8250  _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8251  return _mm_ceil_ps(a);
8252  }
8253  return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8254  v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8255  v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8256  v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
8257 #endif
8258 }
8259 
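// Usage sketch (illustrative, not from the upstream source): `rounding` is
// normally one of the _MM_FROUND_* combinations handled above, e.g.
// round-to-nearest-even versus truncation:
//
//   __m128 v = _mm_set_ps(-1.5f, 2.5f, 1.5f, 0.5f); // lanes 0..3 = 0.5,1.5,2.5,-1.5
//   __m128 n = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   __m128 t = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
//   // n lanes 0..3 = 0, 2, 2, -2   (ties round to even)
//   // t lanes 0..3 = 0, 1, 2, -1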
8260 // Round the lower double-precision (64-bit) floating-point element in b using
8261 // the rounding parameter, store the result as a double-precision floating-point
8262 // element in the lower element of dst, and copy the upper element from a to the
8263 // upper element of dst.
8264 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
8265 FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
8266 {
8267  return _mm_move_sd(a, _mm_round_pd(b, rounding));
8268 }
8269 
8270 // Round the lower single-precision (32-bit) floating-point element in b using
8271 // the rounding parameter, store the result as a single-precision floating-point
8272 // element in the lower element of dst, and copy the upper 3 packed elements
8273 // from a to the upper elements of dst. Rounding is done according to the
8274 // rounding[3:0] parameter, which can be one of:
8275 // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest,
8276 //                                                   and suppress exceptions
8277 // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down,
8278 //                                                   and suppress exceptions
8279 // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up,
8280 //                                                   and suppress exceptions
8281 // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate,
8282 //                                                   and suppress exceptions
8283 // _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8284 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
8285 FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
8286 {
8287  return _mm_move_ss(a, _mm_round_ps(b, rounding));
8288 }
8289 
8290 // Load 128-bits of integer data from memory into dst using a non-temporal
8291 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
8292 // general-protection exception may be generated.
8293 //
8294 // dst[127:0] := MEM[mem_addr+127:mem_addr]
8295 //
8296 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
8297 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
8298 {
8299 #if __has_builtin(__builtin_nontemporal_store)
8300  return __builtin_nontemporal_load(p);
8301 #else
8302  return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
8303 #endif
8304 }
8305 
8306 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
8307 // all 1's, and return 1 if the result is zero, otherwise return 0.
8308 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
8309 FORCE_INLINE int _mm_test_all_ones(__m128i a)
8310 {
8311  return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
8312  ~(uint64_t) 0;
8313 }
8314 
8315 // Compute the bitwise AND of 128 bits (representing integer data) in a and
8316 // mask, and return 1 if the result is zero, otherwise return 0.
8317 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
8318 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
8319 {
8320  int64x2_t a_and_mask =
8321  vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
8322  return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
8323 }
8324 
8325 // Compute the bitwise AND of 128 bits (representing integer data) in a and
8326 // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
8327 // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
8328 // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8329 // otherwise return 0.
8330 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
8331 FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
8332 {
8333  uint64x2_t zf =
8334  vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
8335  uint64x2_t cf =
8336  vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
8337  uint64x2_t result = vandq_u64(zf, cf);
8338  return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
8339 }
8340 
8341 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8342 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8343 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8344 // otherwise set CF to 0. Return the CF value.
8345 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
8346 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
8347 {
8348  int64x2_t s64 =
8349  vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
8350  vreinterpretq_s64_m128i(b));
8351  return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8352 }
8353 
8354 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8355 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8356 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8357 // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8358 // otherwise return 0.
8359 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
8360 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
8361 
8362 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8363 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8364 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8365 // otherwise set CF to 0. Return the ZF value.
8366 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
8367 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
8368 {
8369  int64x2_t s64 =
8370  vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
8371  return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8372 }
8373 
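// Usage sketch (illustrative, not from the upstream source): the ptest-style
// helpers answer "are any / all of the masked bits set?" without a separate
// compare and movemask step.
//
//   __m128i flags = _mm_set_epi32(0, 0, 0, 0x5);
//   __m128i mask  = _mm_set_epi32(0, 0, 0, 0x4);
//   if (!_mm_testz_si128(flags, mask)) {
//       /* at least one of the bits selected by mask is set in flags */
//   }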
8374 /* SSE4.2 */
8375 
8376 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8377 // in b for greater than.
8378 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8379 {
8380 #if defined(__aarch64__)
8381  return vreinterpretq_m128i_u64(
8382  vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8383 #else
8384  return vreinterpretq_m128i_s64(vshrq_n_s64(
8385  vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8386  63));
8387 #endif
8388 }
8389 
8390 // Starting with the initial value in crc, accumulates a CRC32 value for
8391 // unsigned 16-bit integer v.
8392 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
8393 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8394 {
8395 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8396  __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8397  : [c] "+r"(crc)
8398  : [v] "r"(v));
8399 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8400  crc = __crc32ch(crc, v);
8401 #else
8402  crc = _mm_crc32_u8(crc, v & 0xff);
8403  crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8404 #endif
8405  return crc;
8406 }
8407 
8408 // Starting with the initial value in crc, accumulates a CRC32 value for
8409 // unsigned 32-bit integer v.
8410 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
8411 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8412 {
8413 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8414  __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8415  : [c] "+r"(crc)
8416  : [v] "r"(v));
8417 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8418  crc = __crc32cw(crc, v);
8419 #else
8420  crc = _mm_crc32_u16(crc, v & 0xffff);
8421  crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8422 #endif
8423  return crc;
8424 }
8425 
8426 // Starting with the initial value in crc, accumulates a CRC32 value for
8427 // unsigned 64-bit integer v.
8428 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
8429 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8430 {
8431 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8432  __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8433  : [c] "+r"(crc)
8434  : [v] "r"(v));
8435 #else
8436  crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
8437  crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8438 #endif
8439  return crc;
8440 }
8441 
8442 // Starting with the initial value in crc, accumulates a CRC32 value for
8443 // unsigned 8-bit integer v.
8444 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
8445 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8446 {
8447 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8448  __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8449  : [c] "+r"(crc)
8450  : [v] "r"(v));
8451 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8452  crc = __crc32cb(crc, v);
8453 #else
8454  crc ^= v;
8455  for (int bit = 0; bit < 8; bit++) {
8456  if (crc & 1)
8457  crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
8458  else
8459  crc = (crc >> 1);
8460  }
8461 #endif
8462  return crc;
8463 }
8464 
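// Usage sketch (illustrative, not from the upstream source): the CRC32C
// intrinsics are chained over a buffer; one common convention seeds with all
// ones and finishes with a bitwise NOT. A hypothetical crc32c() helper:
//
//   static uint32_t crc32c(const uint8_t *buf, size_t len)
//   {
//       uint32_t crc = 0xFFFFFFFFu;
//       for (size_t i = 0; i < len; i++)
//           crc = _mm_crc32_u8(crc, buf[i]);
//       return ~crc;
//   }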
8465 /* AES */
8466 
8467 #if !defined(__ARM_FEATURE_CRYPTO)
8468 /* clang-format off */
8469 #define SSE2NEON_AES_DATA(w) \
8470  { \
8471  w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8472  w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8473  w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8474  w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8475  w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8476  w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8477  w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8478  w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8479  w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8480  w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8481  w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8482  w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8483  w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8484  w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8485  w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8486  w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8487  w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8488  w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8489  w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8490  w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8491  w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8492  w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8493  w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8494  w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8495  w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8496  w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8497  w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8498  w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8499  w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8500  w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8501  w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8502  w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8503  w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8504  w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8505  w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8506  w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8507  w(0xb0), w(0x54), w(0xbb), w(0x16) \
8508  }
8509 /* clang-format on */
8510 
8511 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8512 #define SSE2NEON_AES_H0(x) (x)
8513 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
8514 #undef SSE2NEON_AES_H0
8515 
8516 // In the absence of crypto extensions, implement aesenc using regular neon
8517 // intrinsics instead. See:
8518 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8519 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
8520 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
8521 // for more information. Reproduced with permission of the author.
8522 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
8523 {
8524 #if defined(__aarch64__)
8525  static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8526  0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8527  0xc, 0x1, 0x6, 0xb};
8528  static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8529  0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8530 
8531  uint8x16_t v;
8532  uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
8533 
8534  // shift rows
8535  w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8536 
8537  // sub bytes
8538  v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
8539  v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
8540  v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
8541  v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
8542 
8543  // mix columns
8544  w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8545  w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8546  w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8547 
8548  // add round key
8549  return vreinterpretq_m128i_u8(w) ^ RoundKey;
8550 
8551 #else /* ARMv7-A NEON implementation */
8552 #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8553  (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8554  ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8555 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8556 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8557 #define SSE2NEON_AES_U0(p) \
8558  SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8559 #define SSE2NEON_AES_U1(p) \
8560  SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8561 #define SSE2NEON_AES_U2(p) \
8562  SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8563 #define SSE2NEON_AES_U3(p) \
8564  SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8565  static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8566  SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
8567  SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
8568  SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
8569  SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
8570  };
8571 #undef SSE2NEON_AES_B2W
8572 #undef SSE2NEON_AES_F2
8573 #undef SSE2NEON_AES_F3
8574 #undef SSE2NEON_AES_U0
8575 #undef SSE2NEON_AES_U1
8576 #undef SSE2NEON_AES_U2
8577 #undef SSE2NEON_AES_U3
8578 
8579  uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
8580  uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
8581  uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
8582  uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
8583 
8584  __m128i out = _mm_set_epi32(
8585  (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8586  aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8587  (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8588  aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8589  (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8590  aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8591  (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8592  aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8593 
8594  return _mm_xor_si128(out, RoundKey);
8595 #endif
8596 }
8597 
8598 // Perform the last round of an AES encryption flow on data (state) in a using
8599 // the round key in RoundKey, and store the result in dst.
8600 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8601 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8602 {
8603  /* FIXME: optimized for NEON */
8604  uint8_t v[4][4] = {
8621  };
8622  for (int i = 0; i < 16; i++)
8623  vreinterpretq_nth_u8_m128i(a, i) =
8624  v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8625  return a;
8626 }
8627 
8628 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8629 // This instruction generates a round key for AES encryption. See
8630 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8631 // for details.
8632 //
8633 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
8634 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8635 {
8636  uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8637  uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8638  for (int i = 0; i < 4; ++i) {
8639  ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8640  ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8641  }
8642  return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8643  ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8644 }
8645 #undef SSE2NEON_AES_DATA
8646 
8647 #else /* __ARM_FEATURE_CRYPTO */
8648 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8649 // AESMC and then manually applying the real key as an xor operation. This
8650 // unfortunately means an additional xor op; the compiler should be able to
8651 // optimize this away for repeated calls however. See
8652 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8653 // for more details.
8654 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8655 {
8656  return vreinterpretq_m128i_u8(
8657  vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8658  vreinterpretq_u8_m128i(b));
8659 }
8660 
8661 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8662 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8663 {
8664  return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8665  vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8666  RoundKey);
8667 }
8668 
8669 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8670 {
8671  // AESE does ShiftRows and SubBytes on A
8672  uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
8673 
8674  uint8x16_t dest = {
8675  // Undo ShiftRows step from AESE and extract X1 and X3
8676  u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
8677  u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
8678  u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
8679  u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
8680  };
8681  uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8682  return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
8683 }
8684 #endif
8685 
8686 /* Others */
8687 
8688 // Perform a carry-less multiplication of two 64-bit integers, selected from a
8689 // and b according to imm8, and store the results in dst.
8690 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
8691 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8692 {
8693  uint64x2_t a = vreinterpretq_u64_m128i(_a);
8694  uint64x2_t b = vreinterpretq_u64_m128i(_b);
8695  switch (imm & 0x11) {
8696  case 0x00:
8697  return vreinterpretq_m128i_u64(
8698  _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8699  case 0x01:
8700  return vreinterpretq_m128i_u64(
8701  _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8702  case 0x10:
8703  return vreinterpretq_m128i_u64(
8704  _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8705  case 0x11:
8706  return vreinterpretq_m128i_u64(
8707  _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8708  default:
8709  abort();
8710  }
8711 }
8712 
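// Usage sketch (illustrative, not from the upstream source): imm selects which
// 64-bit half of each operand feeds the carry-less multiply, the usual
// primitive behind GF(2^128) multiplication (e.g. GHASH). The products below
// are carry-less, so 0x87 "times" 0x02 is just a left shift by one bit.
//
//   __m128i x = _mm_set_epi64x(0, 0x87);
//   __m128i y = _mm_set_epi64x(0, 0x02);
//   __m128i p = _mm_clmulepi64_si128(x, y, 0x00); // x.lo x y.lo = 0x10e
//   __m128i q = _mm_clmulepi64_si128(x, y, 0x11); // x.hi x y.hi = 0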
8713 FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
8714 {
8715  union {
8716  fpcr_bitfield field;
8717 #if defined(__aarch64__)
8718  uint64_t value;
8719 #else
8720  uint32_t value;
8721 #endif
8722  } r;
8723 
8724 #if defined(__aarch64__)
8725  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
8726 #else
8727  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8728 #endif
8729 
8730  return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
8731 }
8732 
8733 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
8734 // return that count in dst.
8735 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
8736 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
8737 {
8738 #if defined(__aarch64__)
8739 #if __has_builtin(__builtin_popcount)
8740  return __builtin_popcount(a);
8741 #else
8742  return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8743 #endif
8744 #else
8745  uint32_t count = 0;
8746  uint8x8_t input_val, count8x8_val;
8747  uint16x4_t count16x4_val;
8748  uint32x2_t count32x2_val;
8749 
8750  input_val = vld1_u8((uint8_t *) &a);
8751  count8x8_val = vcnt_u8(input_val);
8752  count16x4_val = vpaddl_u8(count8x8_val);
8753  count32x2_val = vpaddl_u16(count16x4_val);
8754 
8755  vst1_u32(&count, count32x2_val);
8756  return count;
8757 #endif
8758 }
8759 
8760 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
8761 // return that count in dst.
8762 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
8763 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
8764 {
8765 #if defined(__aarch64__)
8766 #if __has_builtin(__builtin_popcountll)
8767  return __builtin_popcountll(a);
8768 #else
8769  return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8770 #endif
8771 #else
8772  uint64_t count = 0;
8773  uint8x8_t input_val, count8x8_val;
8774  uint16x4_t count16x4_val;
8775  uint32x2_t count32x2_val;
8776  uint64x1_t count64x1_val;
8777 
8778  input_val = vld1_u8((uint8_t *) &a);
8779  count8x8_val = vcnt_u8(input_val);
8780  count16x4_val = vpaddl_u8(count8x8_val);
8781  count32x2_val = vpaddl_u16(count16x4_val);
8782  count64x1_val = vpaddl_u32(count32x2_val);
8783  vst1_u64(&count, count64x1_val);
8784  return count;
8785 #endif
8786 }
8787 
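// Usage sketch (illustrative, not from the upstream source):
//
//   int     c32 = _mm_popcnt_u32(0xF0F0F0F0u);           // 16
//   int64_t c64 = _mm_popcnt_u64(0x0123456789ABCDEFull); // 32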
8788 FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
8789 {
8790  // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
8791  // regardless of the value of the FZ bit.
8792  union {
8793  fpcr_bitfield field;
8794 #if defined(__aarch64__)
8795  uint64_t value;
8796 #else
8797  uint32_t value;
8798 #endif
8799  } r;
8800 
8801 #if defined(__aarch64__)
8802  __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
8803 #else
8804  __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8805 #endif
8806 
8807  r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
8808 
8809 #if defined(__aarch64__)
8810  __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
8811 #else
8812  __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
8813 #endif
8814 }
8815 
8816 // Return the current 64-bit value of the processor's time-stamp counter.
8817 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
8818 
8819 FORCE_INLINE uint64_t _rdtsc(void)
8820 {
8821 #if defined(__aarch64__)
8822  uint64_t val;
8823 
8824  /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
8825  * system counter is at least 56 bits wide; from Armv8.6, the counter
8826  * must be 64 bits wide. So the system counter could be less than 64
8827  * bits wide, in which case the flag 'cap_user_time_short'
8828  * is set to true.
8829  */
8830  asm volatile("mrs %0, cntvct_el0" : "=r"(val));
8831 
8832  return val;
8833 #else
8834  uint32_t pmccntr, pmuseren, pmcntenset;
8835  // Read the user mode Performance Monitoring Unit (PMU)
8836  // User Enable Register (PMUSERENR) access permissions.
8837  asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
8838  if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
8839  asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
8840  if (pmcntenset & 0x80000000UL) { // Is it counting?
8841  asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
8842  // The counter is set up to count every 64th cycle
8843  return (uint64_t) (pmccntr) << 6;
8844  }
8845  }
8846 
8847  // Fallback to syscall as we can't enable PMUSERENR in user mode.
8848  struct timeval tv;
8849  gettimeofday(&tv, NULL);
8850  return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
8851 #endif
8852 }
8853 
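// Usage sketch (illustrative, not from the upstream source): on AArch64 the
// value comes from the virtual counter (cntvct_el0), so differences are in
// counter ticks rather than CPU cycles; scale by the counter frequency
// (cntfrq_el0) if wall-clock time is needed.
//
//   uint64_t t0 = _rdtsc();
//   /* ... code being measured ... */
//   uint64_t t1 = _rdtsc();
//   uint64_t elapsed_ticks = t1 - t0;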
8854 #if defined(__GNUC__) || defined(__clang__)
8855 #pragma pop_macro("ALIGN_STRUCT")
8856 #pragma pop_macro("FORCE_INLINE")
8857 #endif
8858 
8859 #if defined(__GNUC__) && !defined(__clang__)
8860 #pragma GCC pop_options
8861 #endif
8862 
8863 #endif
8864 // clang-format on
#define vreinterpretq_u32_m128(x)
Definition: sse2neon.h:278
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4374
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
Definition: sse2neon.h:1933
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:5092
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3841
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
Definition: sse2neon.h:7642
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:3500
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3442
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6864
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1377
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
Definition: sse2neon.h:5226
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
Definition: sse2neon.h:4751
#define _MM_ROUND_UP
Definition: sse2neon.h:214
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
Definition: sse2neon.h:6545
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5544
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1774
int64x1_t __m64
Definition: sse2neon.h:234
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:7940
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:4696
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
Definition: sse2neon.h:5674
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3509
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
Definition: sse2neon.h:8634
#define vreinterpret_s32_m64(x)
Definition: sse2neon.h:330
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
Definition: sse2neon.h:7110
#define vreinterpretq_m128d_u32(x)
Definition: sse2neon.h:355
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3143
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
Definition: sse2neon.h:4288
#define vreinterpret_m64_u64(x)
Definition: sse2neon.h:317
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:2016
#define __int64
Definition: sse2neon.h:252
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
Definition: sse2neon.h:659
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3367
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3400
#define vreinterpretq_u64_m128i(x)
Definition: sse2neon.h:307
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
Definition: sse2neon.h:4263
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
Definition: sse2neon.h:7595
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition: sse2neon.h:4513
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
Definition: sse2neon.h:8788
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t)
Definition: sse2neon.h:5132
#define _MM_ROUND_TOWARD_ZERO
Definition: sse2neon.h:215
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
Definition: sse2neon.h:620
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1118
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
Definition: sse2neon.h:6152
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
Definition: sse2neon.h:4613
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1198
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7879
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:6182
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1299
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE int _mm_movemask_ps(__m128 a)
Definition: sse2neon.h:2177
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
Definition: sse2neon.h:7796
FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
Definition: sse2neon.h:5263
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:5021
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3577
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
Definition: sse2neon.h:7613
#define _mm_set_pd1
Definition: sse2neon.h:5181
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4626
FORCE_INLINE void _mm_stream_si32(int *p, int a)
Definition: sse2neon.h:6055
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:7897
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
Definition: sse2neon.h:6880
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3709
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3068
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
Definition: sse2neon.h:4128
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
Definition: sse2neon.h:5965
FORCE_INLINE __m128d _mm_set1_pd(double d)
Definition: sse2neon.h:5247
#define _MM_FROUND_TO_NEAREST_INT
Definition: sse2neon.h:199
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2942
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
Definition: sse2neon.h:1651
FORCE_INLINE void _mm_prefetch(const void *p, int i)
Definition: sse2neon.h:2308
#define vreinterpretq_m128d_u64(x)
Definition: sse2neon.h:356
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
Definition: sse2neon.h:4237
FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
Definition: sse2neon.h:5885
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1183
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
Definition: sse2neon.h:6122
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
Definition: sse2neon.h:8429
#define SSE2NEON_AES_U3(p)
#define FORCE_INLINE
Definition: sse2neon.h:100
#define vreinterpret_s16_m64(x)
Definition: sse2neon.h:329
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
Definition: sse2neon.h:6651
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition: sse2neon.h:5100
#define vreinterpretq_nth_u8_m128i(x, n)
Definition: sse2neon.h:406
FORCE_INLINE __m128d _mm_load_sd(const double *p)
Definition: sse2neon.h:4458
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1228
FORCE_INLINE unsigned int _mm_getcsr()
Definition: sse2neon.h:2515
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition: sse2neon.h:3991
#define vreinterpretq_m128i_u32(x)
Definition: sse2neon.h:293
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:6206
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:2071
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
Definition: sse2neon.h:910
#define SSE2NEON_AES_U1(p)
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
Definition: sse2neon.h:6142
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:3172
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
Definition: sse2neon.h:884
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1047
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
Definition: sse2neon.h:2809
FORCE_INLINE __m128 _mm_ceil_ps(__m128)
Definition: sse2neon.h:7486
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
Definition: sse2neon.h:8015
#define vreinterpretq_m128i_s32(x)
Definition: sse2neon.h:288
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:8108
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
Definition: sse2neon.h:7403
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
Definition: sse2neon.h:597
FORCE_INLINE __m128 _mm_set_ss(float a)
Definition: sse2neon.h:2492
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
Definition: sse2neon.h:695
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5001
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
Definition: sse2neon.h:686
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition: sse2neon.h:8763
FORCE_INLINE void _mm_pause()
Definition: sse2neon.h:5082
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:6167
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
Definition: sse2neon.h:7972
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
Definition: sse2neon.h:3922
static const uint8_t SSE2NEON_sbox[256]
Definition: sse2neon.h:8513
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:5631
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition: sse2neon.h:1900
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1162
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
Definition: sse2neon.h:2350
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition: sse2neon.h:3206
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128)
Definition: sse2neon.h:2119
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
Definition: sse2neon.h:4313
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:7132
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3490
#define vreinterpretq_f32_m128(x)
Definition: sse2neon.h:273
FORCE_INLINE __m128 _mm_round_ps(__m128, int)
Definition: sse2neon.h:8205
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
Definition: sse2neon.h:651
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
Definition: sse2neon.h:4894
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3585
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
Definition: sse2neon.h:2228
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
Definition: sse2neon.h:7810
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
Definition: sse2neon.h:5124
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5897
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:4971
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5993
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
Definition: sse2neon.h:900
FORCE_INLINE __m128d _mm_undefined_pd(void)
Definition: sse2neon.h:6221
#define _MM_FROUND_TO_ZERO
Definition: sse2neon.h:202
FORCE_INLINE __m128d _mm_set_pd(double, double)
Definition: sse2neon.h:5168
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
Definition: sse2neon.h:3222
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
Definition: sse2neon.h:669
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition: sse2neon.h:1885
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
Definition: sse2neon.h:7603
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
Definition: sse2neon.h:7547
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
Definition: sse2neon.h:6563
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
Definition: sse2neon.h:1494
#define _MM_FLUSH_ZERO_ON
Definition: sse2neon.h:218
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:8378
#define _MM_DENORMALS_ZERO_MASK
Definition: sse2neon.h:221
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
Definition: sse2neon.h:1610
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
Definition: sse2neon.h:8346
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3230
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4732
#define vreinterpretq_m128i_u16(x)
Definition: sse2neon.h:292
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
Definition: sse2neon.h:1532
_mm_hint (enum)
Definition: sse2neon.h:545
_MM_HINT_NTA
Definition: sse2neon.h:546
_MM_HINT_T0
Definition: sse2neon.h:547
_MM_HINT_T1
Definition: sse2neon.h:548
_MM_HINT_T2
Definition: sse2neon.h:549
_MM_HINT_ENTA
Definition: sse2neon.h:550
_MM_HINT_ET0
Definition: sse2neon.h:551
_MM_HINT_ET1
Definition: sse2neon.h:552
_MM_HINT_ET2
Definition: sse2neon.h:553
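The _mm_hint values above are consumed by _mm_prefetch(), which is also listed in this index. A minimal, illustrative usage sketch follows; the function name and the 16-float lookahead distance are assumptions for the example, not part of the header.

/* Sketch: prefetch one block ahead while scanning an array. */
#include "sse2neon.h"

static float sum_with_prefetch(const float *data, int n)
{
    float acc = 0.0f;
    for (int i = 0; i < n; ++i) {
        if (i + 16 < n)                                   /* stay in bounds */
            _mm_prefetch(&data[i + 16], _MM_HINT_T0);     /* hint: keep in all cache levels */
        acc += data[i];
    }
    return acc;
}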
#define _MM_FROUND_CUR_DIRECTION
Definition: sse2neon.h:203
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:3187
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
Definition: sse2neon.h:8285
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition: sse2neon.h:2359
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5926
#define vreinterpretq_u16_m128i(x)
Definition: sse2neon.h:305
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
Definition: sse2neon.h:4169
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
Definition: sse2neon.h:2155
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
Definition: sse2neon.h:916
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
Definition: sse2neon.h:1713
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3319
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
Definition: sse2neon.h:6027
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3011
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
Definition: sse2neon.h:4347
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:6386
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
Definition: sse2neon.h:5616
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:6809
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
Definition: sse2neon.h:2447
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
Definition: sse2neon.h:4500
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:6106
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
Definition: sse2neon.h:5521
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4672
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
Definition: sse2neon.h:1428
#define vreinterpret_s8_m64(x)
Definition: sse2neon.h:328
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
Definition: sse2neon.h:5475
#define vreinterpret_u8_m64(x)
Definition: sse2neon.h:323
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition: sse2neon.h:2751
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:6351
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1345
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:7248
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1141
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
Definition: sse2neon.h:1823
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
Definition: sse2neon.h:737
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
Definition: sse2neon.h:6597
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
Definition: sse2neon.h:1966
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
Definition: sse2neon.h:6450
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
Definition: sse2neon.h:8297
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
Definition: sse2neon.h:5910
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
Definition: sse2neon.h:7440
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
Definition: sse2neon.h:6429
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2110
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3464
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3643
#define SSE2NEON_AES_H0(x)
Definition: sse2neon.h:8512
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
Definition: sse2neon.h:6707
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
Definition: sse2neon.h:5789
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
Definition: sse2neon.h:6818
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
Definition: sse2neon.h:1662
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1285
#define _MM_FROUND_NO_EXC
Definition: sse2neon.h:204
#define vreinterpretq_s64_m128i(x)
Definition: sse2neon.h:302
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
Definition: sse2neon.h:4186
#define _MM_FLUSH_ZERO_OFF
Definition: sse2neon.h:219
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:7003
SIMDVec
Definition: sse2neon.h:401
FORCE_INLINE void _mm_clflush(void const *p)
Definition: sse2neon.h:3258
(struct) FPCR bit-field mapping
Definition: sse2neon.h:557
uint16_t res0
Definition: sse2neon.h:558
uint8_t res1
Definition: sse2neon.h:559
uint8_t bit22
Definition: sse2neon.h:560
uint8_t bit23
Definition: sse2neon.h:561
uint8_t bit24
Definition: sse2neon.h:562
uint8_t res2
Definition: sse2neon.h:563
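Bits 22-24 of the AArch64 FPCR hold the rounding-mode and flush-to-zero controls, and this bit-field struct is what the MXCSR-style helpers listed in this index (_MM_GET_ROUNDING_MODE, _MM_SET_ROUNDING_MODE, _mm_getcsr) operate on. A minimal usage sketch, assuming only those documented entry points; the wrapper function itself is illustrative.

/* Sketch: temporarily switch to round-toward-zero, then restore. */
#include "sse2neon.h"

static void use_truncating_rounding(void)
{
    unsigned int previous = _MM_GET_ROUNDING_MODE();   /* save the current mode */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);      /* round toward zero */
    /* ... conversions that rely on truncation go here ... */
    _MM_SET_ROUNDING_MODE(previous);                   /* restore the saved mode */
}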
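A minimal end-to-end sketch of calling intrinsics from this index on Arm/AArch64: the function name, the 16-byte alignment of the arrays, and n being a multiple of 4 are illustrative assumptions, not requirements stated by the header.

/* Sketch: elementwise add of two float arrays, four lanes at a time. */
#include "sse2neon.h"

static void add_arrays(float *dst, const float *a, const float *b, int n)
{
    for (int i = 0; i < n; i += 4) {
        __m128 va = _mm_load_ps(&a[i]);            /* 4 aligned float loads */
        __m128 vb = _mm_load_ps(&b[i]);
        _mm_store_ps(&dst[i], _mm_add_ps(va, vb)); /* vector add + aligned store */
    }
}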