73 #ifndef SSE2NEON_PRECISE_MINMAX
74 #define SSE2NEON_PRECISE_MINMAX (0)
77 #ifndef SSE2NEON_PRECISE_DIV
78 #define SSE2NEON_PRECISE_DIV (0)
81 #ifndef SSE2NEON_PRECISE_SQRT
82 #define SSE2NEON_PRECISE_SQRT (0)
85 #ifndef SSE2NEON_PRECISE_DP
86 #define SSE2NEON_PRECISE_DP (0)
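/* The SSE2NEON_PRECISE_* switches above trade speed for closer x86 semantics
 * (min/max NaN handling, full-precision division/sqrt, dot-product rounding).
 * A minimal sketch of how a user might enable them (hypothetical build line,
 * not part of this header):
 *   cc -O2 -DSSE2NEON_PRECISE_DIV=1 -DSSE2NEON_PRECISE_SQRT=1 app.c
 * or, before including the header:
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #include "sse2neon.h"
 */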
90 #if defined(__GNUC__) || defined(__clang__)
91 #pragma push_macro("FORCE_INLINE")
92 #pragma push_macro("ALIGN_STRUCT")
93 #define FORCE_INLINE static inline __attribute__((always_inline))
94 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
95 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
96 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
98 #warning "Macro name collisions may happen with an unsupported compiler."
100 #define FORCE_INLINE static inline
103 #define ALIGN_STRUCT(x) __declspec(align(x))
105 #define _sse2neon_likely(x) (x)
106 #define _sse2neon_unlikely(x) (x)
111 #define _sse2neon_const static const
113 #define _sse2neon_const const
121 #if defined(__GNUC__)
122 #if defined(__arm__) && __ARM_ARCH == 7
127 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
128 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
130 #if !defined(__clang__)
131 #pragma GCC push_options
132 #pragma GCC target("fpu=neon")
134 #elif defined(__aarch64__)
135 #if !defined(__clang__)
136 #pragma GCC push_options
137 #pragma GCC target("+simd")
139 #elif __ARM_ARCH == 8
140 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
142 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
144 #if !defined(__clang__)
145 #pragma GCC push_options
148 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
152 #include <arm_neon.h>
153 #if !defined(__aarch64__) && (__ARM_ARCH == 8)
154 #if defined __has_include && __has_include(<arm_acle.h>)
155 #include <arm_acle.h>
160 #if !defined(__aarch64__)
169 #if !defined(__aarch64__)
176 #ifndef __has_builtin
178 #if defined(__GNUC__) && (__GNUC__ <= 9)
179 #define __has_builtin(x) HAS##x
180 #define HAS__builtin_popcount 1
181 #define HAS__builtin_popcountll 1
183 #define __has_builtin(x) 0
195 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
196 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
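/* _MM_SHUFFLE packs four 2-bit lane selectors (highest lane first) into one
 * 8-bit immediate, e.g. _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 (identity order) and
 * _MM_SHUFFLE(0, 0, 0, 0) == 0x00 (broadcast lane 0). Sketch:
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 */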
199 #define _MM_FROUND_TO_NEAREST_INT 0x00
200 #define _MM_FROUND_TO_NEG_INF 0x01
201 #define _MM_FROUND_TO_POS_INF 0x02
202 #define _MM_FROUND_TO_ZERO 0x03
203 #define _MM_FROUND_CUR_DIRECTION 0x04
204 #define _MM_FROUND_NO_EXC 0x08
205 #define _MM_FROUND_RAISE_EXC 0x00
206 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
207 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
208 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
209 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
210 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
211 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
212 #define _MM_ROUND_NEAREST 0x0000
213 #define _MM_ROUND_DOWN 0x2000
214 #define _MM_ROUND_UP 0x4000
215 #define _MM_ROUND_TOWARD_ZERO 0x6000
217 #define _MM_FLUSH_ZERO_MASK 0x8000
218 #define _MM_FLUSH_ZERO_ON 0x8000
219 #define _MM_FLUSH_ZERO_OFF 0x0000
221 #define _MM_DENORMALS_ZERO_MASK 0x0040
222 #define _MM_DENORMALS_ZERO_ON 0x0040
223 #define _MM_DENORMALS_ZERO_OFF 0x0000
226 #define __constrange(a, b) const
239 #if defined(__aarch64__)
248 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
249 #if (defined(__x86_64__) || defined(__i386__))
250 #define __int64 long long
252 #define __int64 int64_t
258 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
259 #define vreinterpretq_m128_f32(x) (x)
260 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
262 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
263 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
264 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
265 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
267 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
268 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
269 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
270 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
272 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
273 #define vreinterpretq_f32_m128(x) (x)
274 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
276 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
277 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
278 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
279 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
281 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
282 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
283 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
284 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
286 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
287 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
288 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
289 #define vreinterpretq_m128i_s64(x) (x)
291 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
292 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
293 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
294 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
296 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
297 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
299 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
300 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
301 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
302 #define vreinterpretq_s64_m128i(x) (x)
304 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
305 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
306 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
307 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
309 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
310 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
311 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
312 #define vreinterpret_m64_s64(x) (x)
314 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
315 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
316 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
317 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
319 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
320 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
321 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
323 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
324 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
325 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
326 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
328 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
329 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
330 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
331 #define vreinterpret_s64_m64(x) (x)
333 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
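/* The vreinterpret* helpers above are zero-cost bitcasts: sse2neon models
 * __m128 as float32x4_t, __m128i as int64x2_t and __m64 as int64x1_t, so each
 * conversion only renames the register type for the compiler. Sketch:
 *   __m128i x = _mm_set1_epi32(7);
 *   int32x4_t n = vreinterpretq_s32_m128i(x);  // no instruction emitted
 */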
335 #if defined(__aarch64__)
336 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
337 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
339 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
341 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
342 #define vreinterpretq_m128d_f64(x) (x)
344 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
346 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
347 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
349 #define vreinterpretq_f64_m128d(x) (x)
350 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
352 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
353 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
355 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
356 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
358 #define vreinterpretq_m128d_f32(x) (x)
360 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
362 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
363 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
365 #define vreinterpretq_f32_m128d(x) (x)
398 uint16_t m128_u16[8];
399 uint32_t m128_u32[4];
400 uint64_t m128_u64[2];
404 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
405 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
406 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
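/* SIMDVec is a plain union over the 128-bit value, so the *_nth_* accessors
 * above read individual lanes through memory aliasing instead of vgetq_lane.
 * A minimal sketch of the intended use:
 *   __m128i v = _mm_set1_epi32(-1);
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // 0xFFFFFFFF
 */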
409 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
410 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
411 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
412 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
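/* These MXCSR-style controls cannot touch MXCSR on ARM; the helper functions
 * they map to emulate the behaviour by reading and rewriting the floating-point
 * control register (FPCR on AArch64, FPSCR on AArch32) with inline assembly,
 * as seen further below. Usage matches the x86 intrinsics, e.g.:
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 */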
446 #if defined(__GNUC__) && !defined(__clang__) && \
447 ((__GNUC__ <= 12 && defined(__arm__)) || \
448 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
449 (__GNUC__ <= 9 && defined(__aarch64__)))
453 ret.val[0] = vld1q_u8(p + 0);
454 ret.val[1] = vld1q_u8(p + 16);
455 ret.val[2] = vld1q_u8(p + 32);
456 ret.val[3] = vld1q_u8(p + 48);
463 return vld1q_u8_x4(p);
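/* The GCC version checks above cover toolchains where vld1q_u8_x4 is missing
 * or broken, so the helper falls back to four plain vld1q_u8 loads; on fixed
 * compilers it forwards to the single structured load. */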
564 #if defined(__aarch64__)
590 float32x2_t a21 = vget_high_f32(
592 float32x2_t b03 = vget_low_f32(
599 float32x2_t a03 = vget_low_f32(
601 float32x2_t b21 = vget_high_f32(
664 float32x2_t a02 = vset_lane_f32(a0, a22, 1);
682 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
689 float32_t b2 = vgetq_lane_f32(b, 2);
691 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
698 float32_t b2 = vgetq_lane_f32(b, 2);
700 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
714 #if defined(__ARM_FEATURE_CRYPTO) && \
715 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
719 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
720 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
721 return vreinterpretq_u64_p128(vmull_p64(a, b));
739 poly8x8_t a = vreinterpret_p8_u64(_a);
740 poly8x8_t b = vreinterpret_p8_u64(_b);
743 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
744 vcreate_u8(0x00000000ffffffff));
745 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
746 vcreate_u8(0x0000000000000000));
749 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
751 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
753 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
755 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
757 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
759 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
761 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
763 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
766 uint8x16_t l = veorq_u8(e, f);
767 uint8x16_t m = veorq_u8(g, h);
768 uint8x16_t n = veorq_u8(i, j);
772 #if defined(__aarch64__)
773 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
774 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
775 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
776 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
777 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
778 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
779 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
780 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
782 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
783 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
784 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
785 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
789 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
790 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
791 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
795 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
796 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
797 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
800 #if defined(__aarch64__)
801 uint8x16_t t0 = vreinterpretq_u8_u64(
802 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
803 uint8x16_t t1 = vreinterpretq_u8_u64(
804 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
805 uint8x16_t t2 = vreinterpretq_u8_u64(
806 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
807 uint8x16_t t3 = vreinterpretq_u8_u64(
808 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
810 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
811 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
812 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
813 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
816 uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
817 uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
818 uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
819 uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
822 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
823 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
824 uint8x16_t mix = veorq_u8(d, cross1);
825 uint8x16_t r = veorq_u8(mix, cross2);
826 return vreinterpretq_u64_u8(r);
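/* Without the crypto extension there is no 64x64 -> 128 carry-less multiply,
 * so the path above builds the product from 8-bit vmull_p8 partial products,
 * masks the overlapping bytes with k48_32/k16_00, and folds the byte-shifted
 * partial products together with XOR to form the final 128-bit result. */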
838 #define _mm_shuffle_epi32_default(a, imm) \
842 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
843 ret = vsetq_lane_s32( \
844 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
846 ret = vsetq_lane_s32( \
847 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
849 ret = vsetq_lane_s32( \
850 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
852 vreinterpretq_m128i_s32(ret); \
939 #if defined(__aarch64__)
940 #define _mm_shuffle_epi32_splat(a, imm) \
942 vreinterpretq_m128i_s32( \
943 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
946 #define _mm_shuffle_epi32_splat(a, imm) \
948 vreinterpretq_m128i_s32( \
949 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
967 #define _mm_shuffle_ps_default(a, b, imm) \
971 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
972 ret = vsetq_lane_f32( \
973 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
975 ret = vsetq_lane_f32( \
976 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
978 ret = vsetq_lane_f32( \
979 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
981 vreinterpretq_m128_f32(ret); \
990 #define _mm_shufflelo_epi16_function(a, imm) \
992 int16x8_t ret = vreinterpretq_s16_m128i(a); \
993 int16x4_t lowBits = vget_low_s16(ret); \
994 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
995 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
997 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
999 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1001 vreinterpretq_m128i_s16(ret); \
1010 #define _mm_shufflehi_epi16_function(a, imm) \
1012 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1013 int16x4_t highBits = vget_high_s16(ret); \
1014 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1015 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1017 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1019 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1021 vreinterpretq_m128i_s16(ret); \
1050 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1329 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1339 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1349 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1359 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1371 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1411 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1439 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1443 float32_t data = vgetq_lane_f32(
1445 return (int32_t) data;
1559 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1593 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1594 static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1595 int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1641 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1673 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1683 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1686 float32_t data = vgetq_lane_f32(
1688 return (int64_t) data;
1727 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1735 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1758 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1764 #if SSE2NEON_PRECISE_DIV
1785 #define _mm_extract_pi16(a, imm) \
1786 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1803 #if defined(__aarch64__)
1810 #if defined(__aarch64__)
1811 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
1813 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1827 #if defined(__aarch64__)
1834 #if defined(__aarch64__)
1835 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
1837 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1840 if (r.field.bit22) {
1850 #define _mm_insert_pi16(a, b, imm) \
1852 vreinterpret_m64_s16( \
1853 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1872 #define _mm_load_ps1 _mm_load1_ps
1903 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1920 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1935 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1957 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1969 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1979 return malloc(size);
1980 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1981 align = sizeof(void *);
1982 if (!posix_memalign(&ptr, align, size))
1998 vst1_s8((int8_t *) mem_addr, masked);
2005 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2027 #if SSE2NEON_PRECISE_MINMAX
2057 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2082 #if SSE2NEON_PRECISE_MINMAX
2112 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2158 #if defined(__aarch64__)
2159 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2160 uint8x8_t tmp = vshr_n_u8(input, 7);
2161 return vaddv_u8(vshl_u8(tmp, shift));
2164 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2165 uint32x2_t paired16 =
2166 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2167 uint8x8_t paired32 =
2168 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2169 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2180 #if defined(__aarch64__)
2181 static const int32x4_t shift = {0, 1, 2, 3};
2182 uint32x4_t tmp = vshrq_n_u32(input, 31);
2183 return vaddvq_u32(vshlq_u32(tmp, shift));
2188 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2191 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2193 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
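/* NEON has no direct movemask, so the fallbacks above shift each lane's sign
 * bit down to bit 0 and fold neighbouring lanes together with vsra
 * (shift-right-and-accumulate) until the bits are adjacent; on AArch64 the
 * same result comes from a shift plus a horizontal add (vaddv). Sketch:
 *   int m = _mm_movemask_ps(_mm_set_ps(-1.f, 1.f, -1.f, 1.f));  // m == 0xA
 */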
2252 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2263 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2268 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2273 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2278 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2283 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2288 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
2293 #define _m_pminub(a, b) _mm_min_pu8(a, b)
2298 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
2304 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2311 __builtin_prefetch(p);
2319 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2324 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2334 #if SSE2NEON_PRECISE_DIV
2362 #if SSE2NEON_PRECISE_SQRT
2379 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2389 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2392 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2405 #if defined(__aarch64__)
2412 #if defined(__aarch64__)
2413 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
2415 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2420 #if defined(__aarch64__)
2421 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
2423 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2451 #if defined(__aarch64__)
2458 #if defined(__aarch64__)
2459 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
2461 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2482 #if defined(__aarch64__)
2483 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
2485 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2539 #if __has_builtin(__builtin_shufflevector)
2540 #define _mm_shuffle_pi16(a, imm) \
2542 vreinterpret_m64_s16(__builtin_shufflevector( \
2543 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2544 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2547 #define _mm_shuffle_pi16(a, imm) \
2551 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2552 ret = vset_lane_s16( \
2553 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2555 ret = vset_lane_s16( \
2556 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2558 ret = vset_lane_s16( \
2559 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2561 vreinterpret_m64_s16(ret); \
2570 __sync_synchronize();
2575 #if __has_builtin(__builtin_shufflevector)
2576 #define _mm_shuffle_ps(a, b, imm) \
2578 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2579 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2580 float32x4_t _shuf = __builtin_shufflevector( \
2581 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2582 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2583 vreinterpretq_m128_f32(_shuf); \
2586 #define _mm_shuffle_ps(a, b, imm) \
2590 case _MM_SHUFFLE(1, 0, 3, 2): \
2591 ret = _mm_shuffle_ps_1032((a), (b)); \
2593 case _MM_SHUFFLE(2, 3, 0, 1): \
2594 ret = _mm_shuffle_ps_2301((a), (b)); \
2596 case _MM_SHUFFLE(0, 3, 2, 1): \
2597 ret = _mm_shuffle_ps_0321((a), (b)); \
2599 case _MM_SHUFFLE(2, 1, 0, 3): \
2600 ret = _mm_shuffle_ps_2103((a), (b)); \
2602 case _MM_SHUFFLE(1, 0, 1, 0): \
2603 ret = _mm_movelh_ps((a), (b)); \
2605 case _MM_SHUFFLE(1, 0, 0, 1): \
2606 ret = _mm_shuffle_ps_1001((a), (b)); \
2608 case _MM_SHUFFLE(0, 1, 0, 1): \
2609 ret = _mm_shuffle_ps_0101((a), (b)); \
2611 case _MM_SHUFFLE(3, 2, 1, 0): \
2612 ret = _mm_shuffle_ps_3210((a), (b)); \
2614 case _MM_SHUFFLE(0, 0, 1, 1): \
2615 ret = _mm_shuffle_ps_0011((a), (b)); \
2617 case _MM_SHUFFLE(0, 0, 2, 2): \
2618 ret = _mm_shuffle_ps_0022((a), (b)); \
2620 case _MM_SHUFFLE(2, 2, 0, 0): \
2621 ret = _mm_shuffle_ps_2200((a), (b)); \
2623 case _MM_SHUFFLE(3, 2, 0, 2): \
2624 ret = _mm_shuffle_ps_3202((a), (b)); \
2626 case _MM_SHUFFLE(3, 2, 3, 2): \
2627 ret = _mm_movehl_ps((b), (a)); \
2629 case _MM_SHUFFLE(1, 1, 3, 3): \
2630 ret = _mm_shuffle_ps_1133((a), (b)); \
2632 case _MM_SHUFFLE(2, 0, 1, 0): \
2633 ret = _mm_shuffle_ps_2010((a), (b)); \
2635 case _MM_SHUFFLE(2, 0, 0, 1): \
2636 ret = _mm_shuffle_ps_2001((a), (b)); \
2638 case _MM_SHUFFLE(2, 0, 3, 2): \
2639 ret = _mm_shuffle_ps_2032((a), (b)); \
2642 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2661 #if SSE2NEON_PRECISE_SQRT
2666 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2667 const uint32x4_t div_by_zero =
2668 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2669 recip = vreinterpretq_f32_u32(
2670 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2682 #elif defined(__aarch64__)
2686 float32x4_t sq = vrecpeq_f32(recipsq);
2722 vst1q_f32(p, vdupq_n_f32(a0));
2742 #define _mm_store1_ps _mm_store_ps1
2781 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2819 #if __has_builtin(__builtin_nontemporal_store)
2820 __builtin_nontemporal_store(a, (float32x4_t *) p);
2858 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2860 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2861 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2862 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2863 vget_low_f32(ROW23.val[0])); \
2864 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2865 vget_low_f32(ROW23.val[1])); \
2866 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2867 vget_high_f32(ROW23.val[0])); \
2868 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2869 vget_high_f32(ROW23.val[1])); \
2874 #define _mm_ucomieq_ss _mm_comieq_ss
2875 #define _mm_ucomige_ss _mm_comige_ss
2876 #define _mm_ucomigt_ss _mm_comigt_ss
2877 #define _mm_ucomile_ss _mm_comile_ss
2878 #define _mm_ucomilt_ss _mm_comilt_ss
2879 #define _mm_ucomineq_ss _mm_comineq_ss
2885 #if defined(__GNUC__) || defined(__clang__)
2886 #pragma GCC diagnostic push
2887 #pragma GCC diagnostic ignored "-Wuninitialized"
2891 #if defined(__GNUC__) || defined(__clang__)
2892 #pragma GCC diagnostic pop
2900 #if defined(__GNUC__) || defined(__clang__)
2901 #pragma GCC diagnostic push
2902 #pragma GCC diagnostic ignored "-Wuninitialized"
2906 #if defined(__GNUC__) || defined(__clang__)
2907 #pragma GCC diagnostic pop
2922 #if defined(__aarch64__)
2928 float32x2x2_t result = vzip_f32(a1, b1);
2944 #if defined(__aarch64__)
2950 float32x2x2_t result = vzip_f32(a1, b1);
3013 #if defined(__aarch64__)
3014 return vreinterpretq_m128d_f64(
3015 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3017 double *da = (double *) &a;
3018 double *db = (double *) &b;
3020 c[0] = da[0] + db[0];
3021 c[1] = da[1] + db[1];
3022 return vld1q_f32((float32_t *) c);
3036 #if defined(__aarch64__)
3039 double *da = (double *) &a;
3040 double *db = (double *) &b;
3042 c[0] = da[0] + db[0];
3044 return vld1q_f32((float32_t *) c);
3196 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3201 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3240 #if defined(__aarch64__)
3295 #if defined(__aarch64__)
3297 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3302 uint32x4_t swapped = vrev64q_u32(cmp);
3321 #if defined(__aarch64__)
3323 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3330 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3331 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3343 #if defined(__aarch64__)
3351 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3402 #if defined(__aarch64__)
3404 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3411 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3412 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3424 #if defined(__aarch64__)
3432 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3444 #if defined(__aarch64__)
3446 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3453 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3454 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3466 #if defined(__aarch64__)
3474 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3520 #if defined(__aarch64__)
3522 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3529 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3530 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3542 #if defined(__aarch64__)
3549 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3561 #if defined(__aarch64__)
3563 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3568 uint32x4_t swapped = vrev64q_u32(cmp);
3587 #if defined(__aarch64__)
3589 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3590 vdupq_n_u64(UINT64_MAX)));
3598 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3600 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3620 #if defined(__aarch64__)
3622 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3623 vdupq_n_u64(UINT64_MAX)));
3631 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3633 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3653 #if defined(__aarch64__)
3655 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656 vdupq_n_u64(UINT64_MAX)));
3664 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3666 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3686 #if defined(__aarch64__)
3688 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3689 vdupq_n_u64(UINT64_MAX)));
3697 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3699 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3719 #if defined(__aarch64__)
3721 uint64x2_t not_nan_a =
3722 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3723 uint64x2_t not_nan_b =
3724 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3732 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3733 (*(double *) &b0) == (*(double *) &b0))
3736 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3737 (*(double *) &b1) == (*(double *) &b1))
3751 #if defined(__aarch64__)
3758 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3759 (*(double *) &b0) == (*(double *) &b0))
3773 #if defined(__aarch64__)
3775 uint64x2_t not_nan_a =
3776 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3777 uint64x2_t not_nan_b =
3778 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3780 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3787 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3788 (*(double *) &b0) == (*(double *) &b0))
3791 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3792 (*(double *) &b1) == (*(double *) &b1))
3806 #if defined(__aarch64__)
3813 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3814 (*(double *) &b0) == (*(double *) &b0))
3828 #if defined(__aarch64__)
3829 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3834 return (*(double *) &a0 >= *(double *) &b0);
3843 #if defined(__aarch64__)
3844 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3849 return (*(double *) &a0 > *(double *) &b0);
3858 #if defined(__aarch64__)
3859 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3864 return (*(double *) &a0 <= *(double *) &b0);
3873 #if defined(__aarch64__)
3874 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3879 return (*(double *) &a0 < *(double *) &b0);
3888 #if defined(__aarch64__)
3889 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3891 uint32x4_t a_not_nan =
3893 uint32x4_t b_not_nan =
3895 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3898 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3899 vreinterpretq_u64_u32(a_eq_b));
3900 return vgetq_lane_u64(and_results, 0) & 0x1;
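/* Scalar double compares on the ARMv7 path reproduce x86 NaN behaviour by
 * hand: the code above builds "is not NaN" masks with a self-compare (x == x
 * is false only for NaN) and ANDs them with the equality mask before
 * extracting bit 0 of lane 0. */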
3924 #if defined(__aarch64__)
3925 return vreinterpretq_m128d_f64(
3955 double d0 = ((double *) &rnd)[0];
3956 double d1 = ((double *) &rnd)[1];
3973 double d0 = ((double *) &rnd)[0];
3974 double d1 = ((double *) &rnd)[1];
3993 #if defined(__aarch64__)
3994 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3997 float a0 = (float) ((double *) &a)[0];
3998 float a1 = (float) ((double *) &a)[1];
4015 #if defined(__aarch64__)
4016 return vreinterpretq_m128d_f64(
4038 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
4050 float *f = (float *) &a;
4053 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4056 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4058 int32x4_t r_trunc = vcvtq_s32_f32(
4060 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4061 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
4062 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4064 float32x4_t delta = vsubq_f32(
4066 vcvtq_f32_s32(r_trunc));
4067 uint32x4_t is_delta_half =
4068 vceqq_f32(delta, half);
4070 vbslq_s32(is_delta_half, r_even, r_normal));
4073 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4079 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4098 #if defined(__aarch64__)
4099 return vreinterpretq_m128d_f64(
4115 #if defined(__aarch64__)
4116 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4118 return ((double *) &a)[0];
4130 #if defined(__aarch64__)
4131 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4134 double ret = ((double *) &rnd)[0];
4135 return (int32_t) ret;
4147 #if defined(__aarch64__)
4148 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4151 double ret = ((double *) &rnd)[0];
4152 return (int64_t) ret;
4162 #define _mm_cvtsd_si64x _mm_cvtsd_si64
4171 #if defined(__aarch64__)
4173 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4203 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4211 #if defined(__aarch64__)
4212 return vreinterpretq_m128d_f64(
4213 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4215 double bf = (double) b;
4226 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4248 #if defined(__aarch64__)
4249 return vreinterpretq_m128d_f64(
4250 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4252 double bf = (double) b;
4271 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4277 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4291 #if defined(__aarch64__)
4292 return vreinterpretq_m128d_f64(
4293 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4305 double a0 = ((double *) &a)[0];
4306 double a1 = ((double *) &a)[1];
4315 double a0 = ((double *) &a)[0];
4316 double a1 = ((double *) &a)[1];
4337 double ret = *((double *) &a);
4338 return (int32_t) ret;
4349 #if defined(__aarch64__)
4350 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4352 double ret = *((double *) &a);
4353 return (int64_t) ret;
4363 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4376 #if defined(__aarch64__)
4377 return vreinterpretq_m128d_f64(
4378 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4380 double *da = (double *) &a;
4381 double *db = (double *) &b;
4383 c[0] = da[0] / db[0];
4384 c[1] = da[1] / db[1];
4385 return vld1q_f32((float32_t *) c);
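/* ARMv7 NEON has no float64x2_t arithmetic, so the *_pd fallbacks in this
 * file (add/sub/mul/div, compares, sqrt) reinterpret the vector as two
 * doubles, do the work in scalar C, and reload the result; the division above
 * computes c[0] = da[0] / db[0] and c[1] = da[1] / db[1] before vld1q_f32
 * packs the pair back into the 128-bit register. */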
4396 #if defined(__aarch64__)
4398 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4399 return vreinterpretq_m128d_f64(
4400 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4410 #define _mm_extract_epi16(a, imm) \
4411 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4418 #define _mm_insert_epi16(a, b, imm) \
4420 vreinterpretq_m128i_s16( \
4421 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4432 #if defined(__aarch64__)
4433 return vreinterpretq_m128d_f64(vld1q_f64(p));
4435 const float *fp = (const float *) p;
4448 #define _mm_load_pd1 _mm_load1_pd
4460 #if defined(__aarch64__)
4461 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4463 const float *fp = (const float *) p;
4485 #if defined(__aarch64__)
4486 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4502 #if defined(__aarch64__)
4503 return vreinterpretq_m128d_f64(
4504 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4519 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4532 #if defined(__aarch64__)
4533 return vreinterpretq_m128d_f64(
4534 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4537 vcombine_f32(vld1_f32((const float *) p),
4552 #if defined(__aarch64__)
4553 float64x2_t v = vld1q_f64(p);
4554 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4556 int64x2_t v = vld1q_s64((const int64_t *) p);
4584 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4602 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4603 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4620 vst1q_s8((int8_t *) mem_addr, masked);
4646 #if defined(__aarch64__)
4647 #if SSE2NEON_PRECISE_MINMAX
4648 float64x2_t _a = vreinterpretq_f64_m128d(a);
4649 float64x2_t _b = vreinterpretq_f64_m128d(b);
4650 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4652 return vreinterpretq_m128d_f64(
4653 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4661 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4662 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4674 #if defined(__aarch64__)
4677 double *da = (double *) &a;
4678 double *db = (double *) &b;
4679 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4707 #if defined(__aarch64__)
4708 #if SSE2NEON_PRECISE_MINMAX
4709 float64x2_t _a = vreinterpretq_f64_m128d(a);
4710 float64x2_t _b = vreinterpretq_f64_m128d(b);
4711 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4713 return vreinterpretq_m128d_f64(
4714 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4722 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4723 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4734 #if defined(__aarch64__)
4737 double *da = (double *) &a;
4738 double *db = (double *) &b;
4739 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4802 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4817 uint32x4_t paired16 =
4818 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4831 uint64x2_t paired32 =
4832 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4845 uint8x16_t paired64 =
4846 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4853 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4862 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4863 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4907 #if defined(__aarch64__)
4908 return vreinterpretq_m128d_f64(
4909 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4911 double *da = (double *) &a;
4912 double *db = (double *) &b;
4914 c[0] = da[0] * db[0];
4915 c[1] = da[1] * db[1];
4916 return vld1q_f32((float32_t *) c);
4958 int32x4_t ab3210 = vmull_s16(a3210, b3210);
4961 int32x4_t ab7654 = vmull_s16(a7654, b7654);
4963 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4975 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4976 #if defined(__aarch64__)
4979 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4980 vreinterpretq_u16_u32(ab7654));
4985 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4987 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
5084 __asm__ __volatile__("isb\n");
5094 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5135 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5158 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5159 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5160 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5161 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5171 #if defined(__aarch64__)
5172 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5181 #define _mm_set_pd1 _mm_set1_pd
5249 #if defined(__aarch64__)
5250 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5311 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5312 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5313 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5314 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5330 #if defined(__aarch64__)
5331 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5348 #if __has_builtin(__builtin_shufflevector)
5349 #define _mm_shuffle_epi32(a, imm) \
5351 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5352 int32x4_t _shuf = __builtin_shufflevector( \
5353 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5354 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5355 vreinterpretq_m128i_s32(_shuf); \
5358 #define _mm_shuffle_epi32(a, imm) \
5362 case _MM_SHUFFLE(1, 0, 3, 2): \
5363 ret = _mm_shuffle_epi_1032((a)); \
5365 case _MM_SHUFFLE(2, 3, 0, 1): \
5366 ret = _mm_shuffle_epi_2301((a)); \
5368 case _MM_SHUFFLE(0, 3, 2, 1): \
5369 ret = _mm_shuffle_epi_0321((a)); \
5371 case _MM_SHUFFLE(2, 1, 0, 3): \
5372 ret = _mm_shuffle_epi_2103((a)); \
5374 case _MM_SHUFFLE(1, 0, 1, 0): \
5375 ret = _mm_shuffle_epi_1010((a)); \
5377 case _MM_SHUFFLE(1, 0, 0, 1): \
5378 ret = _mm_shuffle_epi_1001((a)); \
5380 case _MM_SHUFFLE(0, 1, 0, 1): \
5381 ret = _mm_shuffle_epi_0101((a)); \
5383 case _MM_SHUFFLE(2, 2, 1, 1): \
5384 ret = _mm_shuffle_epi_2211((a)); \
5386 case _MM_SHUFFLE(0, 1, 2, 2): \
5387 ret = _mm_shuffle_epi_0122((a)); \
5389 case _MM_SHUFFLE(3, 3, 3, 2): \
5390 ret = _mm_shuffle_epi_3332((a)); \
5392 case _MM_SHUFFLE(0, 0, 0, 0): \
5393 ret = _mm_shuffle_epi32_splat((a), 0); \
5395 case _MM_SHUFFLE(1, 1, 1, 1): \
5396 ret = _mm_shuffle_epi32_splat((a), 1); \
5398 case _MM_SHUFFLE(2, 2, 2, 2): \
5399 ret = _mm_shuffle_epi32_splat((a), 2); \
5401 case _MM_SHUFFLE(3, 3, 3, 3): \
5402 ret = _mm_shuffle_epi32_splat((a), 3); \
5405 ret = _mm_shuffle_epi32_default((a), (imm)); \
5419 #if __has_builtin(__builtin_shufflevector)
5420 #define _mm_shuffle_pd(a, b, imm8) \
5421 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5422 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5423 ((imm8 & 0x2) >> 1) + 2))
5425 #define _mm_shuffle_pd(a, b, imm8) \
5426 _mm_castsi128_pd(_mm_set_epi64x( \
5427 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5428 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5433 #if __has_builtin(__builtin_shufflevector)
5434 #define _mm_shufflehi_epi16(a, imm) \
5436 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5437 int16x8_t _shuf = __builtin_shufflevector( \
5438 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5439 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5440 (((imm) >> 6) & 0x3) + 4); \
5441 vreinterpretq_m128i_s16(_shuf); \
5444 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5449 #if __has_builtin(__builtin_shufflevector)
5450 #define _mm_shufflelo_epi16(a, imm) \
5452 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5453 int16x8_t _shuf = __builtin_shufflevector( \
5454 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5455 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5456 vreinterpretq_m128i_s16(_shuf); \
5459 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5481 int16x8_t vc = vdupq_n_s16((int16_t) c);
5504 int32x4_t vc = vdupq_n_s32((int32_t) c);
5527 int64x2_t vc = vdupq_n_s64((int64_t) c);
5610 vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
5618 #if defined(__aarch64__)
5619 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5621 double a0 = sqrt(((double *) &a)[0]);
5622 double a1 = sqrt(((double *) &a)[1]);
5633 #if defined(__aarch64__)
5636 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5655 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5676 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5697 const int count = (imm & ~15) ? 15 : imm;
5698 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5715 #define _mm_srai_epi32(a, imm) \
5718 if (_sse2neon_unlikely((imm) == 0)) { \
5720 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5721 ret = vreinterpretq_m128i_s32( \
5722 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5724 ret = vreinterpretq_m128i_s32( \
5725 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5749 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5772 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5795 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5812 #define _mm_srli_epi16(a, imm) \
5815 if (_sse2neon_unlikely((imm) & ~15)) { \
5816 ret = _mm_setzero_si128(); \
5818 ret = vreinterpretq_m128i_u16( \
5819 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5838 #define _mm_srli_epi32(a, imm) \
5841 if (_sse2neon_unlikely((imm) & ~31)) { \
5842 ret = _mm_setzero_si128(); \
5844 ret = vreinterpretq_m128i_u32( \
5845 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5863 #define _mm_srli_epi64(a, imm) \
5866 if (_sse2neon_unlikely((imm) & ~63)) { \
5867 ret = _mm_setzero_si128(); \
5869 ret = vreinterpretq_m128i_u64( \
5870 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5899 #if defined(__aarch64__)
5900 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5912 #if defined(__aarch64__)
5913 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5914 vst1q_f64((float64_t *) mem_addr,
5915 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5918 vst1q_f32((float32_t *) mem_addr,
5928 #if defined(__aarch64__)
5929 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5946 #define _mm_store1_pd _mm_store_pd1
5956 #if defined(__aarch64__)
5957 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5978 #if defined(__aarch64__)
5979 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
6029 #if __has_builtin(__builtin_nontemporal_store)
6030 __builtin_nontemporal_store(a, (float32x4_t *) p);
6031 #elif defined(__aarch64__)
6032 vst1q_f64(p, vreinterpretq_f64_m128d(a));
6044 #if __has_builtin(__builtin_nontemporal_store)
6045 __builtin_nontemporal_store(a, p);
6057 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6066 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6124 #if defined(__aarch64__)
6125 return vreinterpretq_m128d_f64(
6126 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6128 double *da = (double *) &a;
6129 double *db = (double *) &b;
6131 c[0] = da[0] - db[0];
6132 c[1] = da[1] - db[1];
6133 return vld1q_f32((float32_t *) c);
6212 #define _mm_ucomieq_sd _mm_comieq_sd
6213 #define _mm_ucomige_sd _mm_comige_sd
6214 #define _mm_ucomigt_sd _mm_comigt_sd
6215 #define _mm_ucomile_sd _mm_comile_sd
6216 #define _mm_ucomilt_sd _mm_comilt_sd
6217 #define _mm_ucomineq_sd _mm_comineq_sd
6223 #if defined(__GNUC__) || defined(__clang__)
6224 #pragma GCC diagnostic push
6225 #pragma GCC diagnostic ignored "-Wuninitialized"
6229 #if defined(__GNUC__) || defined(__clang__)
6230 #pragma GCC diagnostic pop
6249 #if defined(__aarch64__)
6255 int16x4x2_t result = vzip_s16(a1, b1);
6265 #if defined(__aarch64__)
6271 int32x2x2_t result = vzip_s32(a1, b1);
6302 #if defined(__aarch64__)
6310 int8x8x2_t result = vzip_s8(a1, b1);
6328 #if defined(__aarch64__)
6329 return vreinterpretq_m128d_f64(
6330 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6353 #if defined(__aarch64__)
6359 int16x4x2_t result = vzip_s16(a1, b1);
6375 #if defined(__aarch64__)
6381 int32x2x2_t result = vzip_s32(a1, b1);
6407 #if defined(__aarch64__)
6413 int8x8x2_t result = vzip_s8(a1, b1);
6431 #if defined(__aarch64__)
6432 return vreinterpretq_m128d_f64(
6433 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6483 #if defined(__aarch64__)
6484 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6485 vreinterpretq_f64_m128d(b),
6486 vreinterpretq_f64_m128d(mask)));
6499 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA)
6513 #if defined(__aarch64__)
6514 return vreinterpretq_m128d_f64(
6515 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6517 double *da = (double *) &a;
6518 double *db = (double *) &b;
6519 double c[] = {da[0] + da[1], db[0] + db[1]};
6529 #if defined(__aarch64__)
6538 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6547 #if defined(__aarch64__)
6548 float64x2_t a = vreinterpretq_f64_m128d(_a);
6549 float64x2_t b = vreinterpretq_f64_m128d(_b);
6550 return vreinterpretq_m128d_f64(
6551 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6553 double *da = (double *) &_a;
6554 double *db = (double *) &_b;
6555 double c[] = {da[0] - da[1], db[0] - db[1]};
6567 #if defined(__aarch64__)
6569 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6571 float32x4x2_t c = vuzpq_f32(a, b);
6583 #define _mm_lddqu_si128 _mm_loadu_si128
6592 #define _mm_loaddup_pd _mm_load1_pd
6599 #if defined(__aarch64__)
6600 return vreinterpretq_m128d_f64(
6601 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6613 #if __has_builtin(__builtin_shufflevector)
6629 #if __has_builtin(__builtin_shufflevector)
6742 tmp[1] = vdupq_n_u8(0);
6758 #define _mm_alignr_pi8(a, b, imm) \
6761 if (_sse2neon_unlikely((imm) >= 16)) { \
6762 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6764 uint8x8_t tmp_low, tmp_high; \
6766 const int idx = (imm) -8; \
6767 tmp_low = vreinterpret_u8_m64(a); \
6768 tmp_high = vdup_n_u8(0); \
6769 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6771 const int idx = (imm); \
6772 tmp_low = vreinterpret_u8_m64(b); \
6773 tmp_high = vreinterpret_u8_m64(a); \
6774 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6786 #if defined(__aarch64__)
6790 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6791 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6802 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6803 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6828 #if defined(__aarch64__)
6831 return vreinterpretq_s64_s16(
6832 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6839 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6840 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6853 #if defined(__aarch64__)
6854 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6856 int16x4x2_t res = vuzp_s16(a, b);
6857 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6868 #if defined(__aarch64__)
6870 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6872 int16x8x2_t c = vuzpq_s16(a, b);
6884 #if defined(__aarch64__)
6886 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6888 int32x4x2_t c = vuzpq_s32(a, b);
6900 #if defined(__aarch64__)
6903 int16x4x2_t c = vuzp_s16(a, b);
6915 #if defined(__aarch64__)
6918 int32x2x2_t c = vuzp_s32(a, b);
6930 #if defined(__aarch64__)
6932 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6934 int16x8x2_t c = vuzpq_s16(a, b);
6946 #if defined(__aarch64__)
6949 int16x4x2_t c = vuzp_s16(a, b);
6966 #if defined(__aarch64__)
6969 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6970 vmovl_s8(vget_low_s8(b)));
6971 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6972 vmovl_s8(vget_high_s8(b)));
6974 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6982 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6983 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6986 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6987 int16x8_t b_odd = vshrq_n_s16(b, 8);
6990 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6991 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
7009 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
7010 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
7013 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
7014 int16x4_t b_odd = vshr_n_s16(b, 8);
7017 int16x4_t prod1 = vmul_s16(a_even, b_even);
7018 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
7046 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7047 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7059 int32x4_t mul_extend =
7073 uint8x16_t idx_masked =
7074 vandq_u8(idx, vdupq_n_u8(0x8F));
7075 #if defined(__aarch64__)
7077 #elif defined(__GNUC__)
7081 __asm__ __volatile__(
7082 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7083 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7085 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7089 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7091 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7092 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
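/* _mm_shuffle_epi8 maps to a NEON table lookup: the index vector is ANDed
 * with 0x8F so indices carrying the x86 "zero this byte" bit (0x80) fall
 * outside the 16-byte table and produce 0. AArch64 can do the whole lookup
 * with one table-lookup instruction, while ARMv7 splits the table into two
 * 64-bit halves for vtbl2_s8 (or the vtbl.8 inline assembly above on GCC). */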
7112 const int8x8_t controlMask =
7139 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7141 #if defined(__aarch64__)
7142 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7144 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7149 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7151 int16x8_t res = vbicq_s16(masked, zeroMask);
7176 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7179 #if defined(__aarch64__)
7180 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7182 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7187 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7189 int32x4_t res = vbicq_s32(masked, zeroMask);
7214 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7217 #if defined(__aarch64__)
7218 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7220 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7225 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7227 int8x16_t res = vbicq_s8(masked, zeroMask);
7255 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7258 #if defined(__aarch64__)
7259 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7261 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7266 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7268 int16x4_t res = vbic_s16(masked, zeroMask);
7296 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7299 #if defined(__aarch64__)
7300 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7302 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7307 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7309 int32x2_t res = vbic_s32(masked, zeroMask);
7337 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7340 #if defined(__aarch64__)
7341 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7343 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7348 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7350 int8x8_t res = vbic_s8(masked, zeroMask);
7370 #define _mm_blend_epi16(a, b, imm) \
7372 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7373 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7374 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7375 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7376 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7377 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7378 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7379 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7380 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7381 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7382 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7383 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7389 #define _mm_blend_pd(a, b, imm) \
7391 const uint64_t _mask[2] = { \
7392 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7393 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7394 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7395 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7396 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7397 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7406 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7407 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7408 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7409 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7410 uint32x4_t mask = vld1q_u32(data);
7444 #if defined(__aarch64__)
7445 float64x2_t a = vreinterpretq_f64_m128d(_a);
7446 float64x2_t b = vreinterpretq_f64_m128d(_b);
7447 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7474 #if defined(__aarch64__)
7475 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7477 double *f = (double *) &a;
7488 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7491 float *f = (float *) &a;
7492 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
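/* When the target advertises __ARM_FEATURE_DIRECTED_ROUNDING (or on AArch64)
 * ceiling/floor map to the directed-rounding vrnd* instructions; otherwise the
 * scalar ceilf/floorf fallback above is applied lane by lane. Sketch:
 *   __m128 r = _mm_ceil_ps(_mm_set1_ps(1.25f));  // every lane becomes 2.0f
 */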
7524 #if defined(__aarch64__)
7532 uint32x4_t swapped = vrev64q_u32(cmp);
7550 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7551 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7568 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7577 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7578 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7587 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7588 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7589 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7606 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7607 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7625 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7635 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7636 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7645 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7646 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7647 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7658 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7659 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7660 #if !SSE2NEON_PRECISE_DP
7661 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7662 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7665 #if !SSE2NEON_PRECISE_DP
7671 #if defined(__aarch64__)
7672 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7673 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7675 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7676 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7679 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7680 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7685 #if defined(__aarch64__)
7686 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7688 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
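/*
 * Illustrative sketch, not part of sse2neon.h: the conditional dot product
 * computed by the _mm_dp_pd code above. Bits 4 and 5 of imm choose which
 * per-lane products enter the sum; bits 0 and 1 choose which result lanes
 * receive that sum (the remaining lanes are zero). Hypothetical helper.
 */
static void dp_pd_reference(const double a[2],
                            const double b[2],
                            int imm,
                            double out[2])
{
    double sum = ((imm & 0x10) ? a[0] * b[0] : 0.0) +
                 ((imm & 0x20) ? a[1] * b[1] : 0.0);
    out[0] = (imm & 0x01) ? sum : 0.0;
    out[1] = (imm & 0x02) ? sum : 0.0;
}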
7703 #if defined(__aarch64__)
7733 (imm & 0x1) ? s : 0,
7734 (imm & 0x2) ? s : 0,
7735 (imm & 0x4) ? s : 0,
7736 (imm & 0x8) ? s : 0,
7744 #define _mm_extract_epi32(a, imm) \
7745 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7750 #define _mm_extract_epi64(a, imm) \
7751 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7757 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7761 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
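/*
 * Illustrative sketch, not part of sse2neon.h: _mm_extract_ps returns the raw
 * IEEE-754 bit pattern of the selected lane as an int, which is why the macro
 * above reinterprets the vector as s32 before vgetq_lane_s32. Hypothetical
 * helper; needs <string.h>.
 */
static int extract_ps_reference(const float v[4], int lane)
{
    int bits;
    memcpy(&bits, &v[lane], sizeof(bits)); /* type-pun without UB */
    return bits;
}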
7769 #if defined(__aarch64__)
7770 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7772 double *f = (double *) &a;
7783 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7786 float *f = (float *) &a;
7787 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7819 #define _mm_insert_epi32(a, b, imm) \
7821 vreinterpretq_m128i_s32( \
7822 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7829 #define _mm_insert_epi64(a, b, imm) \
7831 vreinterpretq_m128i_s64( \
7832 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7839 #define _mm_insert_epi8(a, b, imm) \
7841 vreinterpretq_m128i_s8( \
7842 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7849 #define _mm_insert_ps(a, b, imm8) \
7851 float32x4_t tmp1 = \
7852 vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7853 vreinterpretq_f32_m128(a), 0); \
7854 float32x4_t tmp2 = \
7855 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7856 ((imm8 >> 4) & 0x3)); \
7857 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7858 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7859 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7860 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7861 uint32x4_t mask = vld1q_u32(data); \
7862 float32x4_t all_zeros = vdupq_n_f32(0); \
7864 vreinterpretq_m128_f32( \
7865 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
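/*
 * Illustrative sketch, not part of sse2neon.h: the imm8 decoding used by the
 * _mm_insert_ps emulation above. Bits 7:6 select the source lane of b, bits
 * 5:4 the destination lane in a, and bits 3:0 zero out result lanes.
 * Hypothetical helper.
 */
static void insert_ps_reference(const float a[4],
                                const float b[4],
                                int imm8,
                                float out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = a[i];
    out[(imm8 >> 4) & 0x3] = b[(imm8 >> 6) & 0x3]; /* insert selected lane */
    for (int i = 0; i < 4; i++)
        if (imm8 & (1 << i))
            out[i] = 0.0f; /* apply zero mask */
}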
7975 uint16_t min, idx = 0;
7977 #if defined(__aarch64__)
7992 for (i = 0; i < 8; i++) {
8019 switch (imm & 0x4) {
8029 #if defined(__GNUC__) || defined(__clang__)
8030 __builtin_unreachable();
8035 switch (imm & 0x3) {
8037 _b = vreinterpretq_u8_u32(
8041 _b = vreinterpretq_u8_u32(
8045 _b = vreinterpretq_u8_u32(
8049 _b = vreinterpretq_u8_u32(
8053 #if defined(__GNUC__) || defined(__clang__)
8054 __builtin_unreachable();
8059 int16x8_t c04, c15, c26, c37;
8060 uint8x8_t low_b = vget_low_u8(_b);
8061 c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8062 _a = vextq_u8(_a, _a, 1);
8063 c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8064 _a = vextq_u8(_a, _a, 1);
8065 c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8066 _a = vextq_u8(_a, _a, 1);
8067 c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8068 #if defined(__aarch64__)
8070 c04 = vpaddq_s16(c04, c26);
8072 c15 = vpaddq_s16(c15, c37);
8075 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8077 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8079 vreinterpretq_s16_s32(trn2_c)));
8081 int16x4_t c01, c23, c45, c67;
8082 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8083 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8084 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8085 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8088 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
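/*
 * Illustrative sketch, not part of sse2neon.h: the eight 4-byte sums of
 * absolute differences that the NEON code above assembles with vsubl/vabs and
 * pairwise adds. imm bit 2 selects the 4-byte-aligned offset into a, and imm
 * bits 1:0 select the 4-byte group of b. Hypothetical helper; needs <stdint.h>.
 */
static void mpsadbw_reference(const uint8_t a[16],
                              const uint8_t b[16],
                              int imm,
                              uint16_t out[8])
{
    int a_off = ((imm >> 2) & 1) * 4;
    int b_off = (imm & 3) * 4;
    for (int j = 0; j < 8; j++) {
        uint16_t sum = 0;
        for (int k = 0; k < 4; k++) {
            int d = (int) a[a_off + j + k] - (int) b[b_off + k];
            sum += (uint16_t) (d < 0 ? -d : d);
        }
        out[j] = sum;
    }
}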
8138 #if defined(__aarch64__)
8141 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8147 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8149 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8152 double *v_double = (double *) &a;
8158 for (int i = 0; i < 2; i++) {
8159 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8160 double roundDown = floor(tmp);
8161 double roundUp = ceil(tmp);
8162 double diffDown = tmp - roundDown;
8163 double diffUp = roundUp - tmp;
8164 if (diffDown < diffUp) {
8167 } else if (diffDown > diffUp) {
8173 double half = roundDown / 2;
8174 if (half != floor(half)) {
8184 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8196 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8197 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
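/*
 * Illustrative sketch, not part of sse2neon.h: the ties-to-even rounding that
 * the nearest-integer fallback path above performs per lane when vrndnq_f64
 * is unavailable. Hypothetical helper; needs <math.h>.
 */
static double round_half_to_even(double x)
{
    double lo = floor(x), hi = ceil(x);
    if (x - lo < hi - x)
        return lo; /* closer to the lower integer */
    if (x - lo > hi - x)
        return hi; /* closer to the upper integer */
    return (fmod(lo, 2.0) == 0.0) ? lo : hi; /* exact tie: pick the even one */
}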
8207 #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
8221 float *v_float = (float *) &a;
8226 uint32x4_t signmask = vdupq_n_u32(0x80000000);
8229 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8231 int32x4_t r_trunc = vcvtq_s32_f32(
8233 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8234 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
8235 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8237 float32x4_t delta = vsubq_f32(
8239 vcvtq_f32_s32(r_trunc));
8240 uint32x4_t is_delta_half =
8241 vceqq_f32(delta, half);
8243 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8253 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8254 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8255 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8256 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
8299 #if __has_builtin(__builtin_nontemporal_store)
8300 return __builtin_nontemporal_load(p);
8311 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
8320 int64x2_t a_and_mask =
8322 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
8337 uint64x2_t result = vandq_u64(zf, cf);
8338 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
8351 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8360 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
8371 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
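/*
 * Illustrative sketch, not part of sse2neon.h: the flag semantics behind the
 * test helpers above. PTEST sets ZF when (a & b) is all zero and CF when
 * (~a & b) is all zero; _mm_testz_si128 returns ZF, _mm_testc_si128 returns
 * CF, and _mm_test_mix_ones_zeros reports ZF == 0 && CF == 0. Hypothetical
 * helpers; need <stdint.h>.
 */
static int testz_reference(const uint64_t a[2], const uint64_t b[2])
{
    return ((a[0] & b[0]) | (a[1] & b[1])) == 0; /* ZF */
}

static int testc_reference(const uint64_t a[2], const uint64_t b[2])
{
    return ((~a[0] & b[0]) | (~a[1] & b[1])) == 0; /* CF */
}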
8380 #if defined(__aarch64__)
8395 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8396 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8399 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8400 crc = __crc32ch(crc, v);
8413 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8414 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8417 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8418 crc = __crc32cw(crc, v);
8431 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8432 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8437 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8447 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8448 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8451 #elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
8452 crc = __crc32cb(crc, v);
8455 for (int bit = 0; bit < 8; bit++) {
8457 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
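/*
 * Illustrative sketch, not part of sse2neon.h: the bit-at-a-time CRC-32C
 * update used above when the ARMv8 CRC32 extension is unavailable; 0x82f63b78
 * is the reflected Castagnoli polynomial. Hypothetical helper; needs
 * <stdint.h>.
 */
static uint32_t crc32c_byte_reference(uint32_t crc, uint8_t v)
{
    crc ^= v;
    for (int bit = 0; bit < 8; bit++)
        crc = (crc & 1) ? (crc >> 1) ^ UINT32_C(0x82f63b78) : (crc >> 1);
    return crc;
}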
8467 #if !defined(__ARM_FEATURE_CRYPTO)
8469 #define SSE2NEON_AES_DATA(w) \
8471 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8472 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8473 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8474 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8475 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8476 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8477 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8478 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8479 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8480 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8481 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8482 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8483 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8484 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8485 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8486 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8487 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8488 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8489 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8490 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8491 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8492 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8493 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8494 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8495 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8496 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8497 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8498 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8499 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8500 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8501 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8502 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8503 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8504 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8505 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8506 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8507 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8512 #define SSE2NEON_AES_H0(x) (x)
8514 #undef SSE2NEON_AES_H0
8524 #if defined(__aarch64__)
8525 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8526 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8527 0xc, 0x1, 0x6, 0xb};
8528 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8529 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8535 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8544 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8545 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8546 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
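/*
 * Illustrative sketch, not part of sse2neon.h: the GF(2^8) doubling ("xtime")
 * that the vector expression above applies to every byte at once. Shifting
 * left by one and conditionally XOR-ing 0x1b reduces the result modulo the
 * AES polynomial x^8 + x^4 + x^3 + x + 1. Hypothetical helper; needs
 * <stdint.h>.
 */
static uint8_t aes_xtime_reference(uint8_t x)
{
    return (uint8_t) ((x << 1) ^ ((x >> 7) ? 0x1b : 0x00));
}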
8552 #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8553 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8554 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8555 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b ))
8556 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8557 #define SSE2NEON_AES_U0(p) \
8558 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8559 #define SSE2NEON_AES_U1(p) \
8560 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8561 #define SSE2NEON_AES_U2(p) \
8562 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8563 #define SSE2NEON_AES_U3(p) \
8564 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8565 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8571 #undef SSE2NEON_AES_B2W
8572 #undef SSE2NEON_AES_F2
8573 #undef SSE2NEON_AES_F3
8574 #undef SSE2NEON_AES_U0
8575 #undef SSE2NEON_AES_U1
8576 #undef SSE2NEON_AES_U2
8577 #undef SSE2NEON_AES_U3
8585 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8586 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8587 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8588 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8589 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8590 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8591 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8592 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8622 for (int i = 0; i < 16; i++)
8638 for (int i = 0; i < 4; ++i) {
8643 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8645 #undef SSE2NEON_AES_DATA
8676 u8[0x4], u8[0x1], u8[0xE], u8[0xB],
8677 u8[0x1], u8[0xE], u8[0xB], u8[0x4],
8678 u8[0xC], u8[0x9], u8[0x6], u8[0x3],
8679 u8[0x9], u8[0x6], u8[0x3], u8[0xC],
8681 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8695 switch (imm & 0x11) {
8717 #if defined(__aarch64__)
8724 #if defined(__aarch64__)
8725 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
8727 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
8738 #if defined(__aarch64__)
8739 #if __has_builtin(__builtin_popcount)
8740 return __builtin_popcount(a);
8742 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8746 uint8x8_t input_val, count8x8_val;
8747 uint16x4_t count16x4_val;
8748 uint32x2_t count32x2_val;
8750 input_val = vld1_u8((uint8_t *) &a);
8751 count8x8_val = vcnt_u8(input_val);
8752 count16x4_val = vpaddl_u8(count8x8_val);
8753 count32x2_val = vpaddl_u16(count16x4_val);
8755 vst1_u32(&count, count32x2_val);
8765 #if defined(__aarch64__)
8766 #if __has_builtin(__builtin_popcountll)
8767 return __builtin_popcountll(a);
8769 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8773 uint8x8_t input_val, count8x8_val;
8774 uint16x4_t count16x4_val;
8775 uint32x2_t count32x2_val;
8776 uint64x1_t count64x1_val;
8778 input_val = vld1_u8((uint8_t *) &a);
8779 count8x8_val = vcnt_u8(input_val);
8780 count16x4_val = vpaddl_u8(count8x8_val);
8781 count32x2_val = vpaddl_u16(count16x4_val);
8782 count64x1_val = vpaddl_u32(count32x2_val);
8783 vst1_u64(&count, count64x1_val);
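/*
 * Illustrative sketch, not part of sse2neon.h: the value computed by the
 * vcnt_u8 + vpaddl widening chain above, i.e. a plain population count of the
 * 64-bit argument. Hypothetical helper; needs <stdint.h>.
 */
static int popcnt_u64_reference(uint64_t a)
{
    int count = 0;
    while (a) {
        a &= a - 1; /* clear the lowest set bit */
        count++;
    }
    return count;
}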
8794 #if defined(__aarch64__)
8801 #if defined(__aarch64__)
8802 __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value));
8804 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
8809 #if defined(__aarch64__)
8810 __asm__ __volatile__("msr FPCR, %0" :: "r"(r));
8812 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
8821 #if defined(__aarch64__)
8830 asm volatile("mrs %0, cntvct_el0" : "=r"(val));
8834 uint32_t pmccntr, pmuseren, pmcntenset;
8837 asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
8839 asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
8840 if (pmcntenset & 0x80000000UL) {
8841 asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
8843 return (uint64_t) (pmccntr) << 6;
8850 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
8854 #if defined(__GNUC__) || defined(__clang__)
8855 #pragma pop_macro("ALIGN_STRUCT")
8856 #pragma pop_macro("FORCE_INLINE")
8859 #if defined(__GNUC__) && !defined(__clang__)
8860 #pragma GCC pop_options