From 9896cc4e67b80e81e8dc496f73ed0ca25953cbb868e3ca53ec3d5ddc92d3c58a Mon Sep 17 00:00:00 2001 From: Callum Farmer Date: Thu, 3 Jun 2021 12:10:39 +0000 Subject: [PATCH] OBS-URL: https://build.opensuse.org/package/show/network:chromium/chromium?expand=0&rev=1566 --- chromium-91-highway-gcc-arm.patch | 2369 ----------------------------- 1 file changed, 2369 deletions(-) delete mode 100644 chromium-91-highway-gcc-arm.patch diff --git a/chromium-91-highway-gcc-arm.patch b/chromium-91-highway-gcc-arm.patch deleted file mode 100644 index 7ca2319..0000000 --- a/chromium-91-highway-gcc-arm.patch +++ /dev/null @@ -1,2369 +0,0 @@ ---- a/third_party/highway/src/hwy/ops/arm_neon-inl.h -+++ b/third_party/highway/src/hwy/ops/arm_neon-inl.h -@@ -26,6 +26,8 @@ - namespace hwy { - namespace HWY_NAMESPACE { - -+namespace detail { // for code folding and Raw128 -+ - // Macros used to define single and double function calls for multiple types - // for full and half vectors. These macros are undefined at the end of the file. - -@@ -133,7 +135,7 @@ - HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) - - // float and double --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ - HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ -@@ -181,7 +183,7 @@ - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) - - // Emulation of some intrinsics on armv7. --#if !defined(__aarch64__) -+#if HWY_ARCH_ARM_V7 - #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] - #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] - #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] -@@ -294,7 +296,7 @@ - using type = float32x4_t; - }; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - template <> - struct Raw128 { - using type = float64x2_t; -@@ -352,7 +354,7 @@ - using type = float32x2_t; - }; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - template <> - struct Raw128 { - using type = float64x1_t; -@@ -437,12 +439,14 @@ - using type = int8x8_t; - }; - -+} // namespace detail -+ - template - using Full128 = Simd; - - template - class Vec128 { -- using Raw = typename Raw128::type; -+ using Raw = typename detail::Raw128::type; - - public: - HWY_INLINE Vec128() {} -@@ -480,7 +484,8 @@ - // FF..FF or 0, also for floating-point - see README. - template - class Mask128 { -- using Raw = typename Raw128::type; -+ // ACLE intrinsics return and expect unsigned type. -+ using Raw = typename detail::Raw128, N>::type; - - public: - HWY_INLINE Mask128() {} -@@ -573,7 +578,7 @@ - Vec128 v) { - return Vec128(vreinterpret_s64_u8(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, - Vec128 v) { - return Vec128(vreinterpret_f64_u8(v.raw)); -@@ -615,7 +620,7 @@ - return Vec128(vreinterpretq_s64_u8(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, - Vec128 v) { - return Vec128(vreinterpretq_f64_u8(v.raw)); -@@ -664,15 +669,25 @@ - HWY_INLINE Vec128 Undefined(Simd /*d*/) { - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") -- typename Raw128::type a; -+ typename detail::Raw128::type a; - return Vec128(a); - HWY_DIAGNOSTICS(pop) - } - --// ------------------------------ Extract lane -+// Returns a vector with lane i=[0, N) set to "first" + i. 
-+template -+Vec128 Iota(const Simd d, const T2 first) { -+ HWY_ALIGN T lanes[16 / sizeof(T)]; -+ for (size_t i = 0; i < 16 / sizeof(T); ++i) { -+ lanes[i] = static_cast(first + static_cast(i)); -+ } -+ return Load(d, lanes); -+} -+ -+// ------------------------------ GetLane - - HWY_INLINE uint8_t GetLane(const Vec128 v) { -- return vget_lane_u8(vget_low_u8(v.raw), 0); -+ return vgetq_lane_u8(v.raw, 0); - } - template - HWY_INLINE uint8_t GetLane(const Vec128 v) { -@@ -680,7 +695,7 @@ - } - - HWY_INLINE int8_t GetLane(const Vec128 v) { -- return vget_lane_s8(vget_low_s8(v.raw), 0); -+ return vgetq_lane_s8(v.raw, 0); - } - template - HWY_INLINE int8_t GetLane(const Vec128 v) { -@@ -688,7 +703,7 @@ - } - - HWY_INLINE uint16_t GetLane(const Vec128 v) { -- return vget_lane_u16(vget_low_u16(v.raw), 0); -+ return vgetq_lane_u16(v.raw, 0); - } - template - HWY_INLINE uint16_t GetLane(const Vec128 v) { -@@ -696,7 +711,7 @@ - } - - HWY_INLINE int16_t GetLane(const Vec128 v) { -- return vget_lane_s16(vget_low_s16(v.raw), 0); -+ return vgetq_lane_s16(v.raw, 0); - } - template - HWY_INLINE int16_t GetLane(const Vec128 v) { -@@ -704,7 +719,7 @@ - } - - HWY_INLINE uint32_t GetLane(const Vec128 v) { -- return vget_lane_u32(vget_low_u32(v.raw), 0); -+ return vgetq_lane_u32(v.raw, 0); - } - template - HWY_INLINE uint32_t GetLane(const Vec128 v) { -@@ -712,7 +727,7 @@ - } - - HWY_INLINE int32_t GetLane(const Vec128 v) { -- return vget_lane_s32(vget_low_s32(v.raw), 0); -+ return vgetq_lane_s32(v.raw, 0); - } - template - HWY_INLINE int32_t GetLane(const Vec128 v) { -@@ -720,20 +735,20 @@ - } - - HWY_INLINE uint64_t GetLane(const Vec128 v) { -- return vget_lane_u64(vget_low_u64(v.raw), 0); -+ return vgetq_lane_u64(v.raw, 0); - } - HWY_INLINE uint64_t GetLane(const Vec128 v) { - return vget_lane_u64(v.raw, 0); - } - HWY_INLINE int64_t GetLane(const Vec128 v) { -- return vget_lane_s64(vget_low_s64(v.raw), 0); -+ return vgetq_lane_s64(v.raw, 0); - } - HWY_INLINE int64_t GetLane(const Vec128 v) { - return vget_lane_s64(v.raw, 0); - } - - HWY_INLINE float GetLane(const Vec128 v) { -- return vget_lane_f32(vget_low_f32(v.raw), 0); -+ return vgetq_lane_f32(v.raw, 0); - } - HWY_INLINE float GetLane(const Vec128 v) { - return vget_lane_f32(v.raw, 0); -@@ -741,9 +756,9 @@ - HWY_INLINE float GetLane(const Vec128 v) { - return vget_lane_f32(v.raw, 0); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE double GetLane(const Vec128 v) { -- return vget_lane_f64(vget_low_f64(v.raw), 0); -+ return vgetq_lane_f64(v.raw, 0); - } - HWY_INLINE double GetLane(const Vec128 v) { - return vget_lane_f64(v.raw, 0); -@@ -785,8 +800,6 @@ - // ------------------------------ Average - - // Returns (a + b + 1) / 2 -- --// Unsigned - HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) - HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) - -@@ -802,6 +815,7 @@ - HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_s32(v.raw)); - } -+// i64 is implemented after BroadcastSignBit. 
- HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_f32(v.raw)); - } -@@ -823,7 +837,7 @@ - return Vec128(vabs_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_f64(v.raw)); - } -@@ -839,7 +853,7 @@ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below - - HWY_INLINE Vec128 Neg(const Vec128 v) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return Vec128(vneg_s64(v.raw)); - #else - return Zero(Simd()) - v; -@@ -847,7 +861,7 @@ - } - - HWY_INLINE Vec128 Neg(const Vec128 v) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return Vec128(vnegq_s64(v.raw)); - #else - return Zero(Full128()) - v; -@@ -876,6 +890,16 @@ - - // ------------------------------ Shl - -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); -+} -+template -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); -+} -+ - HWY_INLINE Vec128 operator<<(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); -@@ -905,6 +929,16 @@ - return Vec128(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); - } - -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_s8(v.raw, bits.raw)); -+} -+template -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_s8(v.raw, bits.raw)); -+} -+ - HWY_INLINE Vec128 operator<<(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_s16(v.raw, bits.raw)); -@@ -936,6 +970,18 @@ - - // ------------------------------ Shr (Neg) - -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; -+ return Vec128(vshlq_u8(v.raw, neg_bits)); -+} -+template -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; -+ return Vec128(vshl_u8(v.raw, neg_bits)); -+} -+ - HWY_INLINE Vec128 operator>>(const Vec128 v, - const Vec128 bits) { - const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; -@@ -971,6 +1017,16 @@ - return Vec128(vshl_u64(v.raw, neg_bits)); - } - -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); -+} -+template -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_s8(v.raw, Neg(bits).raw)); -+} -+ - HWY_INLINE Vec128 operator>>(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); -@@ -1059,7 +1115,7 @@ - HWY_INLINE Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - int32x4_t rhi = vmull_high_s16(a.raw, b.raw); - #else - int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); -@@ -1070,7 +1126,7 @@ - HWY_INLINE Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); - #else - uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); -@@ -1139,24 +1195,37 @@ - return Vec128(vrecpe_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - 
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) - #else --// Emulated with approx reciprocal + Newton-Raphson + mul -+// Not defined on armv7: approximate -+namespace detail { -+ -+HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( -+ const Vec128 recip, const Vec128 divisor) { -+ return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); -+} -+template -+HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( -+ const Vec128 recip, Vec128 divisor) { -+ return Vec128(vrecps_f32(recip.raw, divisor.raw)); -+} -+ -+} // namespace detail -+ - template - HWY_INLINE Vec128 operator/(const Vec128 a, - const Vec128 b) { - auto x = ApproximateReciprocal(b); -- // Newton-Raphson on 1/x - b -- const auto two = Set(Simd(), 2); -- x = x * (two - b * x); -- x = x * (two - b * x); -- x = x * (two - b * x); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); - return a * x; - } - #endif - --// Absolute value of difference. -+// ------------------------------ Absolute value of difference. -+ - HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { - return Vec128(vabdq_f32(a.raw, b.raw)); - } -@@ -1169,7 +1238,7 @@ - // ------------------------------ Floating-point multiply-add variants - - // Returns add + mul * x --#if defined(__aarch64__) -+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 - template - HWY_INLINE Vec128 MulAdd(const Vec128 mul, - const Vec128 x, -@@ -1180,6 +1249,17 @@ - const Vec128 add) { - return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); - } -+#else -+// Emulate FMA for floats. -+template -+HWY_INLINE Vec128 MulAdd(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 add) { -+ return mul * x + add; -+} -+#endif -+ -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 MulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { -@@ -1190,18 +1270,10 @@ - const Vec128 add) { - return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); - } --#else --// Emulate FMA for floats. --template --HWY_INLINE Vec128 MulAdd(const Vec128 mul, -- const Vec128 x, -- const Vec128 add) { -- return mul * x + add; --} - #endif - - // Returns add - mul * x --#if defined(__aarch64__) -+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 - template - HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, - const Vec128 x, -@@ -1213,7 +1285,17 @@ - const Vec128 add) { - return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); - } -+#else -+// Emulate FMA for floats. -+template -+HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 add) { -+ return add - mul * x; -+} -+#endif - -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { -@@ -1224,14 +1306,6 @@ - const Vec128 add) { - return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); - } --#else --// Emulate FMA for floats. 
--template --HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, -- const Vec128 x, -- const Vec128 add) { -- return add - mul * x; --} - #endif - - // Returns mul * x - sub -@@ -1241,12 +1315,6 @@ - const Vec128 sub) { - return MulAdd(mul, x, Neg(sub)); - } --template --HWY_INLINE Vec128 MulSub(const Vec128 mul, -- const Vec128 x, -- const Vec128 sub) { -- return MulAdd(mul, x, Neg(sub)); --} - - // Returns -mul * x - sub - template -@@ -1255,14 +1323,23 @@ - const Vec128 sub) { - return Neg(MulAdd(mul, x, sub)); - } -+ -+#if HWY_ARCH_ARM_A64 -+template -+HWY_INLINE Vec128 MulSub(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 sub) { -+ return MulAdd(mul, x, Neg(sub)); -+} - template - HWY_INLINE Vec128 NegMulSub(const Vec128 mul, - const Vec128 x, - const Vec128 sub) { - return Neg(MulAdd(mul, x, sub)); - } -+#endif - --// ------------------------------ Floating-point square root -+// ------------------------------ Floating-point square root (IfThenZeroElse) - - // Approximate reciprocal square root - HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { -@@ -1275,80 +1352,36 @@ - } - - // Full precision square root --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) - #else --// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. --template --HWY_INLINE Vec128 Sqrt(const Vec128 v) { -- auto b = v; -- auto Y = ApproximateReciprocalSqrt(v); -- auto x = v * Y; -- const auto half = Set(Simd(), 0.5); -- const auto oneandhalf = Set(Simd(), 1.5); -- for (size_t i = 0; i < 3; i++) { -- b = b * Y * Y; -- Y = oneandhalf - half * b; -- x = x * Y; -- } -- return IfThenZeroElse(v == Zero(Simd()), x); --} --#endif -- --// ================================================== COMPARE -- --// Comparisons fill a lane with 1-bits if the condition is true, else 0. -+namespace detail { - --template --HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { -- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); -- return Mask128{m.raw}; -+HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, -+ const Vec128 recip) { -+ return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); -+} -+template -+HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, -+ Vec128 recip) { -+ return Vec128(vrsqrts_f32(root.raw, recip.raw)); - } - --#define HWY_NEON_BUILD_TPL_HWY_COMPARE --#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 --#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ -- const Vec128 a, const Vec128 b --#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw -- --// ------------------------------ Equality --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) --#else --// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
--HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) --HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) --#endif -+} // namespace detail - --// ------------------------------ Strict inequality -+// Not defined on armv7: approximate -+template -+HWY_INLINE Vec128 Sqrt(const Vec128 v) { -+ auto recip = ApproximateReciprocalSqrt(v); - --// Signed/float < (no unsigned) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) --#else --HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) --#endif --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); - --// Signed/float > (no unsigned) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) --#else --HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) -+ const auto root = v * recip; -+ return IfThenZeroElse(v == Zero(Simd()), root); -+} - #endif --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) -- --// ------------------------------ Weak inequality -- --// Float <= >= --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) -- --#undef HWY_NEON_BUILD_TPL_HWY_COMPARE --#undef HWY_NEON_BUILD_RET_HWY_COMPARE --#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE --#undef HWY_NEON_BUILD_ARG_HWY_COMPARE - - // ================================================== LOGICAL - -@@ -1357,13 +1390,16 @@ - // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. - template - HWY_INLINE Vec128 Not(const Vec128 v) { -- const Full128 d8; -- return Vec128(vmvnq_u8(BitCast(d8, v).raw)); -+ const Full128 d; -+ const Repartition d8; -+ return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); - } - template - HWY_INLINE Vec128 Not(const Vec128 v) { -- const Repartition> d8; -- return Vec128(vmvn_u8(BitCast(d8, v).raw)); -+ const Simd d; -+ const Repartition d8; -+ using V8 = decltype(Zero(d8)); -+ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); - } - - // ------------------------------ And -@@ -1463,33 +1499,38 @@ - return ShiftRight(v); - } - --// ------------------------------ Make mask -+// ================================================== MASK - --template --HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { -- static_assert(!hwy::IsFloat(), "Only integer vectors supported"); -- return (v & bit) == bit; --} -+// ------------------------------ To/from vector - --// Mask and Vec are the same (true = FF..FF). -+// Mask and Vec have the same representation (true = FF..FF). 
- template - HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { -- return Mask128(v.raw); -+ const Simd, N> du; -+ return Mask128(BitCast(du, v).raw); - } - -+// DEPRECATED - template - HWY_INLINE Vec128 VecFromMask(const Mask128 v) { -- return Vec128(v.raw); -+ return BitCast(Simd(), Vec128, N>(v.raw)); - } - - template --HWY_INLINE Vec128 VecFromMask(Simd /* tag */, -- const Mask128 v) { -- return Vec128(v.raw); -+HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { -+ return BitCast(d, Vec128, N>(v.raw)); -+} -+ -+// ------------------------------ RebindMask -+ -+template -+HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { -+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); -+ return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); - } - --// IfThenElse(mask, yes, no) --// Returns mask ? b : a. -+// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. -+ - #define HWY_NEON_BUILD_TPL_HWY_IF - #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 - #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ -@@ -1524,7 +1565,6 @@ - return Max(zero, v); - } - -- - // ------------------------------ Mask logical - - template -@@ -1557,11 +1597,199 @@ - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); - } - -+// ================================================== COMPARE -+ -+// Comparisons fill a lane with 1-bits if the condition is true, else 0. -+ -+// ------------------------------ Shuffle2301 (for i64 compares) -+ -+// Swap 32-bit halves in 64-bits -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_u32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_s32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_f32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_u32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_s32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_f32(v.raw)); -+} -+ -+// Intrinsics return unsigned mask, and our macros do not support casting, -+// so the intrinsics reside in detail and are called from a wrapper. -+namespace detail { -+ -+#define HWY_NEON_BUILD_TPL_HWY_COMPARE -+#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 -+#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ -+ const Vec128 a, const Vec128 b -+#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw -+ -+// ------------------------------ Equality -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Eq, vceq, _, HWY_COMPARE) -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS_UINTS(Eq, vceq, _, HWY_COMPARE) -+#else -+// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
-+HWY_NEON_DEF_FUNCTION_INT_8_16_32(Eq, vceq, _, HWY_COMPARE) -+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Eq, vceq, _, HWY_COMPARE) -+#endif -+ -+// ------------------------------ Strict inequality (signed, float) -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS(Lt, vclt, _, HWY_COMPARE) -+#else -+HWY_NEON_DEF_FUNCTION_INT_8_16_32(Lt, vclt, _, HWY_COMPARE) -+#endif -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Lt, vclt, _, HWY_COMPARE) -+ -+// ------------------------------ Weak inequality (float) -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Le, vcle, _, HWY_COMPARE) -+ -+#undef HWY_NEON_BUILD_TPL_HWY_COMPARE -+#undef HWY_NEON_BUILD_RET_HWY_COMPARE -+#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE -+#undef HWY_NEON_BUILD_ARG_HWY_COMPARE -+ -+} // namespace detail -+ -+// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) -+ -+#if HWY_ARCH_ARM_V7 -+namespace detail { -+ -+template -+HWY_INLINE Mask128 Eq(const Vec128 a, -+ const Vec128 b) { -+ const Simd d32; -+ const Simd d64; -+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); -+ const auto cmp64 = cmp32 & Shuffle2301(cmp32); -+ return MaskFromVec(BitCast(d64, cmp64)); -+} -+ -+template -+HWY_INLINE Mask128 Eq(const Vec128 a, -+ const Vec128 b) { -+ const Simd d32; -+ const Simd d64; -+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); -+ const auto cmp64 = cmp32 & Shuffle2301(cmp32); -+ return MaskFromVec(BitCast(d64, cmp64)); -+} -+ -+HWY_INLINE Mask128 Lt(const Vec128 a, -+ const Vec128 b) { -+ const int64x2_t sub = vqsubq_s64(a.raw, b.raw); -+ return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+} -+HWY_INLINE Mask128 Lt(const Vec128 a, -+ const Vec128 b) { -+ const int64x1_t sub = vqsub_s64(a.raw, b.raw); -+ return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+} -+ -+} // namespace detail -+#endif -+ -+// ------------------------------ Comparison wrapper -+ -+template -+HWY_API Mask128 operator==(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Eq(a, b)); -+} -+ -+template -+HWY_API Mask128 operator<(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Lt(a, b)); -+} -+ -+template -+HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Le(a, b)); -+} -+ -+// Swapped operand order -+template -+HWY_API Mask128 operator>(Vec128 a, Vec128 b) { -+ return operator<(b, a); -+} -+template -+HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { -+ return operator<=(b, a); -+} -+ -+// ------------------------------ FirstN (Iota, Lt) -+ -+template -+HWY_API Mask128 FirstN(const Simd d, size_t num) { -+ const RebindToSigned di; // Signed comparisons are cheaper. 
-+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); -+} -+ -+// ------------------------------ TestBit (Eq) -+ -+#define HWY_NEON_BUILD_TPL_HWY_TESTBIT -+#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 -+#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ -+ Vec128 v, Vec128 bit -+#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw -+ -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) -+#else -+// No 64-bit versions on armv7 -+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) -+HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) -+ -+template -+HWY_INLINE Mask128 TestBit(Vec128 v, -+ Vec128 bit) { -+ return (v & bit) == bit; -+} -+template -+HWY_INLINE Mask128 TestBit(Vec128 v, -+ Vec128 bit) { -+ return (v & bit) == bit; -+} -+ -+#endif -+#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT -+#undef HWY_NEON_BUILD_RET_HWY_TESTBIT -+#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT -+#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT -+ -+// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) -+HWY_INLINE Vec128 Abs(const Vec128 v) { -+#if HWY_ARCH_ARM_A64 -+ return Vec128(vabsq_s64(v.raw)); -+#else -+ const auto zero = Zero(Full128()); -+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -+#endif -+} -+HWY_INLINE Vec128 Abs(const Vec128 v) { -+#if HWY_ARCH_ARM_A64 -+ return Vec128(vabs_s64(v.raw)); -+#else -+ const auto zero = Zero(Simd()); -+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -+#endif -+} -+ - // ------------------------------ Min (IfThenElse, BroadcastSignBit) - - namespace detail { - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { - return Vec128(vcgtq_u64(a.raw, b.raw)); -@@ -1588,7 +1816,7 @@ - template - HWY_INLINE Vec128 Min(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); - #else - const Simd du; -@@ -1603,7 +1831,7 @@ - template - HWY_INLINE Vec128 Min(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); - #else - const Vec128 sign = detail::SaturatedSub(a, b); -@@ -1612,7 +1840,7 @@ - } - - // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) - #else - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) -@@ -1626,7 +1854,7 @@ - template - HWY_INLINE Vec128 Max(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); - #else - const Simd du; -@@ -1641,7 +1869,7 @@ - template - HWY_INLINE Vec128 Max(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); - #else - const Vec128 sign = detail::SaturatedSub(a, b); -@@ -1650,7 +1878,7 @@ - } - - // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. 
--#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) - #else - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) -@@ -1696,7 +1924,7 @@ - const float* HWY_RESTRICT aligned) { - return Vec128(vld1q_f32(aligned)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LoadU(Full128 /* tag */, - const double* HWY_RESTRICT aligned) { - return Vec128(vld1q_f64(aligned)); -@@ -1741,7 +1969,7 @@ - const float* HWY_RESTRICT p) { - return Vec128(vld1_f32(p)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LoadU(Simd /* tag */, - const double* HWY_RESTRICT p) { - return Vec128(vld1_f64(p)); -@@ -1755,73 +1983,72 @@ - // we don't actually care what is in it, and we don't want - // to introduce extra overhead by initializing it to something. - --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint8_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u8_u32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint16_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u16_u32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint32_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(p, a, 0); - return Vec128(b); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int8_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s8_s32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int16_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s16_s32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int32_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(p, a, 0); - return Vec128(b); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const float* HWY_RESTRICT p) { -- float32x2_t a = Undefined(d).raw; -+ float32x2_t a = Undefined(Simd()).raw; - float32x2_t b = vld1_lane_f32(p, a, 0); - return Vec128(b); - } - - // ------------------------------ Load 16 - --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint8_t* HWY_RESTRICT p) { -- uint16x4_t a = Undefined(d).raw; -+ uint16x4_t a = Undefined(Simd()).raw; - uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u8_u16(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint16_t* HWY_RESTRICT p) { -- uint16x4_t a = Undefined(d).raw; -+ uint16x4_t a = Undefined(Simd()).raw; - uint16x4_t b = vld1_lane_u16(p, a, 0); - return Vec128(b); - } -- --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int8_t* HWY_RESTRICT p) { -- int16x4_t a = Undefined(d).raw; -+ int16x4_t a = Undefined(Simd()).raw; - int16x4_t b = 
vld1_lane_s16(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s8_s16(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int16_t* HWY_RESTRICT p) { -- int16x4_t a = Undefined(d).raw; -+ int16x4_t a = Undefined(Simd()).raw; - int16x4_t b = vld1_lane_s16(p, a, 0); - return Vec128(b); - } -@@ -1902,7 +2129,7 @@ - float* HWY_RESTRICT aligned) { - vst1q_f32(aligned, v.raw); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE void StoreU(const Vec128 v, Full128 /* tag */, - double* HWY_RESTRICT aligned) { - vst1q_f64(aligned, v.raw); -@@ -1947,7 +2174,7 @@ - float* HWY_RESTRICT p) { - vst1_f32(p, v.raw); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE void StoreU(const Vec128 v, Simd /* tag */, - double* HWY_RESTRICT p) { - vst1_f64(p, v.raw); -@@ -1959,12 +2186,12 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint8_t* HWY_RESTRICT p) { - uint32x2_t a = vreinterpret_u32_u8(v.raw); -- vst1_lane_u32(p, a, 0); -+ vst1_lane_u32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint16_t* HWY_RESTRICT p) { - uint32x2_t a = vreinterpret_u32_u16(v.raw); -- vst1_lane_u32(p, a, 0); -+ vst1_lane_u32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint32_t* HWY_RESTRICT p) { -@@ -1973,12 +2200,12 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - int8_t* HWY_RESTRICT p) { - int32x2_t a = vreinterpret_s32_s8(v.raw); -- vst1_lane_s32(p, a, 0); -+ vst1_lane_s32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int16_t* HWY_RESTRICT p) { - int32x2_t a = vreinterpret_s32_s16(v.raw); -- vst1_lane_s32(p, a, 0); -+ vst1_lane_s32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int32_t* HWY_RESTRICT p) { -@@ -1994,7 +2221,7 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint8_t* HWY_RESTRICT p) { - uint16x4_t a = vreinterpret_u16_u8(v.raw); -- vst1_lane_u16(p, a, 0); -+ vst1_lane_u16(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint16_t* HWY_RESTRICT p) { -@@ -2003,7 +2230,7 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - int8_t* HWY_RESTRICT p) { - int16x4_t a = vreinterpret_s16_s8(v.raw); -- vst1_lane_s16(p, a, 0); -+ vst1_lane_s16(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int16_t* HWY_RESTRICT p) { -@@ -2068,18 +2295,18 @@ - const Vec128 v) { - return Vec128(vmovl_u32(v.raw)); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { -- return Vec128(vmovl_u8(v.raw)); -+ return BitCast(d, Vec128(vmovl_u8(v.raw))); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { - uint16x8_t a = vmovl_u8(v.raw); -- return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); -+ return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { -- return Vec128(vmovl_u16(v.raw)); -+ return BitCast(d, Vec128(vmovl_u16(v.raw))); - } - - // Unsigned: zero-extend to half vector. 
-@@ -2105,9 +2332,9 @@ - return Vec128(vget_low_u64(vmovl_u32(v.raw))); - } - template --HWY_INLINE Vec128 PromoteTo(Simd /* tag */, -+HWY_INLINE Vec128 PromoteTo(Simd d, - const Vec128 v) { -- return Vec128(vget_low_s16(vmovl_u8(v.raw))); -+ return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); - } - template - HWY_INLINE Vec128 PromoteTo(Simd /* tag */, -@@ -2170,12 +2397,14 @@ - - HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, - const Vec128 v) { -- return Vec128(vcvt_f32_f16(v.raw)); -+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); -+ return Vec128(f32); - } - template - HWY_INLINE Vec128 PromoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); -+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); -+ return Vec128(vget_low_f32(f32)); - } - - #else -@@ -2204,7 +2433,7 @@ - - #endif - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, - const Vec128 v) { -@@ -2298,12 +2527,13 @@ - - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128{vcvt_f16_f32(v.raw)}; -+ return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; - } - template - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; -+ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); -+ return Vec128(vreinterpret_u16_f16(f16)); - } - - #else -@@ -2339,7 +2569,7 @@ - } - - #endif --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -@@ -2397,7 +2627,7 @@ - const Vec128 v) { - Vec128 a = DemoteTo(Simd(), v); - Vec128 b; -- uint16x8_t c = vcombine_s16(a.raw, b.raw); -+ int16x8_t c = vcombine_s16(a.raw, b.raw); - return Vec128(vqmovn_s16(c)); - } - -@@ -2426,7 +2656,7 @@ - return Vec128(vcvt_s32_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 ConvertTo(Full128 /* tag */, - const Vec128 v) { -@@ -2451,7 +2681,7 @@ - - // ------------------------------ Round (IfThenElse, mask, logical) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Toward nearest integer - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) - -@@ -2472,18 +2702,26 @@ - // representation, clearing the lowest 23-exp mantissa bits. This requires 9 - // integer operations and 3 constants, which is likely more expensive. - -+namespace detail { -+ -+// The original value is already the desired result if NaN or the magnitude is -+// large (i.e. the value is already an integer). -+template -+HWY_API Mask128 UseInt(const Vec128 v) { -+ return Abs(v) < Set(Simd(), MantissaEnd()); -+} -+ -+} // namespace detail -+ - template - HWY_INLINE Vec128 Trunc(const Vec128 v) { - const Simd df; -- const Simd di; -+ const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - -- // The original value is already the desired result if NaN or the magnitude is -- // large (i.e. the value is already an integer). 
-- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f, v); -+ return IfThenElse(detail::UseInt(v), int_f, v); - } - - template -@@ -2506,7 +2744,7 @@ - template - HWY_INLINE Vec128 Ceil(const Vec128 v) { - const Simd df; -- const Simd di; -+ const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); -@@ -2514,9 +2752,7 @@ - // Truncating a positive non-integer ends up smaller; if so, add 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); - -- // Keep original if NaN or the magnitude is large (already an int). -- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f - neg1, v); -+ return IfThenElse(detail::UseInt(v), int_f - neg1, v); - } - - template -@@ -2530,16 +2766,14 @@ - // Truncating a negative non-integer ends up larger; if so, subtract 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); - -- // Keep original if NaN or the magnitude is large (already an int). -- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f + neg1, v); -+ return IfThenElse(detail::UseInt(v), int_f + neg1, v); - } - - #endif - - // ------------------------------ NearestInt (Round) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 NearestInt(const Vec128 v) { - return Vec128(vcvtnq_s32_f32(v.raw)); -@@ -2596,7 +2830,7 @@ - HWY_INLINE Vec128 LowerHalf(const Vec128 v) { - return Vec128(vget_low_f32(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LowerHalf(const Vec128 v) { - return Vec128(vget_low_f64(v.raw)); - } -@@ -2629,7 +2863,7 @@ - HWY_INLINE Vec128 UpperHalf(const Vec128 v) { - return Vec128(vget_high_f32(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 UpperHalf(const Vec128 v) { - return Vec128(vget_high_f64(v.raw)); - } -@@ -2714,7 +2948,7 @@ - - // ------------------------------ Broadcast/splat any lane - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Unsigned - template - HWY_INLINE Vec128 Broadcast(const Vec128 v) { -@@ -2886,7 +3120,7 @@ - const Vec128 from) { - const Full128 d; - const Repartition d8; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, - BitCast(d8, from).raw))); - #else -@@ -2911,33 +3145,58 @@ - BitCast(d8, from).raw))); - } - --// ------------------------------ Hard-coded shuffles -+// ------------------------------ TableLookupLanes - --// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). --// Shuffle0321 rotates one lane to the right (the previous least-significant --// lane is now most-significant). These could also be implemented via --// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. -+// Returned by SetTableIndices for use by TableLookupLanes. 
-+template -+struct Indices128 { -+ typename detail::Raw128::type raw; -+}; - --// Swap 32-bit halves in 64-bits --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_u32(v.raw)); --} --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_s32(v.raw)); --} --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_f32(v.raw)); -+template -+HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { -+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) -+ for (size_t i = 0; i < N; ++i) { -+ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); -+ } -+#endif -+ -+ const Repartition d8; -+ alignas(16) uint8_t control[16] = {0}; -+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { -+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { -+ control[idx_lane * sizeof(T) + idx_byte] = -+ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); -+ } -+ } -+ return Indices128{BitCast(d, Load(d8, control)).raw}; - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_u32(v.raw)); -+ -+template -+HWY_INLINE Vec128 TableLookupLanes( -+ const Vec128 v, const Indices128 idx) { -+ return TableLookupBytes(v, Vec128{idx.raw}); - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_s32(v.raw)); -+template -+HWY_INLINE Vec128 TableLookupLanes( -+ const Vec128 v, const Indices128 idx) { -+ return TableLookupBytes(v, Vec128{idx.raw}); - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_f32(v.raw)); -+template -+HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -+ const Indices128 idx) { -+ const Simd di; -+ const auto idx_i = BitCast(di, Vec128{idx.raw}); -+ return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); - } - -+// ------------------------------ Other shuffles (TableLookupBytes) -+ -+// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -+// Shuffle0321 rotates one lane to the right (the previous least-significant -+// lane is now most-significant). These could also be implemented via -+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. -+ - // Swap 64-bit halves - template - HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { -@@ -2975,49 +3234,6 @@ - return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); - } - --// ------------------------------ TableLookupLanes -- --// Returned by SetTableIndices for use by TableLookupLanes. 
--template --struct Indices128 { -- uint8x16_t raw; --}; -- --template --HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { --#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) -- const size_t N = 16 / sizeof(T); -- for (size_t i = 0; i < N; ++i) { -- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); -- } --#endif -- -- const Full128 d8; -- alignas(16) uint8_t control[16]; -- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { -- const size_t idx_lane = idx_byte / sizeof(T); -- const size_t mod = idx_byte % sizeof(T); -- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; -- } -- return Indices128{Load(d8, control).raw}; --} -- --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- return TableLookupBytes(v, Vec128(idx.raw)); --} --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- return TableLookupBytes(v, Vec128(idx.raw)); --} --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- const Full128 di; -- const Full128 df; -- return BitCast(df, -- TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); --} -- - // ------------------------------ Interleave lanes - - // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -@@ -3029,7 +3245,7 @@ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // For 64 bit types, we only have the "q" version of the function defined as - // interleaving 64-wide registers with 64-wide types in them makes no sense. - HWY_INLINE Vec128 InterleaveLower(const Vec128 a, -@@ -3079,7 +3295,7 @@ - const Vec128 b) { - return Vec128(vzip1q_f32(a.raw, b.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 InterleaveLower(const Vec128 a, - const Vec128 b) { - return Vec128(vzip1q_f64(a.raw, b.raw)); -@@ -3090,10 +3306,10 @@ - const Vec128 b) { - return Vec128(vzip2q_f32(a.raw, b.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s64(a.raw, b.raw)); -+ return Vec128(vzip2q_f64(a.raw, b.raw)); - } - #endif - -@@ -3105,119 +3321,125 @@ - // Full vectors - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u16_u8(vzip2q_u8(a.raw, 
b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw))); - } - - // Half vectors or less - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u8(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u16(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u32(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s8(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s16(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s32(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u8(a.raw, b.raw)); -+ return Vec128(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u16(a.raw, b.raw)); -+ return Vec128(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u32(a.raw, b.raw)); -+ return Vec128(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s8(a.raw, b.raw)); -+ return Vec128(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s16(a.raw, b.raw)); -+ return Vec128(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s32(a.raw, b.raw)); -+ return Vec128(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw))); - } - - // ------------------------------ Blocks -@@ -3274,84 +3496,113 @@ - - // ================================================== MISC - --// Returns a vector with lane i=[0, N) set to "first" + i. 
--template --Vec128 Iota(const Simd d, const T2 first) { -- HWY_ALIGN T lanes[16 / sizeof(T)]; -- for (size_t i = 0; i < 16 / sizeof(T); ++i) { -- lanes[i] = static_cast(first + static_cast(i)); -+// ------------------------------ Scatter (Store) -+ -+template -+HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, -+ const Vec128 offset) { -+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); -+ -+ alignas(16) T lanes[N]; -+ Store(v, d, lanes); -+ -+ alignas(16) Offset offset_lanes[N]; -+ Store(offset, Simd(), offset_lanes); -+ -+ uint8_t* base_bytes = reinterpret_cast(base); -+ for (size_t i = 0; i < N; ++i) { -+ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); - } -- return Load(d, lanes); - } - --// ------------------------------ Gather (requires GetLane) -+template -+HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, -+ const Vec128 index) { -+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); -+ -+ alignas(16) T lanes[N]; -+ Store(v, d, lanes); -+ -+ alignas(16) Index index_lanes[N]; -+ Store(index, Simd(), index_lanes); -+ -+ for (size_t i = 0; i < N; ++i) { -+ base[index_lanes[i]] = lanes[i]; -+ } -+} -+ -+// ------------------------------ Gather (Load/Store) - - template - HWY_API Vec128 GatherOffset(const Simd d, - const T* HWY_RESTRICT base, - const Vec128 offset) { -- static_assert(N == 1, "NEON does not support full gather"); -- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); -- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); -- T val; -- CopyBytes(reinterpret_cast(address), &val); -- return Set(d, val); -+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); -+ -+ alignas(16) Offset offset_lanes[N]; -+ Store(offset, Simd(), offset_lanes); -+ -+ alignas(16) T lanes[N]; -+ const uint8_t* base_bytes = reinterpret_cast(base); -+ for (size_t i = 0; i < N; ++i) { -+ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); -+ } -+ return Load(d, lanes); - } - - template - HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, - const Vec128 index) { -- static_assert(N == 1, "NEON does not support full gather"); -- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); -- return Set(d, base[GetLane(index)]); -+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); -+ -+ alignas(16) Index index_lanes[N]; -+ Store(index, Simd(), index_lanes); -+ -+ alignas(16) T lanes[N]; -+ for (size_t i = 0; i < N; ++i) { -+ lanes[i] = base[index_lanes[i]]; -+ } -+ return Load(d, lanes); - } - --// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) -+// ------------------------------ Reductions - --#if !defined(__aarch64__) -+namespace detail { - --template --HWY_INLINE Mask128 operator==(const Vec128 a, -- const Vec128 b) { -- const Simd d32; -- const Simd d64; -- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); -- const auto cmp64 = cmp32 & Shuffle2301(cmp32); -- return MaskFromVec(BitCast(d64, cmp64)); -+// N=1 for any T: no-op -+template -+HWY_API Vec128 SumOfLanes(const Vec128 v) { -+ return v; - } -- --template --HWY_INLINE Mask128 operator==(const Vec128 a, -- const Vec128 b) { -- const Simd d32; -- const Simd d64; -- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); -- const auto cmp64 = cmp32 & Shuffle2301(cmp32); -- return MaskFromVec(BitCast(d64, cmp64)); -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, -+ const Vec128 v) { -+ 
return v; -+} -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, -+ const Vec128 v) { -+ return v; - } - --HWY_INLINE Mask128 operator<(const Vec128 a, -- const Vec128 b) { -- const int64x2_t sub = vqsubq_s64(a.raw, b.raw); -- return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+// u32/i32/f32: N=2 -+template -+HWY_API Vec128 SumOfLanes(const Vec128 v10) { -+ return v10 + Shuffle2301(v10); - } --HWY_INLINE Mask128 operator<(const Vec128 a, -- const Vec128 b) { -- const int64x1_t sub = vqsub_s64(a.raw, b.raw); -- return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, -+ const Vec128 v10) { -+ return Min(v10, Shuffle2301(v10)); - } -- --template --HWY_INLINE Mask128 operator>(const Vec128 a, -- const Vec128 b) { -- return b < a; -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, -+ const Vec128 v10) { -+ return Max(v10, Shuffle2301(v10)); - } --#endif - --// ------------------------------ Reductions -- --#if defined(__aarch64__) --// Supported for 32b and 64b vector types. Returns the sum in each lane. -+// full vectors -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { - return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); - } -@@ -3398,20 +3649,15 @@ - } - #endif - --namespace detail { -- --// For u32/i32/f32. --template --HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, -- const Vec128 v3210) { -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Min(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); - return Min(v20_31_20_31, v31_20_31_20); - } --template --HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, -- const Vec128 v3210) { -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Max(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); -@@ -3419,15 +3665,13 @@ - } - - // For u64/i64[/f64]. --template --HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, -- const Vec128 v10) { -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Min(v10, v01); - } --template --HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, -- const Vec128 v10) { -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Max(v10, v01); - } -@@ -3435,6 +3679,10 @@ - } // namespace detail - - template -+HWY_API Vec128 SumOfLanes(const Vec128 v) { -+ return detail::SumOfLanes(v); -+} -+template - HWY_API Vec128 MinOfLanes(const Vec128 v) { - return detail::MinOfLanes(hwy::SizeTag(), v); - } -@@ -3457,18 +3705,18 @@ - const Vec128 values = - BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Can't vaddv - we need two separate bytes (16 bits). - const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); - const uint8x8_t x4 = vpadd_u8(x2, x2); - const uint8x8_t x8 = vpadd_u8(x4, x4); -- return vreinterpret_u16_u8(x8)[0]; -+ return vget_lane_u64(vreinterpret_u64_u8(x8), 0); - #else - // Don't have vpaddq, so keep doubling lane size. 
- const uint16x8_t x2 = vpaddlq_u8(values.raw); - const uint32x4_t x4 = vpaddlq_u16(x2); - const uint64x2_t x8 = vpaddlq_u32(x4); -- return (uint64_t(x8[1]) << 8) | x8[0]; -+ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); - #endif - } - -@@ -3484,7 +3732,7 @@ - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u8(values.raw); - #else - const uint16x4_t x2 = vpaddl_u8(values.raw); -@@ -3503,7 +3751,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u16(values.raw); - #else - const uint32x4_t x2 = vpaddlq_u16(values.raw); -@@ -3522,7 +3770,7 @@ - const Simd du; - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u16(values.raw); - #else - const uint32x2_t x2 = vpaddl_u16(values.raw); -@@ -3539,7 +3787,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u32(values.raw); - #else - const uint64x2_t x2 = vpaddlq_u32(values.raw); -@@ -3557,7 +3805,7 @@ - const Simd du; - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u32(values.raw); - #else - const uint64x1_t x2 = vpaddl_u32(values.raw); -@@ -3572,7 +3820,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u64(values.raw); - #else - return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); -@@ -3612,13 +3860,13 @@ - const int8x16_t ones = - vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s8(ones); - #else - const int16x8_t x2 = vpaddlq_s8(ones); - const int32x4_t x4 = vpaddlq_s16(x2); - const int64x2_t x8 = vpaddlq_s32(x4); -- return x8[0] + x8[1]; -+ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); - #endif - } - template -@@ -3627,12 +3875,12 @@ - const int16x8_t ones = - vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s16(ones); - #else - const int32x4_t x2 = vpaddlq_s16(ones); - const int64x2_t x4 = vpaddlq_s32(x2); -- return x4[0] + x4[1]; -+ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); - #endif - } - -@@ -3642,26 +3890,26 @@ - const int32x4_t ones = - vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s32(ones); - #else - const int64x2_t x2 = vpaddlq_s32(ones); -- return x2[0] + x2[1]; -+ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); - #endif - } - - template - HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - const Full128 di; - const int64x2_t ones = - vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); - return vaddvq_s64(ones); - #else -- const Full128 di; -- const int64x2_t ones = -- vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); -- return ones[0] + ones[1]; -+ const Full128 du; -+ const auto mask_u = 
VecFromMask(du, RebindMask(du, mask)); -+ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); -+ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); - #endif - } - -@@ -3690,9 +3938,13 @@ - // Full - template - HWY_INLINE bool AllFalse(const Mask128 m) { -+#if HWY_ARCH_ARM_A64 -+ return (vmaxvq_u32(m.raw) == 0); -+#else - const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); - uint32x2_t a = vqmovn_u64(v64.raw); -- return vreinterpret_u64_u32(a)[0] == 0; -+ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; -+#endif - } - - // Partial -@@ -3711,8 +3963,160 @@ - - namespace detail { - -+// Load 8 bytes, replicate into upper half so ZipLower can use the lower half. -+HWY_INLINE Vec128 Load8Bytes(Full128 /*d*/, -+ const uint8_t* bytes) { -+ return Vec128(vreinterpretq_u8_u64( -+ vld1q_dup_u64(reinterpret_cast(bytes)))); -+} -+ -+// Load 8 bytes and return half-reg with N <= 8 bytes. -+template -+HWY_INLINE Vec128 Load8Bytes(Simd d, -+ const uint8_t* bytes) { -+ return Load(d, bytes); -+} -+ -+template -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, -+ const uint64_t mask_bits) { -+ HWY_DASSERT(mask_bits < 256); -+ const Simd d; -+ const Repartition d8; -+ const Simd du; -+ -+ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte -+ // indices for VTBL (one vector's worth for each of 256 combinations of -+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead -+ // store lane indices and convert to byte indices (2*lane + 0..1), with the -+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane -+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. -+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles -+ // is likely more costly than the higher cache footprint from storing bytes. 
-+ alignas(16) constexpr uint8_t table[256 * 8] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, -+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, -+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, -+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, -+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, -+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, -+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, -+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, -+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, -+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, -+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, -+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, -+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, -+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, -+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, -+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, -+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, -+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, -+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, -+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, -+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, -+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, -+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, -+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, -+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, -+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, -+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, -+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, -+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, -+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, -+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, -+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, -+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, -+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, -+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, -+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, -+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, -+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, -+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, -+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, -+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, -+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, -+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, -+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, -+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, -+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, -+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, -+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, -+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, -+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, -+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, -+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, -+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, -+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, -+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, -+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, -+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, -+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, -+ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, -+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, -+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, -+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, -+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, -+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, -+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, -+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, -+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, -+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, -+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, -+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, -+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, -+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, -+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, -+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, -+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, -+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, -+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, -+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, -+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, -+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, -+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, -+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, -+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, -+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, -+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, -+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, -+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, -+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, -+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, -+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, -+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, -+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, -+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, -+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, -+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, -+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, -+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, -+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, -+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, -+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, -+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, -+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, -+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, -+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, -+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, -+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, -+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, -+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, -+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, -+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, -+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, -+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, -+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; -+ -+ const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); -+ const Vec128 pairs = ZipLower(byte_idx, byte_idx); -+ return BitCast(d, pairs + Set(du, 0x0100)); -+} -+ - 
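
[Editor's note] The comment in the hunk above explains that, lacking an AVX2-style permutevar, the 16-bit Compress path looks up pre-doubled lane indices (2*lane) in a 256-entry table and widens them to byte pairs. The following standalone sketch (not part of the patch or of Highway; the file and names are purely illustrative) shows one way such a table can be generated and how the byte-pair trick works, assuming the little-endian layout NEON uses here.

// compress_table_sketch.cc - illustrative only, mirrors the logic described
// in the patch comment; not Highway code.
#include <cstdint>
#include <cstdio>

int main() {
  static uint8_t table[256 * 8] = {0};
  // For every possible 8-bit mask, emit the indices of the set lanes in
  // ascending order, already doubled (2 * lane) so that converting to byte
  // indices later only needs to add 0 and 1.
  for (uint32_t mask = 0; mask < 256; ++mask) {
    int pos = 0;
    for (uint32_t lane = 0; lane < 8; ++lane) {
      if (mask & (1u << lane)) {
        table[mask * 8 + pos++] = static_cast<uint8_t>(2 * lane);
      }
    }
  }
  // Spot check against the literal table in the patch: mask 0b101 selects
  // lanes 0 and 2, so its row begins 0, 4.
  printf("%d %d\n", int(table[5 * 8 + 0]), int(table[5 * 8 + 1]));
  // Byte-pair formation as in the hunk: duplicating an entry into both bytes
  // of a 16-bit lane (ZipLower) and adding 0x0100 yields (2*lane, 2*lane + 1).
  const uint16_t entry = table[5 * 8 + 1];                            // 4
  const uint16_t pair =
      static_cast<uint16_t>((entry | (entry << 8)) + 0x0100);         // bytes 4, 5
  printf("0x%04x\n", static_cast<unsigned>(pair));                    // 0x0504
  return 0;
}

Running the generator reproduces the rows of the literal table, which is why the doubling can be baked in and no shift is needed at lookup time.
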
template --HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, -+ const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. -@@ -3742,7 +4146,8 @@ - #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 - - template --HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, -+ const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. -@@ -3761,59 +4166,15 @@ - - // Helper function called by both Compress and CompressStore - avoids a - // redundant BitsFromMask in the latter. -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} -- --#if HWY_CAP_INTEGER64 -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} -- --#endif -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- const Simd df; -- const Simd di; -- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); --} -- --#if HWY_CAP_FLOAT64 -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- const Simd df; -- const Simd di; -- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); -+template -+HWY_API Vec128 Compress(Vec128 v, const uint64_t mask_bits) { -+ const auto idx = -+ detail::IdxFromBits(hwy::SizeTag(), mask_bits); -+ using D = Simd; -+ const RebindToSigned di; -+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); - } - --#endif -- - } // namespace detail - - template -@@ -3831,6 +4192,79 @@ - return PopCount(mask_bits); - } - -+// ------------------------------ StoreInterleaved3 -+ -+// 128 bits -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Full128 /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3q_u8(unaligned, triple); -+} -+ -+// 64 bits -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3_u8(unaligned, triple); -+} -+ -+// <= 32 bits: avoid writing more than N bytes by copying to buffer -+template -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ alignas(16) uint8_t buf[24]; -+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3_u8(buf, triple); -+ CopyBytes(buf, unaligned); -+} -+ -+// ------------------------------ StoreInterleaved4 -+ -+// 128 bits -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Full128 /*tag*/, -+ uint8_t* 
HWY_RESTRICT unaligned) { -+ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4q_u8(unaligned, quad); -+} -+ -+// 64 bits -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4_u8(unaligned, quad); -+} -+ -+// <= 32 bits: avoid writing more than N bytes by copying to buffer -+template -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ alignas(16) uint8_t buf[32]; -+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4_u8(buf, quad); -+ CopyBytes(buf, unaligned); -+} -+ - // ================================================== Operator wrapper - - // These apply to all x86_*-inl.h because there are no restrictions on V. -@@ -3885,7 +4319,8 @@ - return a <= b; - } - --#if !defined(__aarch64__) -+namespace detail { // for code folding -+#if HWY_ARCH_ARM_V7 - #undef vuzp1_s8 - #undef vuzp1_u8 - #undef vuzp1_s16 -@@ -3972,8 +4407,9 @@ - #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 - #undef HWY_NEON_DEF_FUNCTION_UINTS - #undef HWY_NEON_EVAL -+} // namespace detail - - // NOLINTNEXTLINE(google-readability-namespace-comments) - } // namespace HWY_NAMESPACE - } // namespace hwy --HWY_AFTER_NAMESPACE(); -+HWY_AFTER_NAMESPACE(); -\ No newline at end of file
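
[Editor's note] For readers unfamiliar with the interleaved-store intrinsics wrapped by the StoreInterleaved3/StoreInterleaved4 additions near the end of the patch, here is a minimal standalone sketch (ARM-only, not Highway code; the RGB naming is just for illustration). It uses the same vst3q_u8 intrinsic the 128-bit overload calls: three planar 16-byte registers are written as 48 interleaved bytes.

// store_interleaved_sketch.cc - illustrative only; builds only for ARM/NEON.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t r[16], g[16], b[16];
  for (int i = 0; i < 16; ++i) {
    r[i] = static_cast<uint8_t>(i);
    g[i] = static_cast<uint8_t>(100 + i);
    b[i] = static_cast<uint8_t>(200 + i);
  }
  uint8_t out[48];
  // Same construction the patch uses: a uint8x16x3_t holding the three
  // registers, stored with a single interleaving instruction.
  const uint8x16x3_t triple = {vld1q_u8(r), vld1q_u8(g), vld1q_u8(b)};
  vst3q_u8(out, triple);
  // Planar in, interleaved out: the first three bytes are r0, g0, b0.
  printf("%d %d %d\n", int(out[0]), int(out[1]), int(out[2]));  // 0 100 200
  return 0;
}

The <= 32-bit overloads in the patch store into a 24- or 32-byte stack buffer first and then copy only N bytes out, because the interleaving instructions always write a full group and would otherwise overrun the destination.
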