From 9896cc4e67b80e81e8dc496f73ed0ca25953cbb868e3ca53ec3d5ddc92d3c58a Mon Sep 17 00:00:00 2001 From: Callum Farmer Date: Thu, 3 Jun 2021 12:10:39 +0000 Subject: [PATCH] OBS-URL: https://build.opensuse.org/package/show/network:chromium/chromium?expand=0&rev=1566 --- chromium-91-highway-gcc-arm.patch | 2369 ----------------------------- 1 file changed, 2369 deletions(-) delete mode 100644 chromium-91-highway-gcc-arm.patch diff --git a/chromium-91-highway-gcc-arm.patch b/chromium-91-highway-gcc-arm.patch deleted file mode 100644 index 7ca2319..0000000 --- a/chromium-91-highway-gcc-arm.patch +++ /dev/null @@ -1,2369 +0,0 @@ ---- a/third_party/highway/src/hwy/ops/arm_neon-inl.h -+++ b/third_party/highway/src/hwy/ops/arm_neon-inl.h -@@ -26,6 +26,8 @@ - namespace hwy { - namespace HWY_NAMESPACE { - -+namespace detail { // for code folding and Raw128 -+ - // Macros used to define single and double function calls for multiple types - // for full and half vectors. These macros are undefined at the end of the file. - -@@ -133,7 +135,7 @@ - HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) - - // float and double --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ - HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ -@@ -181,7 +183,7 @@ - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) - - // Emulation of some intrinsics on armv7. --#if !defined(__aarch64__) -+#if HWY_ARCH_ARM_V7 - #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] - #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] - #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] -@@ -294,7 +296,7 @@ - using type = float32x4_t; - }; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - template <> - struct Raw128 { - using type = float64x2_t; -@@ -352,7 +354,7 @@ - using type = float32x2_t; - }; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - template <> - struct Raw128 { - using type = float64x1_t; -@@ -437,12 +439,14 @@ - using type = int8x8_t; - }; - -+} // namespace detail -+ - template - using Full128 = Simd; - - template - class Vec128 { -- using Raw = typename Raw128::type; -+ using Raw = typename detail::Raw128::type; - - public: - HWY_INLINE Vec128() {} -@@ -480,7 +484,8 @@ - // FF..FF or 0, also for floating-point - see README. - template - class Mask128 { -- using Raw = typename Raw128::type; -+ // ACLE intrinsics return and expect unsigned type. -+ using Raw = typename detail::Raw128, N>::type; - - public: - HWY_INLINE Mask128() {} -@@ -573,7 +578,7 @@ - Vec128 v) { - return Vec128(vreinterpret_s64_u8(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, - Vec128 v) { - return Vec128(vreinterpret_f64_u8(v.raw)); -@@ -615,7 +620,7 @@ - return Vec128(vreinterpretq_s64_u8(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 BitCastFromByte(Full128 /* tag */, - Vec128 v) { - return Vec128(vreinterpretq_f64_u8(v.raw)); -@@ -664,15 +669,25 @@ - HWY_INLINE Vec128 Undefined(Simd /*d*/) { - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") -- typename Raw128::type a; -+ typename detail::Raw128::type a; - return Vec128(a); - HWY_DIAGNOSTICS(pop) - } - --// ------------------------------ Extract lane -+// Returns a vector with lane i=[0, N) set to "first" + i. 
-+template -+Vec128 Iota(const Simd d, const T2 first) { -+ HWY_ALIGN T lanes[16 / sizeof(T)]; -+ for (size_t i = 0; i < 16 / sizeof(T); ++i) { -+ lanes[i] = static_cast(first + static_cast(i)); -+ } -+ return Load(d, lanes); -+} -+ -+// ------------------------------ GetLane - - HWY_INLINE uint8_t GetLane(const Vec128 v) { -- return vget_lane_u8(vget_low_u8(v.raw), 0); -+ return vgetq_lane_u8(v.raw, 0); - } - template - HWY_INLINE uint8_t GetLane(const Vec128 v) { -@@ -680,7 +695,7 @@ - } - - HWY_INLINE int8_t GetLane(const Vec128 v) { -- return vget_lane_s8(vget_low_s8(v.raw), 0); -+ return vgetq_lane_s8(v.raw, 0); - } - template - HWY_INLINE int8_t GetLane(const Vec128 v) { -@@ -688,7 +703,7 @@ - } - - HWY_INLINE uint16_t GetLane(const Vec128 v) { -- return vget_lane_u16(vget_low_u16(v.raw), 0); -+ return vgetq_lane_u16(v.raw, 0); - } - template - HWY_INLINE uint16_t GetLane(const Vec128 v) { -@@ -696,7 +711,7 @@ - } - - HWY_INLINE int16_t GetLane(const Vec128 v) { -- return vget_lane_s16(vget_low_s16(v.raw), 0); -+ return vgetq_lane_s16(v.raw, 0); - } - template - HWY_INLINE int16_t GetLane(const Vec128 v) { -@@ -704,7 +719,7 @@ - } - - HWY_INLINE uint32_t GetLane(const Vec128 v) { -- return vget_lane_u32(vget_low_u32(v.raw), 0); -+ return vgetq_lane_u32(v.raw, 0); - } - template - HWY_INLINE uint32_t GetLane(const Vec128 v) { -@@ -712,7 +727,7 @@ - } - - HWY_INLINE int32_t GetLane(const Vec128 v) { -- return vget_lane_s32(vget_low_s32(v.raw), 0); -+ return vgetq_lane_s32(v.raw, 0); - } - template - HWY_INLINE int32_t GetLane(const Vec128 v) { -@@ -720,20 +735,20 @@ - } - - HWY_INLINE uint64_t GetLane(const Vec128 v) { -- return vget_lane_u64(vget_low_u64(v.raw), 0); -+ return vgetq_lane_u64(v.raw, 0); - } - HWY_INLINE uint64_t GetLane(const Vec128 v) { - return vget_lane_u64(v.raw, 0); - } - HWY_INLINE int64_t GetLane(const Vec128 v) { -- return vget_lane_s64(vget_low_s64(v.raw), 0); -+ return vgetq_lane_s64(v.raw, 0); - } - HWY_INLINE int64_t GetLane(const Vec128 v) { - return vget_lane_s64(v.raw, 0); - } - - HWY_INLINE float GetLane(const Vec128 v) { -- return vget_lane_f32(vget_low_f32(v.raw), 0); -+ return vgetq_lane_f32(v.raw, 0); - } - HWY_INLINE float GetLane(const Vec128 v) { - return vget_lane_f32(v.raw, 0); -@@ -741,9 +756,9 @@ - HWY_INLINE float GetLane(const Vec128 v) { - return vget_lane_f32(v.raw, 0); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE double GetLane(const Vec128 v) { -- return vget_lane_f64(vget_low_f64(v.raw), 0); -+ return vgetq_lane_f64(v.raw, 0); - } - HWY_INLINE double GetLane(const Vec128 v) { - return vget_lane_f64(v.raw, 0); -@@ -785,8 +800,6 @@ - // ------------------------------ Average - - // Returns (a + b + 1) / 2 -- --// Unsigned - HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) - HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) - -@@ -802,6 +815,7 @@ - HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_s32(v.raw)); - } -+// i64 is implemented after BroadcastSignBit. 
- HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_f32(v.raw)); - } -@@ -823,7 +837,7 @@ - return Vec128(vabs_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 Abs(const Vec128 v) { - return Vec128(vabsq_f64(v.raw)); - } -@@ -839,7 +853,7 @@ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below - - HWY_INLINE Vec128 Neg(const Vec128 v) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return Vec128(vneg_s64(v.raw)); - #else - return Zero(Simd()) - v; -@@ -847,7 +861,7 @@ - } - - HWY_INLINE Vec128 Neg(const Vec128 v) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return Vec128(vnegq_s64(v.raw)); - #else - return Zero(Full128()) - v; -@@ -876,6 +890,16 @@ - - // ------------------------------ Shl - -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); -+} -+template -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); -+} -+ - HWY_INLINE Vec128 operator<<(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); -@@ -905,6 +929,16 @@ - return Vec128(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); - } - -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_s8(v.raw, bits.raw)); -+} -+template -+HWY_INLINE Vec128 operator<<(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_s8(v.raw, bits.raw)); -+} -+ - HWY_INLINE Vec128 operator<<(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_s16(v.raw, bits.raw)); -@@ -936,6 +970,18 @@ - - // ------------------------------ Shr (Neg) - -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ const int8x16_t neg_bits = Neg(BitCast(Full128(), bits)).raw; -+ return Vec128(vshlq_u8(v.raw, neg_bits)); -+} -+template -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ const int8x8_t neg_bits = Neg(BitCast(Simd(), bits)).raw; -+ return Vec128(vshl_u8(v.raw, neg_bits)); -+} -+ - HWY_INLINE Vec128 operator>>(const Vec128 v, - const Vec128 bits) { - const int16x8_t neg_bits = Neg(BitCast(Full128(), bits)).raw; -@@ -971,6 +1017,16 @@ - return Vec128(vshl_u64(v.raw, neg_bits)); - } - -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); -+} -+template -+HWY_INLINE Vec128 operator>>(const Vec128 v, -+ const Vec128 bits) { -+ return Vec128(vshl_s8(v.raw, Neg(bits).raw)); -+} -+ - HWY_INLINE Vec128 operator>>(const Vec128 v, - const Vec128 bits) { - return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); -@@ -1059,7 +1115,7 @@ - HWY_INLINE Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - int32x4_t rhi = vmull_high_s16(a.raw, b.raw); - #else - int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); -@@ -1070,7 +1126,7 @@ - HWY_INLINE Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); - #else - uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); -@@ -1139,24 +1195,37 @@ - return Vec128(vrecpe_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - 
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) - #else --// Emulated with approx reciprocal + Newton-Raphson + mul -+// Not defined on armv7: approximate -+namespace detail { -+ -+HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( -+ const Vec128 recip, const Vec128 divisor) { -+ return Vec128(vrecpsq_f32(recip.raw, divisor.raw)); -+} -+template -+HWY_INLINE Vec128 ReciprocalNewtonRaphsonStep( -+ const Vec128 recip, Vec128 divisor) { -+ return Vec128(vrecps_f32(recip.raw, divisor.raw)); -+} -+ -+} // namespace detail -+ - template - HWY_INLINE Vec128 operator/(const Vec128 a, - const Vec128 b) { - auto x = ApproximateReciprocal(b); -- // Newton-Raphson on 1/x - b -- const auto two = Set(Simd(), 2); -- x = x * (two - b * x); -- x = x * (two - b * x); -- x = x * (two - b * x); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); -+ x *= detail::ReciprocalNewtonRaphsonStep(x, b); - return a * x; - } - #endif - --// Absolute value of difference. -+// ------------------------------ Absolute value of difference. -+ - HWY_INLINE Vec128 AbsDiff(const Vec128 a, const Vec128 b) { - return Vec128(vabdq_f32(a.raw, b.raw)); - } -@@ -1169,7 +1238,7 @@ - // ------------------------------ Floating-point multiply-add variants - - // Returns add + mul * x --#if defined(__aarch64__) -+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 - template - HWY_INLINE Vec128 MulAdd(const Vec128 mul, - const Vec128 x, -@@ -1180,6 +1249,17 @@ - const Vec128 add) { - return Vec128(vfmaq_f32(add.raw, mul.raw, x.raw)); - } -+#else -+// Emulate FMA for floats. -+template -+HWY_INLINE Vec128 MulAdd(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 add) { -+ return mul * x + add; -+} -+#endif -+ -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 MulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { -@@ -1190,18 +1270,10 @@ - const Vec128 add) { - return Vec128(vfmaq_f64(add.raw, mul.raw, x.raw)); - } --#else --// Emulate FMA for floats. --template --HWY_INLINE Vec128 MulAdd(const Vec128 mul, -- const Vec128 x, -- const Vec128 add) { -- return mul * x + add; --} - #endif - - // Returns add - mul * x --#if defined(__aarch64__) -+#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 - template - HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, - const Vec128 x, -@@ -1213,7 +1285,17 @@ - const Vec128 add) { - return Vec128(vfmsq_f32(add.raw, mul.raw, x.raw)); - } -+#else -+// Emulate FMA for floats. -+template -+HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 add) { -+ return add - mul * x; -+} -+#endif - -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, - const Vec128 x, - const Vec128 add) { -@@ -1224,14 +1306,6 @@ - const Vec128 add) { - return Vec128(vfmsq_f64(add.raw, mul.raw, x.raw)); - } --#else --// Emulate FMA for floats. 
--template --HWY_INLINE Vec128 NegMulAdd(const Vec128 mul, -- const Vec128 x, -- const Vec128 add) { -- return add - mul * x; --} - #endif - - // Returns mul * x - sub -@@ -1241,12 +1315,6 @@ - const Vec128 sub) { - return MulAdd(mul, x, Neg(sub)); - } --template --HWY_INLINE Vec128 MulSub(const Vec128 mul, -- const Vec128 x, -- const Vec128 sub) { -- return MulAdd(mul, x, Neg(sub)); --} - - // Returns -mul * x - sub - template -@@ -1255,14 +1323,23 @@ - const Vec128 sub) { - return Neg(MulAdd(mul, x, sub)); - } -+ -+#if HWY_ARCH_ARM_A64 -+template -+HWY_INLINE Vec128 MulSub(const Vec128 mul, -+ const Vec128 x, -+ const Vec128 sub) { -+ return MulAdd(mul, x, Neg(sub)); -+} - template - HWY_INLINE Vec128 NegMulSub(const Vec128 mul, - const Vec128 x, - const Vec128 sub) { - return Neg(MulAdd(mul, x, sub)); - } -+#endif - --// ------------------------------ Floating-point square root -+// ------------------------------ Floating-point square root (IfThenZeroElse) - - // Approximate reciprocal square root - HWY_INLINE Vec128 ApproximateReciprocalSqrt(const Vec128 v) { -@@ -1275,80 +1352,36 @@ - } - - // Full precision square root --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) - #else --// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. --template --HWY_INLINE Vec128 Sqrt(const Vec128 v) { -- auto b = v; -- auto Y = ApproximateReciprocalSqrt(v); -- auto x = v * Y; -- const auto half = Set(Simd(), 0.5); -- const auto oneandhalf = Set(Simd(), 1.5); -- for (size_t i = 0; i < 3; i++) { -- b = b * Y * Y; -- Y = oneandhalf - half * b; -- x = x * Y; -- } -- return IfThenZeroElse(v == Zero(Simd()), x); --} --#endif -- --// ================================================== COMPARE -- --// Comparisons fill a lane with 1-bits if the condition is true, else 0. -+namespace detail { - --template --HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { -- static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); -- return Mask128{m.raw}; -+HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, -+ const Vec128 recip) { -+ return Vec128(vrsqrtsq_f32(root.raw, recip.raw)); -+} -+template -+HWY_INLINE Vec128 ReciprocalSqrtStep(const Vec128 root, -+ Vec128 recip) { -+ return Vec128(vrsqrts_f32(root.raw, recip.raw)); - } - --#define HWY_NEON_BUILD_TPL_HWY_COMPARE --#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 --#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ -- const Vec128 a, const Vec128 b --#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw -- --// ------------------------------ Equality --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) --#else --// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
--HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) --HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) --#endif -+} // namespace detail - --// ------------------------------ Strict inequality -+// Not defined on armv7: approximate -+template -+HWY_INLINE Vec128 Sqrt(const Vec128 v) { -+ auto recip = ApproximateReciprocalSqrt(v); - --// Signed/float < (no unsigned) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) --#else --HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) --#endif --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); -+ recip *= detail::ReciprocalSqrtStep(v * recip, recip); - --// Signed/float > (no unsigned) --#if defined(__aarch64__) --HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) --#else --HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) -+ const auto root = v * recip; -+ return IfThenZeroElse(v == Zero(Simd()), root); -+} - #endif --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) -- --// ------------------------------ Weak inequality -- --// Float <= >= --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) --HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) -- --#undef HWY_NEON_BUILD_TPL_HWY_COMPARE --#undef HWY_NEON_BUILD_RET_HWY_COMPARE --#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE --#undef HWY_NEON_BUILD_ARG_HWY_COMPARE - - // ================================================== LOGICAL - -@@ -1357,13 +1390,16 @@ - // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. - template - HWY_INLINE Vec128 Not(const Vec128 v) { -- const Full128 d8; -- return Vec128(vmvnq_u8(BitCast(d8, v).raw)); -+ const Full128 d; -+ const Repartition d8; -+ return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); - } - template - HWY_INLINE Vec128 Not(const Vec128 v) { -- const Repartition> d8; -- return Vec128(vmvn_u8(BitCast(d8, v).raw)); -+ const Simd d; -+ const Repartition d8; -+ using V8 = decltype(Zero(d8)); -+ return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); - } - - // ------------------------------ And -@@ -1463,33 +1499,38 @@ - return ShiftRight(v); - } - --// ------------------------------ Make mask -+// ================================================== MASK - --template --HWY_INLINE Mask128 TestBit(Vec128 v, Vec128 bit) { -- static_assert(!hwy::IsFloat(), "Only integer vectors supported"); -- return (v & bit) == bit; --} -+// ------------------------------ To/from vector - --// Mask and Vec are the same (true = FF..FF). -+// Mask and Vec have the same representation (true = FF..FF). 
- template - HWY_INLINE Mask128 MaskFromVec(const Vec128 v) { -- return Mask128(v.raw); -+ const Simd, N> du; -+ return Mask128(BitCast(du, v).raw); - } - -+// DEPRECATED - template - HWY_INLINE Vec128 VecFromMask(const Mask128 v) { -- return Vec128(v.raw); -+ return BitCast(Simd(), Vec128, N>(v.raw)); - } - - template --HWY_INLINE Vec128 VecFromMask(Simd /* tag */, -- const Mask128 v) { -- return Vec128(v.raw); -+HWY_INLINE Vec128 VecFromMask(Simd d, const Mask128 v) { -+ return BitCast(d, Vec128, N>(v.raw)); -+} -+ -+// ------------------------------ RebindMask -+ -+template -+HWY_API Mask128 RebindMask(Simd dto, Mask128 m) { -+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); -+ return MaskFromVec(BitCast(dto, VecFromMask(Simd(), m))); - } - --// IfThenElse(mask, yes, no) --// Returns mask ? b : a. -+// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a. -+ - #define HWY_NEON_BUILD_TPL_HWY_IF - #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 - #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ -@@ -1524,7 +1565,6 @@ - return Max(zero, v); - } - -- - // ------------------------------ Mask logical - - template -@@ -1557,11 +1597,199 @@ - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); - } - -+// ================================================== COMPARE -+ -+// Comparisons fill a lane with 1-bits if the condition is true, else 0. -+ -+// ------------------------------ Shuffle2301 (for i64 compares) -+ -+// Swap 32-bit halves in 64-bits -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_u32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_s32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64_f32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_u32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_s32(v.raw)); -+} -+HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -+ return Vec128(vrev64q_f32(v.raw)); -+} -+ -+// Intrinsics return unsigned mask, and our macros do not support casting, -+// so the intrinsics reside in detail and are called from a wrapper. -+namespace detail { -+ -+#define HWY_NEON_BUILD_TPL_HWY_COMPARE -+#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 -+#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ -+ const Vec128 a, const Vec128 b -+#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw -+ -+// ------------------------------ Equality -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Eq, vceq, _, HWY_COMPARE) -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS_UINTS(Eq, vceq, _, HWY_COMPARE) -+#else -+// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
-+HWY_NEON_DEF_FUNCTION_INT_8_16_32(Eq, vceq, _, HWY_COMPARE) -+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Eq, vceq, _, HWY_COMPARE) -+#endif -+ -+// ------------------------------ Strict inequality (signed, float) -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS(Lt, vclt, _, HWY_COMPARE) -+#else -+HWY_NEON_DEF_FUNCTION_INT_8_16_32(Lt, vclt, _, HWY_COMPARE) -+#endif -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Lt, vclt, _, HWY_COMPARE) -+ -+// ------------------------------ Weak inequality (float) -+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Le, vcle, _, HWY_COMPARE) -+ -+#undef HWY_NEON_BUILD_TPL_HWY_COMPARE -+#undef HWY_NEON_BUILD_RET_HWY_COMPARE -+#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE -+#undef HWY_NEON_BUILD_ARG_HWY_COMPARE -+ -+} // namespace detail -+ -+// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq) -+ -+#if HWY_ARCH_ARM_V7 -+namespace detail { -+ -+template -+HWY_INLINE Mask128 Eq(const Vec128 a, -+ const Vec128 b) { -+ const Simd d32; -+ const Simd d64; -+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); -+ const auto cmp64 = cmp32 & Shuffle2301(cmp32); -+ return MaskFromVec(BitCast(d64, cmp64)); -+} -+ -+template -+HWY_INLINE Mask128 Eq(const Vec128 a, -+ const Vec128 b) { -+ const Simd d32; -+ const Simd d64; -+ const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); -+ const auto cmp64 = cmp32 & Shuffle2301(cmp32); -+ return MaskFromVec(BitCast(d64, cmp64)); -+} -+ -+HWY_INLINE Mask128 Lt(const Vec128 a, -+ const Vec128 b) { -+ const int64x2_t sub = vqsubq_s64(a.raw, b.raw); -+ return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+} -+HWY_INLINE Mask128 Lt(const Vec128 a, -+ const Vec128 b) { -+ const int64x1_t sub = vqsub_s64(a.raw, b.raw); -+ return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+} -+ -+} // namespace detail -+#endif -+ -+// ------------------------------ Comparison wrapper -+ -+template -+HWY_API Mask128 operator==(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Eq(a, b)); -+} -+ -+template -+HWY_API Mask128 operator<(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Lt(a, b)); -+} -+ -+template -+HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { -+ return RebindMask(Simd(), detail::Le(a, b)); -+} -+ -+// Swapped operand order -+template -+HWY_API Mask128 operator>(Vec128 a, Vec128 b) { -+ return operator<(b, a); -+} -+template -+HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { -+ return operator<=(b, a); -+} -+ -+// ------------------------------ FirstN (Iota, Lt) -+ -+template -+HWY_API Mask128 FirstN(const Simd d, size_t num) { -+ const RebindToSigned di; // Signed comparisons are cheaper. 
-+ return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); -+} -+ -+// ------------------------------ TestBit (Eq) -+ -+#define HWY_NEON_BUILD_TPL_HWY_TESTBIT -+#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 -+#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ -+ Vec128 v, Vec128 bit -+#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw -+ -+#if HWY_ARCH_ARM_A64 -+HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) -+#else -+// No 64-bit versions on armv7 -+HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) -+HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) -+ -+template -+HWY_INLINE Mask128 TestBit(Vec128 v, -+ Vec128 bit) { -+ return (v & bit) == bit; -+} -+template -+HWY_INLINE Mask128 TestBit(Vec128 v, -+ Vec128 bit) { -+ return (v & bit) == bit; -+} -+ -+#endif -+#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT -+#undef HWY_NEON_BUILD_RET_HWY_TESTBIT -+#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT -+#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT -+ -+// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) -+HWY_INLINE Vec128 Abs(const Vec128 v) { -+#if HWY_ARCH_ARM_A64 -+ return Vec128(vabsq_s64(v.raw)); -+#else -+ const auto zero = Zero(Full128()); -+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -+#endif -+} -+HWY_INLINE Vec128 Abs(const Vec128 v) { -+#if HWY_ARCH_ARM_A64 -+ return Vec128(vabs_s64(v.raw)); -+#else -+ const auto zero = Zero(Simd()); -+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -+#endif -+} -+ - // ------------------------------ Min (IfThenElse, BroadcastSignBit) - - namespace detail { - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 Gt(Vec128 a, Vec128 b) { - return Vec128(vcgtq_u64(a.raw, b.raw)); -@@ -1588,7 +1816,7 @@ - template - HWY_INLINE Vec128 Min(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); - #else - const Simd du; -@@ -1603,7 +1831,7 @@ - template - HWY_INLINE Vec128 Min(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), b, a); - #else - const Vec128 sign = detail::SaturatedSub(a, b); -@@ -1612,7 +1840,7 @@ - } - - // Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN. --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2) - #else - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) -@@ -1626,7 +1854,7 @@ - template - HWY_INLINE Vec128 Max(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); - #else - const Simd du; -@@ -1641,7 +1869,7 @@ - template - HWY_INLINE Vec128 Max(const Vec128 a, - const Vec128 b) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return IfThenElse(MaskFromVec(detail::Gt(a, b)), a, b); - #else - const Vec128 sign = detail::SaturatedSub(a, b); -@@ -1650,7 +1878,7 @@ - } - - // Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN. 
--#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2) - #else - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) -@@ -1696,7 +1924,7 @@ - const float* HWY_RESTRICT aligned) { - return Vec128(vld1q_f32(aligned)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LoadU(Full128 /* tag */, - const double* HWY_RESTRICT aligned) { - return Vec128(vld1q_f64(aligned)); -@@ -1741,7 +1969,7 @@ - const float* HWY_RESTRICT p) { - return Vec128(vld1_f32(p)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LoadU(Simd /* tag */, - const double* HWY_RESTRICT p) { - return Vec128(vld1_f64(p)); -@@ -1755,73 +1983,72 @@ - // we don't actually care what is in it, and we don't want - // to introduce extra overhead by initializing it to something. - --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint8_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u8_u32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint16_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u16_u32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint32_t* HWY_RESTRICT p) { -- uint32x2_t a = Undefined(d).raw; -+ uint32x2_t a = Undefined(Simd()).raw; - uint32x2_t b = vld1_lane_u32(p, a, 0); - return Vec128(b); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int8_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s8_s32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int16_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s16_s32(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int32_t* HWY_RESTRICT p) { -- int32x2_t a = Undefined(d).raw; -+ int32x2_t a = Undefined(Simd()).raw; - int32x2_t b = vld1_lane_s32(p, a, 0); - return Vec128(b); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const float* HWY_RESTRICT p) { -- float32x2_t a = Undefined(d).raw; -+ float32x2_t a = Undefined(Simd()).raw; - float32x2_t b = vld1_lane_f32(p, a, 0); - return Vec128(b); - } - - // ------------------------------ Load 16 - --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint8_t* HWY_RESTRICT p) { -- uint16x4_t a = Undefined(d).raw; -+ uint16x4_t a = Undefined(Simd()).raw; - uint16x4_t b = vld1_lane_u16(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_u8_u16(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const uint16_t* HWY_RESTRICT p) { -- uint16x4_t a = Undefined(d).raw; -+ uint16x4_t a = Undefined(Simd()).raw; - uint16x4_t b = vld1_lane_u16(p, a, 0); - return Vec128(b); - } -- --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int8_t* HWY_RESTRICT p) { -- int16x4_t a = Undefined(d).raw; -+ int16x4_t a = Undefined(Simd()).raw; - int16x4_t b = 
vld1_lane_s16(reinterpret_cast(p), a, 0); - return Vec128(vreinterpret_s8_s16(b)); - } --HWY_INLINE Vec128 LoadU(Simd d, -+HWY_INLINE Vec128 LoadU(Simd /*tag*/, - const int16_t* HWY_RESTRICT p) { -- int16x4_t a = Undefined(d).raw; -+ int16x4_t a = Undefined(Simd()).raw; - int16x4_t b = vld1_lane_s16(p, a, 0); - return Vec128(b); - } -@@ -1902,7 +2129,7 @@ - float* HWY_RESTRICT aligned) { - vst1q_f32(aligned, v.raw); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE void StoreU(const Vec128 v, Full128 /* tag */, - double* HWY_RESTRICT aligned) { - vst1q_f64(aligned, v.raw); -@@ -1947,7 +2174,7 @@ - float* HWY_RESTRICT p) { - vst1_f32(p, v.raw); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE void StoreU(const Vec128 v, Simd /* tag */, - double* HWY_RESTRICT p) { - vst1_f64(p, v.raw); -@@ -1959,12 +2186,12 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint8_t* HWY_RESTRICT p) { - uint32x2_t a = vreinterpret_u32_u8(v.raw); -- vst1_lane_u32(p, a, 0); -+ vst1_lane_u32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint16_t* HWY_RESTRICT p) { - uint32x2_t a = vreinterpret_u32_u16(v.raw); -- vst1_lane_u32(p, a, 0); -+ vst1_lane_u32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint32_t* HWY_RESTRICT p) { -@@ -1973,12 +2200,12 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - int8_t* HWY_RESTRICT p) { - int32x2_t a = vreinterpret_s32_s8(v.raw); -- vst1_lane_s32(p, a, 0); -+ vst1_lane_s32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int16_t* HWY_RESTRICT p) { - int32x2_t a = vreinterpret_s32_s16(v.raw); -- vst1_lane_s32(p, a, 0); -+ vst1_lane_s32(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int32_t* HWY_RESTRICT p) { -@@ -1994,7 +2221,7 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint8_t* HWY_RESTRICT p) { - uint16x4_t a = vreinterpret_u16_u8(v.raw); -- vst1_lane_u16(p, a, 0); -+ vst1_lane_u16(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - uint16_t* HWY_RESTRICT p) { -@@ -2003,7 +2230,7 @@ - HWY_INLINE void StoreU(const Vec128 v, Simd, - int8_t* HWY_RESTRICT p) { - int16x4_t a = vreinterpret_s16_s8(v.raw); -- vst1_lane_s16(p, a, 0); -+ vst1_lane_s16(reinterpret_cast(p), a, 0); - } - HWY_INLINE void StoreU(const Vec128 v, Simd, - int16_t* HWY_RESTRICT p) { -@@ -2068,18 +2295,18 @@ - const Vec128 v) { - return Vec128(vmovl_u32(v.raw)); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { -- return Vec128(vmovl_u8(v.raw)); -+ return BitCast(d, Vec128(vmovl_u8(v.raw))); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { - uint16x8_t a = vmovl_u8(v.raw); -- return Vec128(vreinterpretq_s32_u16(vmovl_u16(vget_low_u16(a)))); -+ return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); - } --HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, -+HWY_INLINE Vec128 PromoteTo(Full128 d, - const Vec128 v) { -- return Vec128(vmovl_u16(v.raw)); -+ return BitCast(d, Vec128(vmovl_u16(v.raw))); - } - - // Unsigned: zero-extend to half vector. 
-@@ -2105,9 +2332,9 @@ - return Vec128(vget_low_u64(vmovl_u32(v.raw))); - } - template --HWY_INLINE Vec128 PromoteTo(Simd /* tag */, -+HWY_INLINE Vec128 PromoteTo(Simd d, - const Vec128 v) { -- return Vec128(vget_low_s16(vmovl_u8(v.raw))); -+ return BitCast(d, Vec128(vget_low_u16(vmovl_u8(v.raw)))); - } - template - HWY_INLINE Vec128 PromoteTo(Simd /* tag */, -@@ -2170,12 +2397,14 @@ - - HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, - const Vec128 v) { -- return Vec128(vcvt_f32_f16(v.raw)); -+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); -+ return Vec128(f32); - } - template - HWY_INLINE Vec128 PromoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128(vget_low_f32(vcvt_f32_f16(v.raw))); -+ const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw)); -+ return Vec128(vget_low_f32(f32)); - } - - #else -@@ -2204,7 +2433,7 @@ - - #endif - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 PromoteTo(Full128 /* tag */, - const Vec128 v) { -@@ -2298,12 +2527,13 @@ - - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128{vcvt_f16_f32(v.raw)}; -+ return Vec128{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))}; - } - template - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -- return Vec128{vcvt_f16_f32(vcombine_f32(v.raw, v.raw))}; -+ const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw)); -+ return Vec128(vreinterpret_u16_f16(f16)); - } - - #else -@@ -2339,7 +2569,7 @@ - } - - #endif --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 DemoteTo(Simd /* tag */, - const Vec128 v) { -@@ -2397,7 +2627,7 @@ - const Vec128 v) { - Vec128 a = DemoteTo(Simd(), v); - Vec128 b; -- uint16x8_t c = vcombine_s16(a.raw, b.raw); -+ int16x8_t c = vcombine_s16(a.raw, b.raw); - return Vec128(vqmovn_s16(c)); - } - -@@ -2426,7 +2656,7 @@ - return Vec128(vcvt_s32_f32(v.raw)); - } - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 ConvertTo(Full128 /* tag */, - const Vec128 v) { -@@ -2451,7 +2681,7 @@ - - // ------------------------------ Round (IfThenElse, mask, logical) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Toward nearest integer - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) - -@@ -2472,18 +2702,26 @@ - // representation, clearing the lowest 23-exp mantissa bits. This requires 9 - // integer operations and 3 constants, which is likely more expensive. - -+namespace detail { -+ -+// The original value is already the desired result if NaN or the magnitude is -+// large (i.e. the value is already an integer). -+template -+HWY_API Mask128 UseInt(const Vec128 v) { -+ return Abs(v) < Set(Simd(), MantissaEnd()); -+} -+ -+} // namespace detail -+ - template - HWY_INLINE Vec128 Trunc(const Vec128 v) { - const Simd df; -- const Simd di; -+ const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - -- // The original value is already the desired result if NaN or the magnitude is -- // large (i.e. the value is already an integer). 
-- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f, v); -+ return IfThenElse(detail::UseInt(v), int_f, v); - } - - template -@@ -2506,7 +2744,7 @@ - template - HWY_INLINE Vec128 Ceil(const Vec128 v) { - const Simd df; -- const Simd di; -+ const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); -@@ -2514,9 +2752,7 @@ - // Truncating a positive non-integer ends up smaller; if so, add 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); - -- // Keep original if NaN or the magnitude is large (already an int). -- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f - neg1, v); -+ return IfThenElse(detail::UseInt(v), int_f - neg1, v); - } - - template -@@ -2530,16 +2766,14 @@ - // Truncating a negative non-integer ends up larger; if so, subtract 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); - -- // Keep original if NaN or the magnitude is large (already an int). -- const auto max = Set(df, MantissaEnd()); -- return IfThenElse(Abs(v) < max, int_f + neg1, v); -+ return IfThenElse(detail::UseInt(v), int_f + neg1, v); - } - - #endif - - // ------------------------------ NearestInt (Round) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - - HWY_INLINE Vec128 NearestInt(const Vec128 v) { - return Vec128(vcvtnq_s32_f32(v.raw)); -@@ -2596,7 +2830,7 @@ - HWY_INLINE Vec128 LowerHalf(const Vec128 v) { - return Vec128(vget_low_f32(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 LowerHalf(const Vec128 v) { - return Vec128(vget_low_f64(v.raw)); - } -@@ -2629,7 +2863,7 @@ - HWY_INLINE Vec128 UpperHalf(const Vec128 v) { - return Vec128(vget_high_f32(v.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 UpperHalf(const Vec128 v) { - return Vec128(vget_high_f64(v.raw)); - } -@@ -2714,7 +2948,7 @@ - - // ------------------------------ Broadcast/splat any lane - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Unsigned - template - HWY_INLINE Vec128 Broadcast(const Vec128 v) { -@@ -2886,7 +3120,7 @@ - const Vec128 from) { - const Full128 d; - const Repartition d8; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, - BitCast(d8, from).raw))); - #else -@@ -2911,33 +3145,58 @@ - BitCast(d8, from).raw))); - } - --// ------------------------------ Hard-coded shuffles -+// ------------------------------ TableLookupLanes - --// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). --// Shuffle0321 rotates one lane to the right (the previous least-significant --// lane is now most-significant). These could also be implemented via --// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. -+// Returned by SetTableIndices for use by TableLookupLanes. 
-+template -+struct Indices128 { -+ typename detail::Raw128::type raw; -+}; - --// Swap 32-bit halves in 64-bits --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_u32(v.raw)); --} --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_s32(v.raw)); --} --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64_f32(v.raw)); -+template -+HWY_INLINE Indices128 SetTableIndices(Simd d, const int32_t* idx) { -+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) -+ for (size_t i = 0; i < N; ++i) { -+ HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); -+ } -+#endif -+ -+ const Repartition d8; -+ alignas(16) uint8_t control[16] = {0}; -+ for (size_t idx_lane = 0; idx_lane < N; ++idx_lane) { -+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { -+ control[idx_lane * sizeof(T) + idx_byte] = -+ static_cast(idx[idx_lane] * sizeof(T) + idx_byte); -+ } -+ } -+ return Indices128{BitCast(d, Load(d8, control)).raw}; - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_u32(v.raw)); -+ -+template -+HWY_INLINE Vec128 TableLookupLanes( -+ const Vec128 v, const Indices128 idx) { -+ return TableLookupBytes(v, Vec128{idx.raw}); - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_s32(v.raw)); -+template -+HWY_INLINE Vec128 TableLookupLanes( -+ const Vec128 v, const Indices128 idx) { -+ return TableLookupBytes(v, Vec128{idx.raw}); - } --HWY_INLINE Vec128 Shuffle2301(const Vec128 v) { -- return Vec128(vrev64q_f32(v.raw)); -+template -+HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -+ const Indices128 idx) { -+ const Simd di; -+ const auto idx_i = BitCast(di, Vec128{idx.raw}); -+ return BitCast(Simd(), TableLookupBytes(BitCast(di, v), idx_i)); - } - -+// ------------------------------ Other shuffles (TableLookupBytes) -+ -+// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -+// Shuffle0321 rotates one lane to the right (the previous least-significant -+// lane is now most-significant). These could also be implemented via -+// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. -+ - // Swap 64-bit halves - template - HWY_INLINE Vec128 Shuffle1032(const Vec128 v) { -@@ -2975,49 +3234,6 @@ - return TableLookupBytes(v, BitCast(d, Load(d8, bytes))); - } - --// ------------------------------ TableLookupLanes -- --// Returned by SetTableIndices for use by TableLookupLanes. 
--template --struct Indices128 { -- uint8x16_t raw; --}; -- --template --HWY_INLINE Indices128 SetTableIndices(const Full128, const int32_t* idx) { --#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) -- const size_t N = 16 / sizeof(T); -- for (size_t i = 0; i < N; ++i) { -- HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast(N)); -- } --#endif -- -- const Full128 d8; -- alignas(16) uint8_t control[16]; -- for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { -- const size_t idx_lane = idx_byte / sizeof(T); -- const size_t mod = idx_byte % sizeof(T); -- control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; -- } -- return Indices128{Load(d8, control).raw}; --} -- --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- return TableLookupBytes(v, Vec128(idx.raw)); --} --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- return TableLookupBytes(v, Vec128(idx.raw)); --} --HWY_INLINE Vec128 TableLookupLanes(const Vec128 v, -- const Indices128 idx) { -- const Full128 di; -- const Full128 df; -- return BitCast(df, -- TableLookupBytes(BitCast(di, v), Vec128(idx.raw))); --} -- - // ------------------------------ Interleave lanes - - // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -@@ -3029,7 +3245,7 @@ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // For 64 bit types, we only have the "q" version of the function defined as - // interleaving 64-wide registers with 64-wide types in them makes no sense. - HWY_INLINE Vec128 InterleaveLower(const Vec128 a, -@@ -3079,7 +3295,7 @@ - const Vec128 b) { - return Vec128(vzip1q_f32(a.raw, b.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 InterleaveLower(const Vec128 a, - const Vec128 b) { - return Vec128(vzip1q_f64(a.raw, b.raw)); -@@ -3090,10 +3306,10 @@ - const Vec128 b) { - return Vec128(vzip2q_f32(a.raw, b.raw)); - } --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 InterleaveUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s64(a.raw, b.raw)); -+ return Vec128(vzip2q_f64(a.raw, b.raw)); - } - #endif - -@@ -3105,119 +3321,125 @@ - // Full vectors - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u16_u8(vzip1q_u8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u32_u16(vzip1q_u16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_u32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u64_u32(vzip1q_u32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s16_s8(vzip1q_s8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s32_s16(vzip1q_s16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1q_s32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s64_s32(vzip1q_s32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u16_u8(vzip2q_u8(a.raw, 
b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u32_u16(vzip2q_u16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_u32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_u64_u32(vzip2q_u32(a.raw, b.raw))); - } - - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s8(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s16_s8(vzip2q_s8(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s16(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s32_s16(vzip2q_s16(a.raw, b.raw))); - } - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2q_s32(a.raw, b.raw)); -+ return Vec128(vreinterpretq_s64_s32(vzip2q_s32(a.raw, b.raw))); - } - - // Half vectors or less - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u8(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u16_u8(vzip1_u8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u16(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u32_u16(vzip1_u16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_u32(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_u64_u32(vzip1_u32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s8(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s16_s8(vzip1_s8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s16(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s32_s16(vzip1_s16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipLower(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip1_s32(a.raw, b.raw)); -+ return Vec128( -+ vreinterpret_s64_s32(vzip1_s32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u8(a.raw, b.raw)); -+ return Vec128(vreinterpret_u16_u8(vzip2_u8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u16(a.raw, b.raw)); -+ return Vec128(vreinterpret_u32_u16(vzip2_u16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_u32(a.raw, b.raw)); -+ return Vec128(vreinterpret_u64_u32(vzip2_u32(a.raw, b.raw))); - } - - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s8(a.raw, b.raw)); -+ return Vec128(vreinterpret_s16_s8(vzip2_s8(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s16(a.raw, b.raw)); -+ return Vec128(vreinterpret_s32_s16(vzip2_s16(a.raw, b.raw))); - } - template - HWY_INLINE Vec128 ZipUpper(const Vec128 a, - const Vec128 b) { -- return Vec128(vzip2_s32(a.raw, b.raw)); -+ return Vec128(vreinterpret_s64_s32(vzip2_s32(a.raw, b.raw))); - } - - // ------------------------------ Blocks -@@ -3274,84 +3496,113 @@ - - // ================================================== MISC - --// Returns a vector with lane i=[0, N) set to "first" + i. 
--template --Vec128 Iota(const Simd d, const T2 first) { -- HWY_ALIGN T lanes[16 / sizeof(T)]; -- for (size_t i = 0; i < 16 / sizeof(T); ++i) { -- lanes[i] = static_cast(first + static_cast(i)); -+// ------------------------------ Scatter (Store) -+ -+template -+HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, -+ const Vec128 offset) { -+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); -+ -+ alignas(16) T lanes[N]; -+ Store(v, d, lanes); -+ -+ alignas(16) Offset offset_lanes[N]; -+ Store(offset, Simd(), offset_lanes); -+ -+ uint8_t* base_bytes = reinterpret_cast(base); -+ for (size_t i = 0; i < N; ++i) { -+ CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); - } -- return Load(d, lanes); - } - --// ------------------------------ Gather (requires GetLane) -+template -+HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, -+ const Vec128 index) { -+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); -+ -+ alignas(16) T lanes[N]; -+ Store(v, d, lanes); -+ -+ alignas(16) Index index_lanes[N]; -+ Store(index, Simd(), index_lanes); -+ -+ for (size_t i = 0; i < N; ++i) { -+ base[index_lanes[i]] = lanes[i]; -+ } -+} -+ -+// ------------------------------ Gather (Load/Store) - - template - HWY_API Vec128 GatherOffset(const Simd d, - const T* HWY_RESTRICT base, - const Vec128 offset) { -- static_assert(N == 1, "NEON does not support full gather"); -- static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); -- const uintptr_t address = reinterpret_cast(base) + GetLane(offset); -- T val; -- CopyBytes(reinterpret_cast(address), &val); -- return Set(d, val); -+ static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); -+ -+ alignas(16) Offset offset_lanes[N]; -+ Store(offset, Simd(), offset_lanes); -+ -+ alignas(16) T lanes[N]; -+ const uint8_t* base_bytes = reinterpret_cast(base); -+ for (size_t i = 0; i < N; ++i) { -+ CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); -+ } -+ return Load(d, lanes); - } - - template - HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, - const Vec128 index) { -- static_assert(N == 1, "NEON does not support full gather"); -- static_assert(sizeof(T) == sizeof(Index), "T must match Index"); -- return Set(d, base[GetLane(index)]); -+ static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); -+ -+ alignas(16) Index index_lanes[N]; -+ Store(index, Simd(), index_lanes); -+ -+ alignas(16) T lanes[N]; -+ for (size_t i = 0; i < N; ++i) { -+ lanes[i] = base[index_lanes[i]]; -+ } -+ return Load(d, lanes); - } - --// ------------------------------ ARMv7 int64 comparisons (requires Shuffle2301) -+// ------------------------------ Reductions - --#if !defined(__aarch64__) -+namespace detail { - --template --HWY_INLINE Mask128 operator==(const Vec128 a, -- const Vec128 b) { -- const Simd d32; -- const Simd d64; -- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); -- const auto cmp64 = cmp32 & Shuffle2301(cmp32); -- return MaskFromVec(BitCast(d64, cmp64)); -+// N=1 for any T: no-op -+template -+HWY_API Vec128 SumOfLanes(const Vec128 v) { -+ return v; - } -- --template --HWY_INLINE Mask128 operator==(const Vec128 a, -- const Vec128 b) { -- const Simd d32; -- const Simd d64; -- const auto cmp32 = VecFromMask(d32, BitCast(d32, a) == BitCast(d32, b)); -- const auto cmp64 = cmp32 & Shuffle2301(cmp32); -- return MaskFromVec(BitCast(d64, cmp64)); -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag /* tag */, -+ const Vec128 v) { -+ 
return v; -+} -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag /* tag */, -+ const Vec128 v) { -+ return v; - } - --HWY_INLINE Mask128 operator<(const Vec128 a, -- const Vec128 b) { -- const int64x2_t sub = vqsubq_s64(a.raw, b.raw); -- return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+// u32/i32/f32: N=2 -+template -+HWY_API Vec128 SumOfLanes(const Vec128 v10) { -+ return v10 + Shuffle2301(v10); - } --HWY_INLINE Mask128 operator<(const Vec128 a, -- const Vec128 b) { -- const int64x1_t sub = vqsub_s64(a.raw, b.raw); -- return MaskFromVec(BroadcastSignBit(Vec128(sub))); -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, -+ const Vec128 v10) { -+ return Min(v10, Shuffle2301(v10)); - } -- --template --HWY_INLINE Mask128 operator>(const Vec128 a, -- const Vec128 b) { -- return b < a; -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, -+ const Vec128 v10) { -+ return Max(v10, Shuffle2301(v10)); - } --#endif - --// ------------------------------ Reductions -- --#if defined(__aarch64__) --// Supported for 32b and 64b vector types. Returns the sum in each lane. -+// full vectors -+#if HWY_ARCH_ARM_A64 - HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { - return Vec128(vdupq_n_u32(vaddvq_u32(v.raw))); - } -@@ -3398,20 +3649,15 @@ - } - #endif - --namespace detail { -- --// For u32/i32/f32. --template --HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, -- const Vec128 v3210) { -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Min(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); - return Min(v20_31_20_31, v31_20_31_20); - } --template --HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, -- const Vec128 v3210) { -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Max(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); -@@ -3419,15 +3665,13 @@ - } - - // For u64/i64[/f64]. --template --HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, -- const Vec128 v10) { -+template -+HWY_API Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Min(v10, v01); - } --template --HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, -- const Vec128 v10) { -+template -+HWY_API Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Max(v10, v01); - } -@@ -3435,6 +3679,10 @@ - } // namespace detail - - template -+HWY_API Vec128 SumOfLanes(const Vec128 v) { -+ return detail::SumOfLanes(v); -+} -+template - HWY_API Vec128 MinOfLanes(const Vec128 v) { - return detail::MinOfLanes(hwy::SizeTag(), v); - } -@@ -3457,18 +3705,18 @@ - const Vec128 values = - BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - // Can't vaddv - we need two separate bytes (16 bits). - const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); - const uint8x8_t x4 = vpadd_u8(x2, x2); - const uint8x8_t x8 = vpadd_u8(x4, x4); -- return vreinterpret_u16_u8(x8)[0]; -+ return vget_lane_u64(vreinterpret_u64_u8(x8), 0); - #else - // Don't have vpaddq, so keep doubling lane size. 
- const uint16x8_t x2 = vpaddlq_u8(values.raw); - const uint32x4_t x4 = vpaddlq_u16(x2); - const uint64x2_t x8 = vpaddlq_u32(x4); -- return (uint64_t(x8[1]) << 8) | x8[0]; -+ return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); - #endif - } - -@@ -3484,7 +3732,7 @@ - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u8(values.raw); - #else - const uint16x4_t x2 = vpaddl_u8(values.raw); -@@ -3503,7 +3751,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u16(values.raw); - #else - const uint32x4_t x2 = vpaddlq_u16(values.raw); -@@ -3522,7 +3770,7 @@ - const Simd du; - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u16(values.raw); - #else - const uint32x2_t x2 = vpaddl_u16(values.raw); -@@ -3539,7 +3787,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u32(values.raw); - #else - const uint64x2_t x2 = vpaddlq_u32(values.raw); -@@ -3557,7 +3805,7 @@ - const Simd du; - const Vec128 slice(Load(Simd(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddv_u32(values.raw); - #else - const uint64x1_t x2 = vpaddl_u32(values.raw); -@@ -3572,7 +3820,7 @@ - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_u64(values.raw); - #else - return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); -@@ -3612,13 +3860,13 @@ - const int8x16_t ones = - vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s8(ones); - #else - const int16x8_t x2 = vpaddlq_s8(ones); - const int32x4_t x4 = vpaddlq_s16(x2); - const int64x2_t x8 = vpaddlq_s32(x4); -- return x8[0] + x8[1]; -+ return vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1); - #endif - } - template -@@ -3627,12 +3875,12 @@ - const int16x8_t ones = - vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s16(ones); - #else - const int32x4_t x2 = vpaddlq_s16(ones); - const int64x2_t x4 = vpaddlq_s32(x2); -- return x4[0] + x4[1]; -+ return vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1); - #endif - } - -@@ -3642,26 +3890,26 @@ - const int32x4_t ones = - vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); - --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - return vaddvq_s32(ones); - #else - const int64x2_t x2 = vpaddlq_s32(ones); -- return x2[0] + x2[1]; -+ return vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1); - #endif - } - - template - HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { --#if defined(__aarch64__) -+#if HWY_ARCH_ARM_A64 - const Full128 di; - const int64x2_t ones = - vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); - return vaddvq_s64(ones); - #else -- const Full128 di; -- const int64x2_t ones = -- vshrq_n_u64(BitCast(di, VecFromMask(Full128(), mask)).raw, 63); -- return ones[0] + ones[1]; -+ const Full128 du; -+ const auto mask_u = 
VecFromMask(du, RebindMask(du, mask)); -+ const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); -+ return vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1); - #endif - } - -@@ -3690,9 +3938,13 @@ - // Full - template - HWY_INLINE bool AllFalse(const Mask128 m) { -+#if HWY_ARCH_ARM_A64 -+ return (vmaxvq_u32(m.raw) == 0); -+#else - const auto v64 = BitCast(Full128(), VecFromMask(Full128(), m)); - uint32x2_t a = vqmovn_u64(v64.raw); -- return vreinterpret_u64_u32(a)[0] == 0; -+ return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0; -+#endif - } - - // Partial -@@ -3711,8 +3963,160 @@ - - namespace detail { - -+// Load 8 bytes, replicate into upper half so ZipLower can use the lower half. -+HWY_INLINE Vec128 Load8Bytes(Full128 /*d*/, -+ const uint8_t* bytes) { -+ return Vec128(vreinterpretq_u8_u64( -+ vld1q_dup_u64(reinterpret_cast(bytes)))); -+} -+ -+// Load 8 bytes and return half-reg with N <= 8 bytes. -+template -+HWY_INLINE Vec128 Load8Bytes(Simd d, -+ const uint8_t* bytes) { -+ return Load(d, bytes); -+} -+ -+template -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, -+ const uint64_t mask_bits) { -+ HWY_DASSERT(mask_bits < 256); -+ const Simd d; -+ const Repartition d8; -+ const Simd du; -+ -+ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte -+ // indices for VTBL (one vector's worth for each of 256 combinations of -+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead -+ // store lane indices and convert to byte indices (2*lane + 0..1), with the -+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane -+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. -+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles -+ // is likely more costly than the higher cache footprint from storing bytes. 
-+ alignas(16) constexpr uint8_t table[256 * 8] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, -+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, -+ 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, -+ 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, -+ 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, -+ 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, -+ 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, -+ 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, -+ 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, -+ 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, -+ 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, -+ 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, -+ 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, -+ 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, -+ 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, -+ 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, -+ 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, -+ 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, -+ 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, -+ 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, -+ 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, -+ 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, -+ 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, -+ 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, -+ 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, -+ 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, -+ 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, -+ 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, -+ 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, -+ 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, -+ 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, -+ 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, -+ 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, -+ 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, -+ 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, -+ 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, -+ 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, -+ 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, -+ 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, -+ 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, -+ 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, -+ 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, -+ 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, -+ 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, -+ 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, -+ 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, -+ 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, -+ 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, -+ 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, -+ 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, -+ 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, -+ 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, -+ 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, -+ 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, -+ 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, -+ 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, -+ 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, -+ 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, -+ 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, -+ 0, 
0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, -+ 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, -+ 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, -+ 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, -+ 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, -+ 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, -+ 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, -+ 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, -+ 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, -+ 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, -+ 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, -+ 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, -+ 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, -+ 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, -+ 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, -+ 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, -+ 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, -+ 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, -+ 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, -+ 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, -+ 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, -+ 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, -+ 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, -+ 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, -+ 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, -+ 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, -+ 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, -+ 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, -+ 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, -+ 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, -+ 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, -+ 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, -+ 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, -+ 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, -+ 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, -+ 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, -+ 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, -+ 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, -+ 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, -+ 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, -+ 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, -+ 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, -+ 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, -+ 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, -+ 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, -+ 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, -+ 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, -+ 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, -+ 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, -+ 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, -+ 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, -+ 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, -+ 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, -+ 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, -+ 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; -+ -+ const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); -+ const Vec128 pairs = ZipLower(byte_idx, byte_idx); -+ return BitCast(d, pairs + Set(du, 0x0100)); -+} -+ - 
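
[Editor's note] The comment in the hunk above explains that, lacking an AVX2-style permutevar, the 16-bit Compress path looks up pre-doubled lane indices (2*lane) in a 256-entry table and widens them to byte pairs. The following standalone sketch (not part of the patch or of Highway; the file and names are purely illustrative) shows one way such a table can be generated and how the byte-pair trick works, assuming the little-endian layout NEON uses here.

// compress_table_sketch.cc - illustrative only, mirrors the logic described
// in the patch comment; not Highway code.
#include <cstdint>
#include <cstdio>

int main() {
  static uint8_t table[256 * 8] = {0};
  // For every possible 8-bit mask, emit the indices of the set lanes in
  // ascending order, already doubled (2 * lane) so that converting to byte
  // indices later only needs to add 0 and 1.
  for (uint32_t mask = 0; mask < 256; ++mask) {
    int pos = 0;
    for (uint32_t lane = 0; lane < 8; ++lane) {
      if (mask & (1u << lane)) {
        table[mask * 8 + pos++] = static_cast<uint8_t>(2 * lane);
      }
    }
  }
  // Spot check against the literal table in the patch: mask 0b101 selects
  // lanes 0 and 2, so its row begins 0, 4.
  printf("%d %d\n", int(table[5 * 8 + 0]), int(table[5 * 8 + 1]));
  // Byte-pair formation as in the hunk: duplicating an entry into both bytes
  // of a 16-bit lane (ZipLower) and adding 0x0100 yields (2*lane, 2*lane + 1).
  const uint16_t entry = table[5 * 8 + 1];                            // 4
  const uint16_t pair =
      static_cast<uint16_t>((entry | (entry << 8)) + 0x0100);         // bytes 4, 5
  printf("0x%04x\n", static_cast<unsigned>(pair));                    // 0x0504
  return 0;
}

Running the generator reproduces the rows of the literal table, which is why the doubling can be baked in and no shift is needed at lookup time.
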
template --HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, -+ const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. -@@ -3742,7 +4146,8 @@ - #if HWY_CAP_INTEGER64 || HWY_CAP_FLOAT64 - - template --HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { -+HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, -+ const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. -@@ -3761,59 +4166,15 @@ - - // Helper function called by both Compress and CompressStore - avoids a - // redundant BitsFromMask in the latter. -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} -- --#if HWY_CAP_INTEGER64 -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- return TableLookupBytes(v, idx); --} -- --#endif -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx32x4FromBits(mask_bits); -- const Simd df; -- const Simd di; -- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); --} -- --#if HWY_CAP_FLOAT64 -- --template --HWY_API Vec128 Compress(Vec128 v, -- const uint64_t mask_bits) { -- const auto idx = detail::Idx64x2FromBits(mask_bits); -- const Simd df; -- const Simd di; -- return BitCast(df, TableLookupBytes(BitCast(di, v), idx)); -+template -+HWY_API Vec128 Compress(Vec128 v, const uint64_t mask_bits) { -+ const auto idx = -+ detail::IdxFromBits(hwy::SizeTag(), mask_bits); -+ using D = Simd; -+ const RebindToSigned di; -+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); - } - --#endif -- - } // namespace detail - - template -@@ -3831,6 +4192,79 @@ - return PopCount(mask_bits); - } - -+// ------------------------------ StoreInterleaved3 -+ -+// 128 bits -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Full128 /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x16x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3q_u8(unaligned, triple); -+} -+ -+// 64 bits -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3_u8(unaligned, triple); -+} -+ -+// <= 32 bits: avoid writing more than N bytes by copying to buffer -+template -+HWY_API void StoreInterleaved3(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ alignas(16) uint8_t buf[24]; -+ const uint8x8x3_t triple = {v0.raw, v1.raw, v2.raw}; -+ vst3_u8(buf, triple); -+ CopyBytes(buf, unaligned); -+} -+ -+// ------------------------------ StoreInterleaved4 -+ -+// 128 bits -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Full128 /*tag*/, -+ uint8_t* 
HWY_RESTRICT unaligned) { -+ const uint8x16x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4q_u8(unaligned, quad); -+} -+ -+// 64 bits -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4_u8(unaligned, quad); -+} -+ -+// <= 32 bits: avoid writing more than N bytes by copying to buffer -+template -+HWY_API void StoreInterleaved4(const Vec128 v0, -+ const Vec128 v1, -+ const Vec128 v2, -+ const Vec128 v3, -+ Simd /*tag*/, -+ uint8_t* HWY_RESTRICT unaligned) { -+ alignas(16) uint8_t buf[32]; -+ const uint8x8x4_t quad = {v0.raw, v1.raw, v2.raw, v3.raw}; -+ vst4_u8(buf, quad); -+ CopyBytes(buf, unaligned); -+} -+ - // ================================================== Operator wrapper - - // These apply to all x86_*-inl.h because there are no restrictions on V. -@@ -3885,7 +4319,8 @@ - return a <= b; - } - --#if !defined(__aarch64__) -+namespace detail { // for code folding -+#if HWY_ARCH_ARM_V7 - #undef vuzp1_s8 - #undef vuzp1_u8 - #undef vuzp1_s16 -@@ -3972,8 +4407,9 @@ - #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 - #undef HWY_NEON_DEF_FUNCTION_UINTS - #undef HWY_NEON_EVAL -+} // namespace detail - - // NOLINTNEXTLINE(google-readability-namespace-comments) - } // namespace HWY_NAMESPACE - } // namespace hwy --HWY_AFTER_NAMESPACE(); -+HWY_AFTER_NAMESPACE(); -\ No newline at end of file
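
[Editor's note] For readers unfamiliar with the interleaved-store intrinsics wrapped by the StoreInterleaved3/StoreInterleaved4 additions near the end of the patch, here is a minimal standalone sketch (ARM-only, not Highway code; the RGB naming is just for illustration). It uses the same vst3q_u8 intrinsic the 128-bit overload calls: three planar 16-byte registers are written as 48 interleaved bytes.

// store_interleaved_sketch.cc - illustrative only; builds only for ARM/NEON.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t r[16], g[16], b[16];
  for (int i = 0; i < 16; ++i) {
    r[i] = static_cast<uint8_t>(i);
    g[i] = static_cast<uint8_t>(100 + i);
    b[i] = static_cast<uint8_t>(200 + i);
  }
  uint8_t out[48];
  // Same construction the patch uses: a uint8x16x3_t holding the three
  // registers, stored with a single interleaving instruction.
  const uint8x16x3_t triple = {vld1q_u8(r), vld1q_u8(g), vld1q_u8(b)};
  vst3q_u8(out, triple);
  // Planar in, interleaved out: the first three bytes are r0, g0, b0.
  printf("%d %d %d\n", int(out[0]), int(out[1]), int(out[2]));  // 0 100 200
  return 0;
}

The <= 32-bit overloads in the patch store into a 24- or 32-byte stack buffer first and then copy only N bytes out, because the interleaving instructions always write a full group and would otherwise overrun the destination.
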