12174 lines
447 KiB
C++
12174 lines
447 KiB
C++
// Copyright 2019 Google LLC
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
|
|
// operations when compiling for those targets.
|
|
// External include guard in highway.h - see comment there.
|
|
|
|
// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
|
|
#include "hwy/base.h"
|
|
|
|
// Avoid uninitialized warnings in GCC's emmintrin.h - see
|
|
// https://github.com/google/highway/issues/710 and pull/902
|
|
HWY_DIAGNOSTICS(push)
|
|
#if HWY_COMPILER_GCC_ACTUAL
|
|
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
|
|
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
|
|
ignored "-Wmaybe-uninitialized")
|
|
#endif
|
|
|
|
#include <emmintrin.h>
|
|
#include <stdio.h>
|
|
#if HWY_TARGET == HWY_SSSE3
|
|
#include <tmmintrin.h> // SSSE3
|
|
#elif HWY_TARGET <= HWY_SSE4
|
|
#include <smmintrin.h> // SSE4
|
|
#ifndef HWY_DISABLE_PCLMUL_AES
|
|
#include <wmmintrin.h> // CLMUL
|
|
#endif
|
|
#endif
|
|
|
|
#include "hwy/ops/shared-inl.h"
|
|
|
|
HWY_BEFORE_NAMESPACE();
|
|
namespace hwy {
|
|
namespace HWY_NAMESPACE {
|
|
namespace detail {
|
|
|
|
// Enable generic functions for whichever of (f16, bf16) are not supported.
|
|
#if !HWY_HAVE_FLOAT16
|
|
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
|
|
#else
|
|
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
|
|
#endif
|
|
|
|
template <typename T>
|
|
struct Raw128 {
|
|
using type = __m128i;
|
|
};
|
|
#if HWY_HAVE_FLOAT16
|
|
template <>
|
|
struct Raw128<float16_t> {
|
|
using type = __m128h;
|
|
};
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <>
|
|
struct Raw128<float> {
|
|
using type = __m128;
|
|
};
|
|
template <>
|
|
struct Raw128<double> {
|
|
using type = __m128d;
|
|
};
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
class Vec128 {
|
|
using Raw = typename detail::Raw128<T>::type;
|
|
|
|
public:
|
|
using PrivateT = T; // only for DFromV
|
|
static constexpr size_t kPrivateN = N; // only for DFromV
|
|
|
|
// Compound assignment. Only usable if there is a corresponding non-member
|
|
// binary operator overload. For example, only f32 and f64 support division.
|
|
HWY_INLINE Vec128& operator*=(const Vec128 other) {
|
|
return *this = (*this * other);
|
|
}
|
|
HWY_INLINE Vec128& operator/=(const Vec128 other) {
|
|
return *this = (*this / other);
|
|
}
|
|
HWY_INLINE Vec128& operator+=(const Vec128 other) {
|
|
return *this = (*this + other);
|
|
}
|
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
return *this = (*this - other);
|
|
}
|
|
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
return *this = (*this % other);
|
|
}
|
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
return *this = (*this & other);
|
|
}
|
|
HWY_INLINE Vec128& operator|=(const Vec128 other) {
|
|
return *this = (*this | other);
|
|
}
|
|
HWY_INLINE Vec128& operator^=(const Vec128 other) {
|
|
return *this = (*this ^ other);
|
|
}
|
|
|
|
Raw raw;
|
|
};
|
|
|
|
template <typename T>
|
|
using Vec64 = Vec128<T, 8 / sizeof(T)>;
|
|
|
|
template <typename T>
|
|
using Vec32 = Vec128<T, 4 / sizeof(T)>;
|
|
|
|
template <typename T>
|
|
using Vec16 = Vec128<T, 2 / sizeof(T)>;
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
namespace detail {
|
|
|
|
// Template arg: sizeof(lane type)
|
|
template <size_t size>
|
|
struct RawMask128 {};
|
|
template <>
|
|
struct RawMask128<1> {
|
|
using type = __mmask16;
|
|
};
|
|
template <>
|
|
struct RawMask128<2> {
|
|
using type = __mmask8;
|
|
};
|
|
template <>
|
|
struct RawMask128<4> {
|
|
using type = __mmask8;
|
|
};
|
|
template <>
|
|
struct RawMask128<8> {
|
|
using type = __mmask8;
|
|
};
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
struct Mask128 {
|
|
using Raw = typename detail::RawMask128<sizeof(T)>::type;
|
|
|
|
static Mask128<T, N> FromBits(uint64_t mask_bits) {
|
|
return Mask128<T, N>{static_cast<Raw>(mask_bits)};
|
|
}
|
|
|
|
Raw raw;
|
|
};
|
|
|
|
#else // AVX2 or below
|
|
|
|
// FF..FF or 0.
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
struct Mask128 {
|
|
typename detail::Raw128<T>::type raw;
|
|
};
|
|
|
|
#endif // AVX2 or below
|
|
|
|
namespace detail {
|
|
|
|
// Returns the lowest N of the _mm_movemask* bits.
|
|
template <typename T, size_t N>
|
|
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
|
|
return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
namespace detail {
|
|
|
|
// Used by Expand() emulation, which is required for both AVX3 and AVX2.
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
|
|
return OnlyActive<T, N>(mask.raw);
|
|
}
|
|
|
|
} // namespace detail
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
template <class V>
|
|
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
|
|
|
template <class V>
|
|
using TFromV = typename V::PrivateT;
|
|
|
|
// ------------------------------ Zero
|
|
|
|
// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
|
|
}
|
|
|
|
// Using the existing Zero function instead of a dedicated function for
|
|
// deduction avoids having to forward-declare Vec256 here.
|
|
template <class D>
|
|
using VFromD = decltype(Zero(D()));
|
|
|
|
// ------------------------------ Tuple (VFromD)
|
|
#include "hwy/ops/tuple-inl.h"
|
|
|
|
// ------------------------------ BitCast
|
|
|
|
namespace detail {
|
|
|
|
HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
|
|
#if HWY_HAVE_FLOAT16
|
|
HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
|
|
#endif // HWY_HAVE_FLOAT16
|
|
HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
|
|
HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
|
|
return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
|
|
}
|
|
|
|
// Cannot rely on function overloading because return types differ.
|
|
template <typename T>
|
|
struct BitCastFromInteger128 {
|
|
HWY_INLINE __m128i operator()(__m128i v) { return v; }
|
|
};
|
|
#if HWY_HAVE_FLOAT16
|
|
template <>
|
|
struct BitCastFromInteger128<float16_t> {
|
|
HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); }
|
|
};
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <>
|
|
struct BitCastFromInteger128<float> {
|
|
HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
|
|
};
|
|
template <>
|
|
struct BitCastFromInteger128<double> {
|
|
HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
|
|
};
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
|
|
Vec128<uint8_t, D().MaxBytes()> v) {
|
|
return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> BitCast(D d,
|
|
Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
|
|
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
|
|
}
|
|
|
|
// ------------------------------ Set
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, float16_t t) {
|
|
return VFromD<D>{_mm_set1_ph(t)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, float t) {
|
|
return VFromD<D>{_mm_set1_ps(t)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> Set(D /* tag */, double t) {
|
|
return VFromD<D>{_mm_set1_pd(t)};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API VFromD<D> Set(D df, TFromD<D> t) {
|
|
const RebindToUnsigned<decltype(df)> du;
|
|
static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
|
|
uint16_t bits;
|
|
CopyBytes<2>(&t, &bits);
|
|
return BitCast(df, Set(du, bits));
|
|
}
|
|
|
|
// ------------------------------ Undefined
|
|
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
|
|
|
|
// Returns a vector with uninitialized elements.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API VFromD<D> Undefined(D /* tag */) {
|
|
// Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
|
|
// generate an XOR instruction.
|
|
return VFromD<D>{_mm_undefined_si128()};
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> Undefined(D /* tag */) {
|
|
return VFromD<D>{_mm_undefined_ph()};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> Undefined(D /* tag */) {
|
|
return VFromD<D>{_mm_undefined_ps()};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> Undefined(D /* tag */) {
|
|
return VFromD<D>{_mm_undefined_pd()};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API VFromD<D> Undefined(D /* tag */) {
|
|
return VFromD<D>{_mm_undefined_si128()};
|
|
}
|
|
|
|
HWY_DIAGNOSTICS(pop)
|
|
|
|
// ------------------------------ GetLane
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API T GetLane(const Vec128<T, N> v) {
|
|
return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
|
|
}
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API T GetLane(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const uint16_t bits =
|
|
static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
|
|
return BitCastScalar<T>(bits);
|
|
}
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API T GetLane(const Vec128<T, N> v) {
|
|
return static_cast<T>(_mm_cvtsi128_si32(v.raw));
|
|
}
|
|
template <size_t N>
|
|
HWY_API float GetLane(const Vec128<float, N> v) {
|
|
return _mm_cvtss_f32(v.raw);
|
|
}
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API T GetLane(const Vec128<T, N> v) {
|
|
#if HWY_ARCH_X86_32
|
|
const DFromV<decltype(v)> d;
|
|
alignas(16) T lanes[2];
|
|
Store(v, d, lanes);
|
|
return lanes[0];
|
|
#else
|
|
return static_cast<T>(_mm_cvtsi128_si64(v.raw));
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API double GetLane(const Vec128<double, N> v) {
|
|
return _mm_cvtsd_f64(v.raw);
|
|
}
|
|
|
|
// ------------------------------ ResizeBitCast
|
|
|
|
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
|
|
HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
|
|
}
|
|
|
|
// ------------------------------ Dup128VecFromValues
|
|
|
|
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
TFromD<D> t11, TFromD<D> t12,
|
|
TFromD<D> t13, TFromD<D> t14,
|
|
TFromD<D> t15) {
|
|
return VFromD<D>{_mm_setr_epi8(
|
|
static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
|
|
static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
|
|
static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
|
|
static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
|
|
static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
|
|
static_cast<char>(t15))};
|
|
}
|
|
|
|
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
TFromD<D> t5, TFromD<D> t6,
|
|
TFromD<D> t7) {
|
|
return VFromD<D>{
|
|
_mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
|
|
static_cast<int16_t>(t2), static_cast<int16_t>(t3),
|
|
static_cast<int16_t>(t4), static_cast<int16_t>(t5),
|
|
static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class D, HWY_IF_BF16_D(D)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
TFromD<D> t5, TFromD<D> t6,
|
|
TFromD<D> t7) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
return BitCast(d,
|
|
Dup128VecFromValues(
|
|
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
TFromD<D> t5, TFromD<D> t6,
|
|
TFromD<D> t7) {
|
|
return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
|
|
}
|
|
#else
|
|
// Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
|
|
template <class D, HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
TFromD<D> t5, TFromD<D> t6,
|
|
TFromD<D> t7) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
return BitCast(d,
|
|
Dup128VecFromValues(
|
|
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3) {
|
|
return VFromD<D>{
|
|
_mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
|
|
static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
|
|
}
|
|
|
|
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
TFromD<D> t2, TFromD<D> t3) {
|
|
return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
|
|
}
|
|
|
|
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
// Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
|
|
// available
|
|
return VFromD<D>{
|
|
_mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
|
|
}
|
|
|
|
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
return VFromD<D>{_mm_setr_pd(t0, t1)};
|
|
}
|
|
|
|
// ================================================== LOGICAL
|
|
|
|
// ------------------------------ And
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d; // for float16_t
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, VFromD<decltype(du)>{
|
|
_mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ AndNot
|
|
|
|
// Returns ~not_mask & mask.
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
|
|
const DFromV<decltype(mask)> d; // for float16_t
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
|
|
BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
|
|
Vec128<float, N> mask) {
|
|
return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask,
|
|
Vec128<double, N> mask) {
|
|
return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
|
|
}
|
|
|
|
// ------------------------------ Or
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d; // for float16_t
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, VFromD<decltype(du)>{
|
|
_mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Xor
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d; // for float16_t
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, VFromD<decltype(du)>{
|
|
_mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Not
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const __m128i vu = BitCast(du, v).raw;
|
|
return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
|
|
#else
|
|
return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Xor3
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const DFromV<decltype(x1)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const __m128i ret = _mm_ternarylogic_epi64(
|
|
BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
|
|
return BitCast(d, VU{ret});
|
|
#else
|
|
return Xor(x1, Xor(x2, x3));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Or3
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const DFromV<decltype(o1)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const __m128i ret = _mm_ternarylogic_epi64(
|
|
BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
|
|
return BitCast(d, VU{ret});
|
|
#else
|
|
return Or(o1, Or(o2, o3));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ OrAnd
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const DFromV<decltype(o)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const __m128i ret = _mm_ternarylogic_epi64(
|
|
BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
|
|
return BitCast(d, VU{ret});
|
|
#else
|
|
return Or(o, And(a1, a2));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ IfVecThenElse
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const DFromV<decltype(no)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
return BitCast(
|
|
d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
|
|
BitCast(du, no).raw, 0xCA)});
|
|
#else
|
|
return IfThenElse(MaskFromVec(mask), yes, no);
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ BitwiseIfThenElse
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
|
|
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
|
|
#else
|
|
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
|
|
#endif
|
|
|
|
template <class V>
|
|
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
|
|
return IfVecThenElse(mask, yes, no);
|
|
}
|
|
|
|
#endif
|
|
|
|
// ------------------------------ Operator overloads (internal-only if float)
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return And(a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Or(a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Xor(a, b);
|
|
}
|
|
|
|
// ------------------------------ PopulationCount
|
|
|
|
// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
#ifdef HWY_NATIVE_POPCNT
|
|
#undef HWY_NATIVE_POPCNT
|
|
#else
|
|
#define HWY_NATIVE_POPCNT
|
|
#endif
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
|
|
Vec128<T, N> v) {
|
|
return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
|
|
Vec128<T, N> v) {
|
|
return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
|
|
Vec128<T, N> v) {
|
|
return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
|
|
Vec128<T, N> v) {
|
|
return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
|
|
return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
// ================================================== SIGN
|
|
|
|
// ------------------------------ Neg
|
|
|
|
// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
|
|
return Xor(v, SignBit(DFromV<decltype(v)>()));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) {
|
|
return Xor(v, SignBit(DFromV<decltype(v)>()));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) {
|
|
return Zero(DFromV<decltype(v)>()) - v;
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
|
|
return detail::Neg(hwy::TypeTag<T>(), v);
|
|
}
|
|
|
|
// ------------------------------ Floating-point Abs
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_FLOAT(TFromV<V>)>
|
|
HWY_API V Abs(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
using TI = TFromD<decltype(di)>;
|
|
return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
|
|
}
|
|
|
|
// ------------------------------ CopySign
|
|
// Generic for all vector lengths.
|
|
template <class V>
|
|
HWY_API V CopySign(const V magn, const V sign) {
|
|
static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
|
|
|
|
const DFromV<decltype(magn)> d;
|
|
const auto msb = SignBit(d);
|
|
|
|
// Truth table for msb, magn, sign | bitwise msb ? sign : mag
|
|
// 0 0 0 | 0
|
|
// 0 0 1 | 0
|
|
// 0 1 0 | 1
|
|
// 0 1 1 | 1
|
|
// 1 0 0 | 0
|
|
// 1 0 1 | 1
|
|
// 1 1 0 | 0
|
|
// 1 1 1 | 1
|
|
return BitwiseIfThenElse(msb, sign, magn);
|
|
}
|
|
|
|
// ------------------------------ CopySignToAbs
|
|
// Generic for all vector lengths.
|
|
template <class V>
|
|
HWY_API V CopySignToAbs(const V abs, const V sign) {
|
|
const DFromV<decltype(abs)> d;
|
|
return OrAnd(abs, SignBit(d), sign);
|
|
}
|
|
|
|
// ================================================== MASK
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
// ------------------------------ MaskFromVec
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
|
|
const Vec128<T, N> v) {
|
|
return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
|
|
const Vec128<T, N> v) {
|
|
return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
|
|
const Vec128<T, N> v) {
|
|
return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
|
|
const Vec128<T, N> v) {
|
|
return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
|
|
return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
|
|
}
|
|
// There do not seem to be native floating-point versions of these instructions.
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
|
|
const RebindToSigned<DFromV<decltype(v)>> di;
|
|
return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
|
|
}
|
|
#endif
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
|
|
const RebindToSigned<DFromV<decltype(v)>> di;
|
|
return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
|
|
const RebindToSigned<DFromV<decltype(v)>> di;
|
|
return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
|
|
}
|
|
|
|
template <class D>
|
|
using MFromD = decltype(MaskFromVec(VFromD<D>()));
|
|
|
|
// ------------------------------ MaskFalse (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_MASK_FALSE
|
|
#undef HWY_NATIVE_MASK_FALSE
|
|
#else
|
|
#define HWY_NATIVE_MASK_FALSE
|
|
#endif
|
|
|
|
// Generic for all vector lengths
|
|
template <class D>
|
|
HWY_API MFromD<D> MaskFalse(D /*d*/) {
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
|
|
}
|
|
|
|
// ------------------------------ PromoteMaskTo (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
|
|
#undef HWY_NATIVE_PROMOTE_MASK_TO
|
|
#else
|
|
#define HWY_NATIVE_PROMOTE_MASK_TO
|
|
#endif
|
|
|
|
// AVX3 PromoteMaskTo is generic for all vector lengths
|
|
template <class DTo, class DFrom,
|
|
HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
|
|
class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
|
|
hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
|
|
HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
|
|
MFromD<DFrom> m) {
|
|
return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
|
|
}
|
|
|
|
// ------------------------------ DemoteMaskTo (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
|
|
#undef HWY_NATIVE_DEMOTE_MASK_TO
|
|
#else
|
|
#define HWY_NATIVE_DEMOTE_MASK_TO
|
|
#endif
|
|
|
|
// AVX3 DemoteMaskTo is generic for all vector lengths
|
|
template <class DTo, class DFrom,
|
|
HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
|
|
class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
|
|
hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
|
|
HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
|
|
MFromD<DFrom> m) {
|
|
return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
|
|
}
|
|
|
|
// ------------------------------ CombineMasks (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_COMBINE_MASKS
|
|
#undef HWY_NATIVE_COMBINE_MASKS
|
|
#else
|
|
#define HWY_NATIVE_COMBINE_MASKS
|
|
#endif
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
|
|
MFromD<Half<D>> lo) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const __mmask8 combined_mask = _kor_mask8(
|
|
_kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
|
|
_kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
|
|
#else
|
|
const auto combined_mask =
|
|
(static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 4)>
|
|
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
|
|
MFromD<Half<D>> lo) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const __mmask8 combined_mask = _kor_mask8(
|
|
_kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
|
|
_kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
|
|
#else
|
|
const auto combined_mask =
|
|
(static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 8)>
|
|
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
|
|
MFromD<Half<D>> lo) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const __mmask8 combined_mask = _kor_mask8(
|
|
_kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
|
|
_kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
|
|
#else
|
|
const auto combined_mask =
|
|
(static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 16)>
|
|
HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
|
|
MFromD<Half<D>> lo) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const __mmask16 combined_mask = _mm512_kunpackb(
|
|
static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
|
|
#else
|
|
const auto combined_mask =
|
|
((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
|
|
}
|
|
|
|
// ------------------------------ LowerHalfOfMask (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
#else
|
|
#define HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
#endif
|
|
|
|
// Generic for all vector lengths
|
|
template <class D>
|
|
HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
|
|
using RawM = decltype(MFromD<D>().raw);
|
|
constexpr size_t kN = MaxLanes(d);
|
|
constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;
|
|
|
|
MFromD<D> result_mask{static_cast<RawM>(m.raw)};
|
|
|
|
if (kN < kNumOfBitsInRawMask) {
|
|
result_mask =
|
|
And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
|
|
}
|
|
|
|
return result_mask;
|
|
}
|
|
|
|
// ------------------------------ UpperHalfOfMask (MFromD)
|
|
|
|
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
#else
|
|
#define HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
#endif
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
|
|
#else
|
|
const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
|
|
#else
|
|
const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 4)>
|
|
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
|
|
#else
|
|
const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 8)>
|
|
HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
|
|
#else
|
|
const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
|
|
#endif
|
|
|
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
}
|
|
|
|
// ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
|
|
|
|
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
#else
|
|
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
#endif
|
|
|
|
// Generic for all vector lengths
|
|
template <class DTo, class DFrom,
|
|
HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
|
|
class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
|
|
hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
|
|
HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
|
|
MFromD<DFrom> a, MFromD<DFrom> b) {
|
|
using MH = MFromD<Half<DTo>>;
|
|
using RawMH = decltype(MH().raw);
|
|
|
|
return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
|
|
MH{static_cast<RawMH>(a.raw)});
|
|
}
|
|
|
|
// ------------------------------ VecFromMask
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
return Vec128<T, N>{_mm_movm_epi8(v.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
return Vec128<T, N>{_mm_movm_epi16(v.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
return Vec128<T, N>{_mm_movm_epi32(v.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
return Vec128<T, N>{_mm_movm_epi64(v.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
|
|
return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
|
|
return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
|
|
return VecFromMask(v);
|
|
}
|
|
|
|
// ------------------------------ RebindMask (MaskFromVec)
|
|
|
|
template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
|
|
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
|
|
static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
|
|
return MFromD<DTo>{m.raw};
|
|
}
|
|
|
|
// ------------------------------ IfThenElse
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
|
|
Vec128<float16_t, N> yes,
|
|
Vec128<float16_t, N> no) {
|
|
return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// Generic for all vector lengths.
|
|
template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
|
|
const RebindToUnsigned<D> du;
|
|
return BitCast(
|
|
D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
|
|
Vec128<float, N> yes, Vec128<float, N> no) {
|
|
return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
|
|
Vec128<double, N> yes,
|
|
Vec128<double, N> no) {
|
|
return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)};
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
|
|
Vec128<float, N> yes) {
|
|
return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
|
|
Vec128<double, N> yes) {
|
|
return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
|
|
const RebindToUnsigned<D> du;
|
|
return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> no) {
|
|
// xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
|
|
return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
|
|
Mask128<T, N> mask, Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
|
|
Vec128<float, N> no) {
|
|
return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
|
|
Vec128<double, N> no) {
|
|
return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
|
|
const RebindToUnsigned<D> du;
|
|
return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
|
|
}
|
|
|
|
// ------------------------------ Mask logical
|
|
|
|
// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
|
|
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
|
|
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
|
|
HWY_COMPILER_CLANG >= 800
|
|
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
|
|
#else
|
|
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
|
|
#endif
|
|
#endif // HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
|
|
const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
|
|
const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
|
|
const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
|
|
#endif
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
|
|
const Mask128<T, N> a,
|
|
const Mask128<T, N> b) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
|
|
#endif
|
|
}
|
|
|
|
// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
|
|
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
|
|
#else
|
|
return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
|
|
// sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
|
|
return UnmaskedNot(m);
|
|
}
|
|
template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
|
|
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
|
|
// sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there
|
|
// are fewer than 16 valid bits in m
|
|
|
|
// Return (~m) & ((1ull << N) - 1)
|
|
return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
|
|
}
|
|
template <typename T>
|
|
HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
|
|
// sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
|
|
return UnmaskedNot(m);
|
|
}
|
|
template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
|
|
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
|
|
// sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there
|
|
// are fewer than 8 valid bits in m
|
|
|
|
// Return (~m) & ((1ull << N) - 1)
|
|
return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
|
|
// sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most
|
|
// 4 valid bits in m
|
|
|
|
// Return (~m) & ((1ull << N) - 1)
|
|
return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
|
|
// sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most
|
|
// 2 valid bits in m
|
|
|
|
// Return (~m) & ((1ull << N) - 1)
|
|
return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
|
|
// Flip only the valid bits
|
|
return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
|
|
}
|
|
|
|
#else // AVX2 or below
|
|
|
|
// ------------------------------ Mask
|
|
|
|
// Mask and Vec are the same (true = FF..FF).
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
|
|
return Mask128<T, N>{v.raw};
|
|
}
|
|
|
|
template <class D>
|
|
using MFromD = decltype(MaskFromVec(VFromD<D>()));
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
|
|
return Vec128<T, N>{v.raw};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
|
|
return VecFromMask(v);
|
|
}
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
|
|
// mask ? yes : no
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
|
|
return Or(And(vmask, yes), AndNot(vmask, no));
|
|
}
|
|
|
|
#else // HWY_TARGET < HWY_SSSE3
|
|
|
|
// mask ? yes : no
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
|
|
Vec128<float, N> yes, Vec128<float, N> no) {
|
|
return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
|
|
Vec128<double, N> yes,
|
|
Vec128<double, N> no) {
|
|
return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
|
|
}
|
|
|
|
#endif // HWY_TARGET >= HWY_SSSE3
|
|
|
|
// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
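
// Usage sketch (added for illustration; `ExampleZeroIfNegative` is not part
// of the Highway API): IfThenZeroElse composes with a comparison to give a
// branch-free "replace negative lanes with zero".
template <class V>
HWY_API V ExampleZeroIfNegative(V v) {
  const DFromV<V> d;
  return IfThenZeroElse(v < Zero(d), v);
}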
|
|
|
|
// ------------------------------ Mask logical
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(Not(VecFromMask(d, m)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
|
const Simd<T, N, 0> d;
|
|
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
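
// Usage sketch (added for illustration; `ExampleRemainderMask` is not part of
// the Highway API): mask Not() complements each lane of the mask, so the
// complement of FirstN (defined later in this header) selects exactly the
// lanes at index >= n, even for partial vectors.
template <class D>
HWY_API MFromD<D> ExampleRemainderMask(D d, size_t n) {
  return Not(FirstN(d, n));
}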
|
|
|
|
// ------------------------------ ShiftLeft
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
|
|
return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
|
|
return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
|
|
return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
|
|
return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
|
|
}
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
|
|
return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
|
|
}
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
|
|
return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
namespace detail {
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> GaloisAffine(
|
|
Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) {
|
|
return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
|
|
}
|
|
} // namespace detail
|
|
|
|
#else // HWY_TARGET > HWY_AVX3_DL
|
|
|
|
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d8;
|
|
// Use raw instead of BitCast to support N=1.
|
|
const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
|
|
return kBits == 1
|
|
? (v + v)
|
|
: (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
|
|
}
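
// Worked example of the masking above (illustration only): for kBits = 3 the
// mask is (0xFF << 3) & 0xFF = 0xF8. The 16-bit shift moves the top bits of
// each low byte into the bottom bits of the byte above it; AND-ing every byte
// with 0xF8 clears exactly those carried-in bits, so each u8 lane ends up as
// (lane << 3) & 0xFF.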
|
|
|
|
#endif // HWY_TARGET > HWY_AVX3_DL
|
|
|
|
// ------------------------------ ShiftRight
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
|
|
return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
|
|
}
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
|
|
return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
|
|
}
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
|
|
return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
|
|
return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
|
|
}
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
|
|
return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3_DL
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
|
|
const DFromV<decltype(v)> d8;
|
|
// Use raw instead of BitCast to support N=1.
|
|
const Vec128<uint8_t, N> shifted{
|
|
ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
|
|
return shifted & Set(d8, 0xFF >> kBits);
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
|
|
const DFromV<decltype(v)> di;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
|
|
return (shifted ^ shifted_sign) - shifted_sign;
|
|
}
|
|
|
|
#endif // HWY_TARGET > HWY_AVX3_DL
|
|
|
|
// i64 is implemented after BroadcastSignBit.
|
|
|
|
// ================================================== MEMORY (1)

// Clang static analysis claims the memory immediately after a partial vector
// store is uninitialized, and also flags the input to partial loads (at least
// for loadl_pd) as "garbage". This is a false alarm because msan does not
// raise errors. We work around this by using CopyBytes instead of intrinsics,
// but only for the analyzer to avoid potentially bad code generation.
// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
#endif
#endif  // HWY_SAFE_PARTIAL_LOAD_STORE
|
|
|
|
// ------------------------------ Load
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
|
|
return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
|
|
return Vec128<float16_t>{_mm_load_ph(aligned)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
// Generic for all vector lengths greater than or equal to 16 bytes.
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
|
|
return Vec128<float>{_mm_load_ps(aligned)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
|
|
return Vec128<double>{_mm_load_pd(aligned)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
|
|
return Vec128<float16_t>{_mm_loadu_ph(p)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
// Generic for all vector lengths greater than or equal to 16 bytes.
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
|
|
return Vec128<float>{_mm_loadu_ps(p)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
|
|
return Vec128<double>{_mm_loadu_pd(p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
__m128i v = _mm_setzero_si128();
|
|
CopyBytes<8>(p, &v); // not same size
|
|
#else
|
|
const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
|
|
#endif
|
|
return BitCast(d, VFromD<decltype(du)>{v});
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
__m128 v = _mm_setzero_ps();
|
|
CopyBytes<8>(p, &v); // not same size
|
|
return Vec64<float>{v};
|
|
#else
|
|
const __m128 hi = _mm_setzero_ps();
|
|
return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
|
|
HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
__m128d v = _mm_setzero_pd();
|
|
CopyBytes<8>(p, &v); // not same size
|
|
return Vec64<double>{v};
|
|
#else
|
|
return Vec64<double>{_mm_load_sd(p)};
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
__m128 v = _mm_setzero_ps();
|
|
CopyBytes<4>(p, &v); // not same size
|
|
return Vec32<float>{v};
|
|
#else
|
|
return Vec32<float>{_mm_load_ss(p)};
|
|
#endif
|
|
}
|
|
|
|
// Any <= 32 bit except <float, 1>
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
// Clang ArgumentPromotionPass seems to break this code. We can unpoison
|
|
// before SetTableIndices -> LoadU -> Load and the memory is poisoned again.
|
|
detail::MaybeUnpoison(p, Lanes(d));
|
|
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
__m128i v = Zero(Full128<TFromD<decltype(du)>>()).raw;
|
|
CopyBytes<d.MaxBytes()>(p, &v); // not same size as VFromD
|
|
#else
|
|
int32_t bits = 0;
|
|
CopyBytes<d.MaxBytes()>(p, &bits); // not same size as VFromD
|
|
const __m128i v = _mm_cvtsi32_si128(bits);
|
|
#endif
|
|
return BitCast(d, VFromD<decltype(du)>{v});
|
|
}
|
|
|
|
// For < 128 bit, LoadU == Load.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
return Load(d, p);
|
|
}
|
|
|
|
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
return LoadU(d, p);
|
|
}
|
|
|
|
// ------------------------------ Store
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
|
|
_mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) {
|
|
_mm_store_ph(aligned, v.raw);
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
// Generic for all vector lengths greater than or equal to 16 bytes.
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
|
|
_mm_store_ps(aligned, v.raw);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API void Store(Vec128<double> v, D /* tag */,
|
|
double* HWY_RESTRICT aligned) {
|
|
_mm_store_pd(aligned, v.raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
|
|
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
|
|
}
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) {
|
|
_mm_storeu_ph(p, v.raw);
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
// Generic for all vector lengths greater than or equal to 16 bytes.
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) {
|
|
_mm_storeu_ps(p, v.raw);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) {
|
|
_mm_storeu_pd(p, v.raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
(void)d;
|
|
CopyBytes<8>(&v, p); // not same size
|
|
#else
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw);
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
CopyBytes<8>(&v, p); // not same size
|
|
#else
|
|
_mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
|
|
HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
CopyBytes<8>(&v, p); // not same size
|
|
#else
|
|
_mm_storel_pd(p, v.raw);
|
|
#endif
|
|
}
|
|
|
|
// Any <= 32 bit except <float, 1>
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
CopyBytes<d.MaxBytes()>(&v, p); // not same size
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) {
|
|
#if HWY_SAFE_PARTIAL_LOAD_STORE
|
|
CopyBytes<4>(&v, p); // not same size
|
|
#else
|
|
_mm_store_ss(p, v.raw);
|
|
#endif
|
|
}
|
|
|
|
// For < 128 bit, StoreU == Store.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
Store(v, d, p);
|
|
}
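
// Usage sketch (added for illustration; `ExampleCopyOneVector` is not part of
// the Highway API): partial vectors share this interface, so the same code
// also handles e.g. an 8-byte tail via a half-width tag such as Full64<float>.
template <class D>
HWY_API void ExampleCopyOneVector(D d, const TFromD<D>* HWY_RESTRICT from,
                                  TFromD<D>* HWY_RESTRICT to) {
  // LoadU/StoreU tolerate unaligned pointers; Load/Store require alignment.
  StoreU(LoadU(d, from), d, to);
}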
|
|
|
|
// ================================================== SWIZZLE (1)
|
|
|
|
// ------------------------------ TableLookupBytes
|
|
template <typename T, size_t N, typename TI, size_t NI>
|
|
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
|
|
const Vec128<TI, NI> from) {
|
|
const DFromV<decltype(from)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
|
|
const DFromV<decltype(bytes)> d_bytes;
|
|
const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
|
|
(void)d;
|
|
(void)du8;
|
|
(void)d_bytes;
|
|
(void)du8_bytes;
|
|
return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
|
|
__builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw),
|
|
reinterpret_cast<GccU8RawVectType>(from.raw)))};
|
|
#else
|
|
const Full128<uint8_t> du8_full;
|
|
|
|
alignas(16) uint8_t result_bytes[16];
|
|
alignas(16) uint8_t u8_bytes[16];
|
|
alignas(16) uint8_t from_bytes[16];
|
|
|
|
Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes);
|
|
Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes);
|
|
|
|
for (int i = 0; i < 16; i++) {
|
|
result_bytes[i] = u8_bytes[from_bytes[i] & 15];
|
|
}
|
|
|
|
return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw});
|
|
#endif
|
|
#else // SSSE3 or newer
|
|
return BitCast(
|
|
d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw,
|
|
BitCast(du8, from).raw)});
|
|
#endif
|
|
}
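
// Usage sketch (added for illustration; `ExampleReverseBytes` is not part of
// the Highway API): TableLookupBytes is a per-128-bit-block byte shuffle, so
// a descending index vector reverses the byte order of a full vector.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ExampleReverseBytes(D d, VFromD<D> v) {
  const Repartition<uint8_t, D> du8;
  // Byte i of the result is byte (15 - i) of v.
  const auto idx = Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                                       5, 4, 3, 2, 1, 0);
  return BitCast(d, TableLookupBytes(BitCast(du8, v), idx));
}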
|
|
|
|
// ------------------------------ TableLookupBytesOr0
|
|
// For all vector widths; x86 anyway zeroes if >= 0x80 on SSSE3/SSE4/AVX2/AVX3
|
|
template <class V, class VI>
|
|
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const DFromV<decltype(from)> d;
|
|
const Repartition<int8_t, decltype(d)> di8;
|
|
|
|
const auto di8_from = BitCast(di8, from);
|
|
return BitCast(d, IfThenZeroElse(di8_from < Zero(di8),
|
|
TableLookupBytes(bytes, di8_from)));
|
|
#else
|
|
return TableLookupBytes(bytes, from);
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
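// Worked example (illustration only): if lane i of a Vec128<int32_t> holds the
// value i, then Shuffle1032 yields {2,3,0,1}, Shuffle0321 yields {1,2,3,0},
// Shuffle2103 yields {3,0,1,2} and Shuffle0123 yields {3,2,1,0}, writing lanes
// from lane 0 upward.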
|
|
|
|
// Swap 32-bit halves in 64-bit halves.
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
|
|
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
|
|
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
|
|
return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
|
|
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
|
|
return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
|
|
}
|
|
|
|
// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
#if HWY_TARGET == HWY_SSE2
|
|
Vec32<uint16_t> ba_shuffled{
|
|
_mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
|
|
return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
|
|
#else
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx =
|
|
BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0));
|
|
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
#if HWY_TARGET == HWY_SSE2
|
|
Vec64<uint32_t> ba_shuffled{
|
|
_mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
|
|
return Vec64<T>{
|
|
_mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
|
|
#else
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx = BitCast(
|
|
d2,
|
|
Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0));
|
|
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
|
|
return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
|
|
BitCast(df, b).raw, m)});
|
|
}
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const auto zero = Zero(d);
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
|
|
_mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
|
|
const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
|
|
_mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
|
|
const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
|
|
return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
|
|
#else
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx =
|
|
BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0));
|
|
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Vec32<T> a_shuffled{
|
|
_mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))};
|
|
const Vec32<T> b_shuffled{
|
|
_mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))};
|
|
return Combine(d, b_shuffled, a_shuffled);
|
|
#else
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx = BitCast(
|
|
d2,
|
|
Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0));
|
|
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
|
|
return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
|
|
BitCast(df, b).raw, m)});
|
|
}
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const auto zero = Zero(d);
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
|
|
_mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
|
|
const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
|
|
_mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
|
|
const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
|
|
return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
|
|
#else
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx =
|
|
BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0));
|
|
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Vec32<T> a_shuffled{
|
|
_mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))};
|
|
const Vec32<T> b_shuffled{
|
|
_mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))};
|
|
return Combine(d, b_shuffled, a_shuffled);
|
|
#else
|
|
const Twice<decltype(d)> d2;
|
|
const auto ba = Combine(d2, b, a);
|
|
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
const auto shuffle_idx = BitCast(
|
|
d2,
|
|
Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0));
|
|
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
#endif
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
|
|
return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
|
|
BitCast(df, b).raw, m)});
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Swap 64-bit halves
|
|
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
|
|
return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
|
|
return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
|
|
return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
|
|
}
|
|
HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
|
|
return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
|
|
}
|
|
HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
|
|
return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
|
|
}
|
|
HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
|
|
return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
|
|
}
|
|
|
|
// Rotate right 32 bits
|
|
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
|
|
return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
|
|
return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
|
|
return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
|
|
}
|
|
// Rotate left 32 bits
|
|
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
|
|
return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
|
|
return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
|
|
return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
|
|
}
|
|
|
|
// Reverse
|
|
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
|
|
return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
|
|
}
|
|
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
|
|
return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
|
|
}
|
|
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
|
|
return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
|
|
}
|
|
|
|
// ================================================== COMPARE
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
// Comparisons set a mask bit to 1 if the condition is true, else 0.
|
|
|
|
// ------------------------------ TestBit
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
|
|
const Vec128<T, N> bit) {
|
|
return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
|
|
const Vec128<T, N> bit) {
|
|
return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
|
|
const Vec128<T, N> bit) {
|
|
return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
|
|
const Vec128<T, N> bit) {
|
|
return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
|
|
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
|
|
return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
|
|
}
|
|
|
|
// ------------------------------ Equality
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
|
|
}
|
|
|
|
// ------------------------------ Inequality
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
|
|
}
|
|
|
|
// ------------------------------ Strict inequality
|
|
|
|
// Signed/float <
|
|
template <size_t N>
|
|
HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
|
|
return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
|
|
Vec128<int16_t, N> b) {
|
|
return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
|
|
Vec128<int64_t, N> b) {
|
|
return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
|
|
Vec128<uint8_t, N> b) {
|
|
return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
|
|
Vec128<uint16_t, N> b) {
|
|
return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
|
|
Vec128<uint32_t, N> b) {
|
|
return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
|
|
Vec128<uint64_t, N> b) {
|
|
return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
|
|
}
|
|
|
|
// ------------------------------ Weak inequality
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a,
|
|
Vec128<int8_t, N> b) {
|
|
return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a,
|
|
Vec128<int16_t, N> b) {
|
|
return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a,
|
|
Vec128<int64_t, N> b) {
|
|
return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a,
|
|
Vec128<uint8_t, N> b) {
|
|
return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a,
|
|
Vec128<uint16_t, N> b) {
|
|
return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a,
|
|
Vec128<uint32_t, N> b) {
|
|
return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a,
|
|
Vec128<uint64_t, N> b) {
|
|
return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)};
|
|
}
|
|
|
|
#else // AVX2 or below
|
|
|
|
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
|
|
|
|
template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
|
|
HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
|
|
static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
|
|
const Simd<TFrom, NFrom, 0> d;
|
|
return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
|
|
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
|
|
return (v & bit) == bit;
|
|
}
|
|
|
|
// ------------------------------ Equality
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a,
|
|
Vec128<uint8_t, N> b) {
|
|
return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a,
|
|
Vec128<uint16_t, N> b) {
|
|
return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a,
|
|
Vec128<uint32_t, N> b) {
|
|
return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
|
|
const Vec128<uint64_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const DFromV<decltype(a)> d64;
|
|
const RepartitionToNarrow<decltype(d64)> d32;
|
|
const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
|
|
const auto cmp64 = cmp32 & Shuffle2301(cmp32);
|
|
return MaskFromVec(BitCast(d64, cmp64));
|
|
#else
|
|
return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a,
|
|
Vec128<int8_t, N> b) {
|
|
return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
|
|
Vec128<int16_t, N> b) {
|
|
return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  // Same as unsigned ==; reuse it to avoid duplicating the SSSE3 workaround.
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
}
|
|
|
|
// Float
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
|
|
template <size_t N>
|
|
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
|
|
Vec128<uint8_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
|
|
Vec128<uint16_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
|
|
Vec128<uint32_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
|
|
Vec128<uint64_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
|
|
Vec128<int8_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
|
|
Vec128<int16_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
|
|
Vec128<int64_t, N> b) {
|
|
return Not(a == b);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Strict inequality
|
|
|
|
namespace detail {
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
|
|
Vec128<int8_t, N> b) {
|
|
return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
|
|
Vec128<int16_t, N> b) {
|
|
return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
                                  const Vec128<int64_t, N> a,
                                  const Vec128<int64_t, N> b) {
#if HWY_TARGET >= HWY_SSSE3
  // See https://stackoverflow.com/questions/65166174/:
  const DFromV<decltype(a)> d;
  const RepartitionToNarrow<decltype(d)> d32;
  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
  // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
  // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
  // Duplicate upper to lower half.
  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
#else
  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
#endif
}
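
// Scalar reference for the SSSE3 path above (added for illustration;
// `ExampleI64GtReference` is not part of the Highway API): the signed upper
// halves decide unless they are equal, in which case the unsigned comparison
// of the lower halves decides.
HWY_INLINE bool ExampleI64GtReference(int64_t a, int64_t b) {
  const int32_t a_hi = static_cast<int32_t>(a >> 32);  // arithmetic shift
  const int32_t b_hi = static_cast<int32_t>(b >> 32);
  const uint32_t a_lo = static_cast<uint32_t>(a);
  const uint32_t b_lo = static_cast<uint32_t>(b);
  return (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
}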
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
|
|
Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> du;
|
|
const RebindToSigned<decltype(du)> di;
|
|
const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
|
|
const auto sa = BitCast(di, Xor(a, msb));
|
|
const auto sb = BitCast(di, Xor(b, msb));
|
|
return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
|
|
Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return detail::Gt(hwy::TypeTag<T>(), a, b);
|
|
}
|
|
|
|
// ------------------------------ Weak inequality
|
|
|
|
namespace detail {
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a,
|
|
Vec128<T, N> b) {
|
|
return Not(Gt(tag, b, a));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a,
|
|
Vec128<T, N> b) {
|
|
return Not(Gt(tag, b, a));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a,
|
|
Vec128<float, N> b) {
|
|
return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return detail::Ge(hwy::TypeTag<T>(), a, b);
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ Reversed comparisons
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return b > a;
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// ------------------------------ Iota (Load)
|
|
|
|
namespace detail {
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_epi8(
|
|
static_cast<char>(15), static_cast<char>(14), static_cast<char>(13),
|
|
static_cast<char>(12), static_cast<char>(11), static_cast<char>(10),
|
|
static_cast<char>(9), static_cast<char>(8), static_cast<char>(7),
|
|
static_cast<char>(6), static_cast<char>(5), static_cast<char>(4),
|
|
static_cast<char>(3), static_cast<char>(2), static_cast<char>(1),
|
|
static_cast<char>(0))};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4},
|
|
int16_t{3}, int16_t{2}, int16_t{1},
|
|
int16_t{0})};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5},
|
|
float16_t{4}, float16_t{3}, float16_t{2},
|
|
float16_t{1}, float16_t{0})};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{
|
|
_mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
|
|
return VFromD<D>{_mm_set_pd(1.0, 0.0)};
|
|
}
|
|
|
|
#if HWY_COMPILER_MSVC
|
|
template <class V, HWY_IF_V_SIZE_V(V, 1)>
|
|
static HWY_INLINE V MaskOutVec128Iota(V v) {
|
|
const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)};
|
|
return v & mask_out_mask;
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 2)>
|
|
static HWY_INLINE V MaskOutVec128Iota(V v) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)};
|
|
#else
|
|
const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)};
|
|
return v & mask_out_mask;
|
|
#endif
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 4)>
|
|
static HWY_INLINE V MaskOutVec128Iota(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<float, decltype(d)> df;
|
|
using VF = VFromD<decltype(df)>;
|
|
return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)});
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 8)>
|
|
static HWY_INLINE V MaskOutVec128Iota(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)});
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_GT_V(V, 8)>
|
|
static HWY_INLINE V MaskOutVec128Iota(V v) {
|
|
return v;
|
|
}
|
|
#endif
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> Iota(D d, const T2 first) {
|
|
const auto result_iota =
|
|
detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
|
|
#if HWY_COMPILER_MSVC
|
|
return detail::MaskOutVec128Iota(result_iota);
|
|
#else
|
|
return result_iota;
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ FirstN (Iota, Lt)

template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API M FirstN(D d, size_t num) {
  constexpr size_t kN = MaxLanes(d);
  // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks
  // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI.
  num = HWY_MIN(num, kN);
#if HWY_TARGET <= HWY_AVX3
#if HWY_ARCH_X86_64
  const uint64_t all = (1ull << kN) - 1;
  return M::FromBits(_bzhi_u64(all, num));
#else
  const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1);
  return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(num)));
#endif  // HWY_ARCH_X86_64
#else  // HWY_TARGET > HWY_AVX3
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
#endif  // HWY_TARGET <= HWY_AVX3
}
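
// Usage sketch (added for illustration; `ExampleStoreTail` is not part of the
// Highway API): FirstN is the usual way to handle loop remainders, e.g.
// together with BlendedStore so that only the first `num` lanes are written.
template <class D>
HWY_API void ExampleStoreTail(D d, VFromD<D> v, size_t num,
                              TFromD<D>* HWY_RESTRICT p) {
  // Writes only the first HWY_MIN(num, Lanes(d)) lanes of v to p.
  BlendedStore(v, FirstN(d, num), d, p);
}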
|
|
|
|
// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>; // for float16_t
|
|
return BitCast(
|
|
d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
}
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
|
|
Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return InterleaveLower(a, b);
|
|
}
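
// Worked example (illustration only): for u32 lanes a = {a0,a1,a2,a3} and
// b = {b0,b1,b2,b3} (lane 0 first), InterleaveLower(a, b) = {a0,b0,a1,b1};
// ZipLower instead returns u64 lanes whose lower halves are a0 and a1.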
|
|
|
|
// ================================================== MEMORY (2)
|
|
|
|
// ------------------------------ MaskedLoad
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_maskz_loadu_epi16(m.raw, p)});
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const float* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const double* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d, VFromD<decltype(du)>{
|
|
_mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
const float* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
const double* HWY_RESTRICT p) {
|
|
return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)};
|
|
}
|
|
|
|
#elif HWY_TARGET == HWY_AVX2
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
auto p_p = reinterpret_cast<const int*>(p); // NOLINT
|
|
return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
|
|
return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)};
|
|
}
|
|
|
|
// There is no maskload_epi8/16, so blend instead.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return IfThenElseZero(m, LoadU(d, p));
|
|
}
|
|
|
|
#else // <= SSE4
|
|
|
|
// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return IfThenElseZero(m, LoadU(d, p));
|
|
}
|
|
|
|
#endif
|
|
|
|
// ------------------------------ MaskedLoadOr
|
|
|
|
#if HWY_TARGET > HWY_AVX3 // else: native
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT p) {
|
|
return IfThenElse(m, LoadU(d, p), v);
|
|
}
|
|
|
|
#endif // HWY_TARGET > HWY_AVX3
|
|
|
|
// ------------------------------ LoadN (InterleaveLower)
|
|
|
|
#if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
|
|
|
|
#ifdef HWY_NATIVE_LOAD_N
|
|
#undef HWY_NATIVE_LOAD_N
|
|
#else
|
|
#define HWY_NATIVE_LOAD_N
|
|
#endif
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(
|
|
D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
|
|
(1 << 4) | (1 << 8))>
|
|
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
|
|
d_full;
|
|
return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p));
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(
|
|
D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
|
|
(1 << 4) | (1 << 8))>
|
|
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
|
|
d_full;
|
|
return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no),
|
|
FirstN(d_full, num_lanes), d_full, p));
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
namespace detail {
|
|
|
|
// 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors,
|
|
// there are none, so return the remainder (v_trailing).
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(
|
|
VFromD<D> /*load_mask*/, D /*d*/, const TFromD<D>* HWY_RESTRICT /*p*/,
|
|
VFromD<D> v_trailing) {
|
|
return v_trailing;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(
|
|
VFromD<D> /*no*/, VFromD<D> /*load_mask*/, D /*d*/,
|
|
const TFromD<D>* HWY_RESTRICT /*p*/, VFromD<D> v_trailing) {
|
|
return v_trailing;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(VFromD<D> load_mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
VFromD<D> v_trailing) {
|
|
using DI32 = Repartition<int32_t, D>;
|
|
const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
|
|
|
|
// ResizeBitCast of load_mask to di32 is okay below if
|
|
// d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past
|
|
// the first (lowest-index) lanes of load_mask.raw will have already been
|
|
// zeroed out by FirstN.
|
|
return ResizeBitCast(
|
|
d, IfNegativeThenElse(
|
|
ResizeBitCast(di32_full, load_mask),
|
|
MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)),
|
|
di32_full, reinterpret_cast<const int32_t*>(p)),
|
|
ResizeBitCast(di32_full, v_trailing)));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(VFromD<D> no,
|
|
VFromD<D> load_mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
VFromD<D> v_trailing) {
|
|
using DI32 = Repartition<int32_t, D>;
|
|
const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
|
|
|
|
// ResizeBitCast of load_mask to di32 is okay below if
|
|
// d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past
|
|
// the first (lowest-index) lanes of load_mask.raw will have already been
|
|
// zeroed out by FirstN.
|
|
return ResizeBitCast(
|
|
d, IfNegativeThenElse(
|
|
ResizeBitCast(di32_full, load_mask),
|
|
MaskedLoadOr(ResizeBitCast(di32_full, no),
|
|
MaskFromVec(ResizeBitCast(di32_full, load_mask)),
|
|
di32_full, reinterpret_cast<const int32_t*>(p)),
|
|
ResizeBitCast(di32_full, v_trailing)));
|
|
}
|
|
|
|
// Single lane: load or default value.
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
HWY_IF_LANES_D(D, 1)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
HWY_IF_LANES_D(D, 1)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
|
|
VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
return (num_lanes > 0) ? LoadU(d, p) : no;
|
|
}
|
|
|
|
// Two lanes: load 1, 2, or default.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
if (num_lanes > 1) {
|
|
return LoadU(d, p);
|
|
} else {
|
|
const FixedTag<TFromD<D>, 1> d1;
|
|
return (num_lanes == 1) ? ResizeBitCast(d, LoadU(d1, p)) : Zero(d);
|
|
}
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
|
|
VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
if (num_lanes > 1) {
|
|
return LoadU(d, p);
|
|
} else {
|
|
if (num_lanes == 0) return no;
|
|
// Load one, upper lane is default.
|
|
const FixedTag<TFromD<D>, 1> d1;
|
|
return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
|
|
}
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
const size_t trailing_n = num_lanes & 3;
|
|
if (trailing_n == 0) return Zero(d);
|
|
|
|
VFromD<D> v_trailing = And(load_mask, Set(d, p[num_lanes - 1]));
|
|
|
|
if ((trailing_n & 2) != 0) {
|
|
const Repartition<int16_t, decltype(d)> di16;
|
|
int16_t i16_bits;
|
|
CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
|
|
v_trailing = BitCast(
|
|
d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
|
|
BitCast(di16, v_trailing)));
|
|
}
|
|
|
|
return v_trailing;
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
|
|
VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
const size_t trailing_n = num_lanes & 3;
|
|
if (trailing_n == 0) return no;
|
|
|
|
VFromD<D> v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
|
|
|
|
if ((trailing_n & 2) != 0) {
|
|
const Repartition<int16_t, decltype(d)> di16;
|
|
int16_t i16_bits;
|
|
CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
|
|
v_trailing = BitCast(
|
|
d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
|
|
BitCast(di16, v_trailing)));
|
|
}
|
|
|
|
return v_trailing;
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
if ((num_lanes & 1) != 0) {
|
|
return And(load_mask, Set(d, p[num_lanes - 1]));
|
|
} else {
|
|
return Zero(d);
|
|
}
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
|
|
VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes) {
|
|
if ((num_lanes & 1) != 0) {
|
|
return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
|
|
} else {
|
|
return no;
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, size_t N) {
|
|
const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
|
|
d_full;
|
|
|
|
const VFromD<D> load_mask =
|
|
ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
|
|
const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
|
|
const VFromD<D> v_trailing =
|
|
detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes);
|
|
|
|
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
|
|
if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
|
|
num_lanes < (4 / sizeof(TFromD<D>))) {
|
|
return v_trailing;
|
|
}
|
|
#endif
|
|
|
|
return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing);
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
size_t N) {
|
|
const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
|
|
d_full;
|
|
|
|
const VFromD<D> load_mask =
|
|
ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
|
|
const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
|
|
const VFromD<D> v_trailing =
|
|
detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes);
|
|
|
|
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
|
|
if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
|
|
num_lanes < (4 / sizeof(TFromD<D>))) {
|
|
return v_trailing;
|
|
}
|
|
#endif
|
|
|
|
return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing);
|
|
}
|
|
|
|
#endif // HWY_TARGET > HWY_AVX3
|
|
#endif // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
|
|
|
|
// ------------------------------ BlendedStore
|
|
|
|
namespace detail {
|
|
|
|
// There is no maskload_epi8/16 with which we could safely implement
|
|
// BlendedStore. Manual blending is also unsafe because loading a full vector
|
|
// that crosses the array end causes asan faults. Resort to scalar code; the
|
|
// caller should instead use memcpy, assuming m is FirstN(d, n).
|
|
template <class D>
|
|
HWY_API void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
|
|
using TI = TFromD<decltype(di)>;
|
|
alignas(16) TI buf[MaxLanes(d)];
|
|
alignas(16) TI mask[MaxLanes(d)];
|
|
Store(BitCast(di, v), di, buf);
|
|
Store(BitCast(di, VecFromMask(d, m)), di, mask);
|
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
if (mask[i]) {
|
|
CopySameSize(buf + i, p + i);
|
|
}
|
|
}
|
|
}
|
|
} // namespace detail
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
_mm_mask_storeu_epi8(p, m.raw, v.raw);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
_mm_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), RebindMask(du, m).raw,
|
|
BitCast(du, v).raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
auto pi = reinterpret_cast<int*>(p); // NOLINT
|
|
_mm_mask_storeu_epi32(pi, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
auto pi = reinterpret_cast<long long*>(p); // NOLINT
|
|
_mm_mask_storeu_epi64(pi, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) {
|
|
_mm_mask_storeu_ps(p, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) {
|
|
_mm_mask_storeu_pd(p, m.raw, v.raw);
|
|
}
|
|
|
|
#elif HWY_TARGET == HWY_AVX2
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
detail::ScalarMaskedStore(v, m, d, p);
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
template <class D, class V, class M, HWY_IF_UI32_D(D)>
|
|
HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
|
|
auto pi = reinterpret_cast<int*>(p); // NOLINT
|
|
_mm_maskstore_epi32(pi, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, class V, class M, HWY_IF_UI64_D(D)>
|
|
HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
|
|
auto pi = reinterpret_cast<long long*>(p); // NOLINT
|
|
_mm_maskstore_epi64(pi, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, class V, class M, HWY_IF_F32_D(D)>
|
|
HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) {
|
|
_mm_maskstore_ps(p, m.raw, v.raw);
|
|
}
|
|
|
|
template <class D, class V, class M, HWY_IF_F64_D(D)>
|
|
HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) {
|
|
_mm_maskstore_pd(p, m.raw, v.raw);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
// For partial vectors, avoid writing other lanes by zeroing their mask.
|
|
if (d.MaxBytes() < 16) {
|
|
const Full128<TFromD<D>> dfull;
|
|
const Mask128<TFromD<D>> mfull{m.raw};
|
|
m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw};
|
|
}
|
|
|
|
// Float/double require, and unsigned ints tolerate, signed int masks.
|
|
detail::NativeBlendedStore<D>(v, RebindMask(di, m), p);
|
|
}
|
|
|
|
#else // <= SSE4
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT p) {
|
|
// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
|
|
detail::ScalarMaskedStore(v, m, d, p);
|
|
}
|
|
|
|
#endif // SSE4
|
|
|
|
// ================================================== ARITHMETIC
|
|
|
|
// ------------------------------ Addition
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
|
|
const Vec128<uint64_t, N> b) {
|
|
return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
|
|
const Vec128<int64_t, N> b) {
|
|
return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
|
|
}
|
|
|
|
// Float
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> operator+(const Vec128<float16_t, N> a,
|
|
const Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
|
|
const Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
|
|
const Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Subtraction
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
|
|
Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
|
|
const Vec128<uint64_t, N> b) {
|
|
return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
|
|
const Vec128<int64_t, N> b) {
|
|
return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
|
|
}
|
|
|
|
// Float
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> operator-(const Vec128<float16_t, N> a,
|
|
const Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
|
|
const Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
|
|
const Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ AddSub
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
template <size_t N, HWY_IF_LANES_GT(N, 1)>
|
|
HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
|
|
}
|
|
HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
|
|
return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_TARGET <= HWY_SSSE3
|
|
|
|
// ------------------------------ SumsOf8
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
|
|
return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Repartition<int64_t, decltype(d)> di64;
|
|
|
|
// Adjust the values of v to be in the 0..255 range by adding 128 to each lane
|
|
// of v (which is the same as an bitwise XOR of each i8 lane by 128) and then
|
|
// bitcasting the Xor result to an u8 vector.
|
|
const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
|
|
|
|
// Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj)
|
|
// operation to account for the adjustment made above.
|
|
return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024});
|
|
}
|
|
|
|
#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
|
|
#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
|
|
#else
|
|
#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
|
|
const DFromV<V> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RepartitionToWideX3<decltype(d)> di64;
|
|
|
|
// Adjust the values of a and b to be in the 0..255 range by adding 128 to
|
|
// each lane of a and b (which is the same as an bitwise XOR of each i8 lane
|
|
// by 128) and then bitcasting the results of the Xor operations to u8
|
|
// vectors.
|
|
const auto i8_msb = SignBit(d);
|
|
const auto a_adj = BitCast(du, Xor(a, i8_msb));
|
|
const auto b_adj = BitCast(du, Xor(b, i8_msb));
|
|
|
|
// The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an
|
|
// i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true
|
|
return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj));
|
|
}
|
|
|
|
// ------------------------------ SumsOf4
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
namespace detail {
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4(
|
|
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/,
|
|
Vec128<uint8_t, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
|
|
// _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
|
|
// zeroed out and the sums of the 4 consecutive lanes are already in the
|
|
// even uint16_t lanes of the _mm_maskz_dbsad_epu8 result.
|
|
return Vec128<uint32_t, (N + 3) / 4>{
|
|
_mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)};
|
|
}
|
|
|
|
// detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h
|
|
|
|
} // namespace detail
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ SumsOfAdjQuadAbsDiff
|
|
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
|
|
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
|
|
#else
|
|
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
|
|
#endif
|
|
|
|
template <int kAOffset, int kBOffset, size_t N>
|
|
HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff(
|
|
Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
|
|
static_assert(0 <= kAOffset && kAOffset <= 1,
|
|
"kAOffset must be between 0 and 1");
|
|
static_assert(0 <= kBOffset && kBOffset <= 3,
|
|
"kBOffset must be between 0 and 3");
|
|
return Vec128<uint16_t, (N + 1) / 2>{
|
|
_mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
|
|
// Adjust the values of a and b to be in the 0..255 range by adding 128 to
|
|
// each lane of a and b (which is the same as an bitwise XOR of each i8 lane
|
|
// by 128) and then bitcasting the results of the Xor operations to u8
|
|
// vectors.
|
|
const auto i8_msb = SignBit(d);
|
|
const auto a_adj = BitCast(du, Xor(a, i8_msb));
|
|
const auto b_adj = BitCast(du, Xor(b, i8_msb));
|
|
|
|
// The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can
|
|
// simply be bitcasted to an i16 vector as
|
|
// |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
|
|
return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
|
|
}
|
|
#endif
|
|
|
|
// ------------------------------ SumsOfShuffledQuadAbsDiff
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
|
|
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
|
|
#else
|
|
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
|
|
#endif
|
|
|
|
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N>
|
|
HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff(
|
|
Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
|
|
static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
|
|
static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
|
|
static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
|
|
static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
|
|
return Vec128<uint16_t, (N + 1) / 2>{
|
|
_mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V,
|
|
HWY_IF_I8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a,
|
|
V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
|
|
// Adjust the values of a and b to be in the 0..255 range by adding 128 to
|
|
// each lane of a and b (which is the same as an bitwise XOR of each i8 lane
|
|
// by 128) and then bitcasting the results of the Xor operations to u8
|
|
// vectors.
|
|
const auto i8_msb = SignBit(d);
|
|
const auto a_adj = BitCast(du, Xor(a, i8_msb));
|
|
const auto b_adj = BitCast(du, Xor(b, i8_msb));
|
|
|
|
// The result of
|
|
// SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can
|
|
// simply be bitcasted to an i16 vector as
|
|
// |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
|
|
return BitCast(
|
|
dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
|
|
}
|
|
#endif
|
|
|
|
// ------------------------------ SaturatedAdd
|
|
|
|
// Returns a + b clamped to the destination range.
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
#else
|
|
#define HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
#endif
|
|
|
|
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
|
|
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
|
|
#else
|
|
#define HWY_NATIVE_I64_SATURATED_ADDSUB
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const auto sum = a + b;
|
|
const auto overflow_mask = MaskFromVec(
|
|
Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)});
|
|
const auto i32_max = Set(d, LimitsMax<int32_t>());
|
|
const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
|
|
i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
|
|
return IfThenElse(overflow_mask, overflow_result, sum);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a,
|
|
Vec128<int64_t, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const auto sum = a + b;
|
|
const auto overflow_mask = MaskFromVec(
|
|
Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)});
|
|
const auto i64_max = Set(d, LimitsMax<int64_t>());
|
|
const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
|
|
i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
|
|
return IfThenElse(overflow_mask, overflow_result, sum);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ SaturatedSub
|
|
|
|
// Returns a - b clamped to the destination range.
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
|
|
const Vec128<int8_t, N> b) {
|
|
return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a,
|
|
Vec128<int32_t, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const auto diff = a - b;
|
|
const auto overflow_mask = MaskFromVec(
|
|
Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
|
|
const auto i32_max = Set(d, LimitsMax<int32_t>());
|
|
const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
|
|
i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
|
|
return IfThenElse(overflow_mask, overflow_result, diff);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a,
|
|
Vec128<int64_t, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const auto diff = a - b;
|
|
const auto overflow_mask = MaskFromVec(
|
|
Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
|
|
const auto i64_max = Set(d, LimitsMax<int64_t>());
|
|
const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
|
|
i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
|
|
return IfThenElse(overflow_mask, overflow_result, diff);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ AverageRound
|
|
|
|
// Returns (a + b + 1) / 2
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
|
|
const Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Integer multiplication
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
// Returns the upper 16 bits of a * b in each lane.
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
|
|
const Vec128<uint16_t, N> b) {
|
|
return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
// Multiplies even lanes (0, 2 ..) and places the double-wide result into
|
|
// even and the upper half into its odd neighbor lane.
|
|
template <class V, HWY_IF_U8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
const auto lo8_mask = Set(dw, uint16_t{0x00FF});
|
|
return And(ResizeBitCast(dw, a), lo8_mask) *
|
|
And(ResizeBitCast(dw, b), lo8_mask);
|
|
}
|
|
|
|
template <class V, HWY_IF_I8_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) *
|
|
ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b)));
|
|
}
|
|
|
|
template <class V, HWY_IF_UI16_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
const RepartitionToNarrow<decltype(dw)> dw_as_d16;
|
|
|
|
const auto lo = ResizeBitCast(dw, a * b);
|
|
const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b)));
|
|
return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// p[i] = (((a[i] >> 31) * (a[i] >> 31)) << 64) +
|
|
// (((a[i] >> 31) * b[i]) << 32) +
|
|
// (((b[i] >> 31) * a[i]) << 32) +
|
|
// ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF}))
|
|
|
|
// ((a[i] >> 31) * (a[i] >> 31)) << 64 does not need to be computed as the
|
|
// lower 64 bits of ((a[i] >> 31) * (a[i] >> 31)) << 64 is zero.
|
|
|
|
// (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) ==
|
|
// -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32)
|
|
|
|
// ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be
|
|
// computed using MulEven(BitCast(du, a), BitCast(du, b))
|
|
|
|
const auto neg_p_hi = ShiftLeft<32>(
|
|
ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a)));
|
|
const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b)));
|
|
return p_lo - neg_p_hi;
|
|
#else
|
|
return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <class V, HWY_IF_T_SIZE_V(V, 1)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
return ShiftRight<8>(ResizeBitCast(dw, a)) *
|
|
ShiftRight<8>(ResizeBitCast(dw, b));
|
|
}
|
|
|
|
template <class V, HWY_IF_UI16_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
const RebindToUnsigned<decltype(dw)> dw_u;
|
|
const RepartitionToNarrow<decltype(dw)> dw_as_d16;
|
|
|
|
const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b)));
|
|
const auto hi = ResizeBitCast(dw, MulHigh(a, b));
|
|
return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
|
|
}
|
|
|
|
template <class V, HWY_IF_UI32_D(DFromV<V>)>
|
|
HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
|
|
return MulEven(DupOdd(a), DupOdd(b));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
|
|
const Vec128<uint32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
// Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
|
|
// 64-bit right shift would also work but also needs port 5, so no benefit.
|
|
// Notation: x=don't care, z=0.
|
|
const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
|
|
const auto mullo_x2x0 = MulEven(a, b);
|
|
const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
|
|
const auto mullo_x3x1 =
|
|
MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
|
|
// We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
|
|
// the latter requires one more instruction or a constant.
|
|
const __m128i mul_20 =
|
|
_mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
|
|
const __m128i mul_31 =
|
|
_mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
|
|
return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
|
|
#else
|
|
return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
|
|
const Vec128<int32_t, N> b) {
|
|
// Same as unsigned; avoid duplicating the SSSE3 code.
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, BitCast(du, a) * BitCast(du, b));
|
|
}
|
|
|
|
// ------------------------------ RotateRight (ShiftRight, Or)
|
|
|
|
template <int kBits, typename T, size_t N,
|
|
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
if (kBits == 0) return v;
|
|
// AVX3 does not support 8/16-bit.
|
|
return Or(ShiftRight<kBits>(v),
|
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
|
|
static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
|
|
#else
|
|
if (kBits == 0) return v;
|
|
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
|
|
#endif
|
|
}
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
|
|
static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
|
|
#else
|
|
if (kBits == 0) return v;
|
|
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
return VecFromMask(v < Zero(d));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
|
|
return ShiftRight<15>(v);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
|
|
return ShiftRight<31>(v);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
(void)d;
|
|
return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
|
|
#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
|
|
return VecFromMask(v < Zero(d));
|
|
#else
|
|
// Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
|
|
// avoids generating a zero.
|
|
const RepartitionToNarrow<decltype(d)> d32;
|
|
const auto sign = ShiftRight<31>(BitCast(d32, v));
|
|
return Vec128<int64_t, N>{
|
|
_mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Integer Abs
|
|
|
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
|
|
#if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const auto zero = Zero(du);
|
|
const auto v_as_u8 = BitCast(du, v);
|
|
return BitCast(d, Min(v_as_u8, zero - v_as_u8));
|
|
#else
|
|
return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const auto zero = Zero(DFromV<decltype(v)>());
|
|
return Max(v, zero - v);
|
|
#else
|
|
return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
|
|
#else
|
|
const auto zero = Zero(DFromV<decltype(v)>());
|
|
return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
|
|
return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
|
|
}
|
|
#else
|
|
// I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class V, HWY_IF_I64(TFromV<V>)>
|
|
HWY_API V Abs(V v) {
|
|
const auto zero = Zero(DFromV<decltype(v)>());
|
|
return IfNegativeThenElse(v, zero - v, v);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HWY_NATIVE_SATURATED_ABS
|
|
#undef HWY_NATIVE_SATURATED_ABS
|
|
#else
|
|
#define HWY_NATIVE_SATURATED_ABS
|
|
#endif
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I8(TFromV<V>)>
|
|
HWY_API V SaturatedAbs(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I16(TFromV<V>)>
|
|
HWY_API V SaturatedAbs(V v) {
|
|
return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I32(TFromV<V>)>
|
|
HWY_API V SaturatedAbs(V v) {
|
|
const auto abs_v = Abs(v);
|
|
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(d, Min(BitCast(du, abs_v),
|
|
Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
|
|
#else
|
|
return Add(abs_v, BroadcastSignBit(abs_v));
|
|
#endif
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I64(TFromV<V>)>
|
|
HWY_API V SaturatedAbs(V v) {
|
|
const auto abs_v = Abs(v);
|
|
return Add(abs_v, BroadcastSignBit(abs_v));
|
|
}
|
|
|
|
// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL
|
|
// srli_epi64: the count should be unsigned int. Note that this is not the same
|
|
// as the Shift3264Count in x86_512-inl.h (GCC also requires int).
|
|
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
|
|
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400)
|
|
using Shift64Count = int;
|
|
#else
|
|
// Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this.
|
|
using Shift64Count = unsigned int;
|
|
#endif
|
|
|
|
template <int kBits, size_t N>
|
|
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<int64_t, N>{
|
|
_mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
|
|
#else
|
|
const DFromV<decltype(v)> di;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
|
|
const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
|
|
return right | sign;
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ ZeroIfNegative (BroadcastSignBit)
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only works for float");
|
|
const DFromV<decltype(v)> d;
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
|
|
#else
|
|
const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
|
|
#endif
|
|
return IfThenElse(mask, Zero(d), v);
|
|
}
|
|
|
|
// ------------------------------ IfNegativeThenElse
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
|
|
const Vec128<int8_t, N> yes,
|
|
const Vec128<int8_t, N> no) {
|
|
// int8: IfThenElse only looks at the MSB on SSE4 or newer
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const auto mask = MaskFromVec(v);
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
|
|
#endif
|
|
|
|
return IfThenElse(mask, yes, no);
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
static_assert(IsSigned<T>(), "Only works for signed/float");
|
|
|
|
// 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's
|
|
// MSB.
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const auto mask = MaskFromVec(v);
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
|
|
#endif
|
|
|
|
return IfThenElse(mask, yes, no);
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
|
|
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
Vec128<T, N> no) {
|
|
static_assert(IsSigned<T>(), "Only works for signed/float");
|
|
const DFromV<decltype(v)> d;
|
|
|
|
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
|
|
// 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB
|
|
// on SSE4 or later.
|
|
const RebindToFloat<decltype(d)> df;
|
|
const auto mask = MaskFromVec(BitCast(df, v));
|
|
return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no)));
|
|
#else // SSE2, SSSE3, or AVX3
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
// No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only
|
|
// looks at the MSB on AVX3
|
|
(void)d;
|
|
const auto mask = MaskFromVec(v);
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
|
|
#endif
|
|
|
|
return IfThenElse(mask, yes, no);
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ IfNegativeThenNegOrUndefIfZero
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
|
|
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
|
|
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
|
|
#else
|
|
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask,
|
|
Vec128<int8_t, N> v) {
|
|
return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
|
|
Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
|
|
return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
|
|
Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
|
|
return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_I64_D(DFromV<V>)>
|
|
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
// MaskedSubOr is more efficient than IfNegativeThenElse on AVX3
|
|
const DFromV<decltype(v)> d;
|
|
return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
|
|
#else
|
|
// IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2
|
|
return IfNegativeThenElse(mask, Neg(v), v);
|
|
#endif
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_SSSE3
|
|
|
|
// ------------------------------ ShiftLeftSame
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
|
|
const DFromV<decltype(v)> d8;
|
|
// Use raw instead of BitCast to support N=1.
|
|
const Vec128<T, N> shifted{
|
|
ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
|
|
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
|
|
}
|
|
|
|
// ------------------------------ ShiftRightSame (BroadcastSignBit)
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
|
|
const int bits) {
|
|
const DFromV<decltype(v)> d8;
|
|
// Use raw instead of BitCast to support N=1.
|
|
const Vec128<uint8_t, N> shifted{
|
|
ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
|
|
return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
|
|
const int bits) {
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)};
|
|
}
|
|
#endif
|
|
return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
|
|
const int bits) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
#if HWY_COMPILER_GCC
|
|
if (__builtin_constant_p(bits)) {
|
|
return Vec128<int64_t, N>{
|
|
_mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))};
|
|
}
|
|
#endif
|
|
return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
|
|
#else
|
|
const DFromV<decltype(v)> di;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
|
|
return right | sign;
|
|
#endif
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
|
|
const DFromV<decltype(v)> di;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
|
|
const auto shifted_sign =
|
|
BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
|
|
return (shifted ^ shifted_sign) - shifted_sign;
|
|
}
|
|
|
|
// ------------------------------ Floating-point mul / div
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> operator*(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
|
|
}
|
|
HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
|
|
const Vec128<float, 1> b) {
|
|
return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
|
|
const Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
|
|
}
|
|
HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
|
|
return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a,
|
|
const Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
|
|
const Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
|
|
}
|
|
HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
|
|
const Vec128<float, 1> b) {
|
|
return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
|
|
const Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
|
|
}
|
|
HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
|
|
return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
|
|
}
|
|
|
|
// Approximate reciprocal
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> ApproximateReciprocal(
|
|
const Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
|
|
return Vec128<float, N>{_mm_rcp_ps(v.raw)};
|
|
}
|
|
HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
|
|
return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
#ifdef HWY_NATIVE_F64_APPROX_RECIP
|
|
#undef HWY_NATIVE_F64_APPROX_RECIP
|
|
#else
|
|
#define HWY_NATIVE_F64_APPROX_RECIP
|
|
#endif
|
|
|
|
HWY_API Vec128<double> ApproximateReciprocal(Vec128<double> v) {
|
|
return Vec128<double>{_mm_rcp14_pd(v.raw)};
|
|
}
|
|
HWY_API Vec64<double> ApproximateReciprocal(Vec64<double> v) {
|
|
return Vec64<double>{_mm_rcp14_sd(v.raw, v.raw)};
|
|
}
|
|
#endif
|
|
|
|
// Generic for all vector lengths.
|
|
template <class V, HWY_IF_FLOAT_V(V)>
|
|
HWY_API V AbsDiff(V a, V b) {
|
|
return Abs(a - b);
|
|
}
|
|
|
|
// ------------------------------ MaskedMinOr
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
#ifdef HWY_NATIVE_MASKED_ARITH
|
|
#undef HWY_NATIVE_MASKED_ARITH
|
|
#else
|
|
#define HWY_NATIVE_MASKED_ARITH
|
|
#endif
|
|
|
|
template <typename T, size_t N, HWY_IF_U8(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I8(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U16(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I16(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U32(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I32(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U64(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I64(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F32(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F64(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <typename T, size_t N, HWY_IF_F16(T)>
|
|
HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// ------------------------------ MaskedMaxOr
|
|
|
|
template <typename T, size_t N, HWY_IF_U8(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I8(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U16(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I16(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U32(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I32(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U64(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
template <typename T, size_t N, HWY_IF_I64(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F32(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F64(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <typename T, size_t N, HWY_IF_F16(T)>
|
|
HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// ------------------------------ MaskedAddOr
|
|
|
|
template <typename T, size_t N, HWY_IF_UI8(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F32(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F64(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <typename T, size_t N, HWY_IF_F16(T)>
|
|
HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// ------------------------------ MaskedSubOr
|
|
|
|
template <typename T, size_t N, HWY_IF_UI8(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F32(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_F64(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <typename T, size_t N, HWY_IF_F16(T)>
|
|
HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// ------------------------------ MaskedMulOr
|
|
|
|
// There are no elementwise integer mask_mul. Generic for all vector lengths.
|
|
template <class V, class M>
|
|
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
|
|
return IfThenElse(m, a * b, no);
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m,
|
|
Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no,
|
|
Mask128<double, N> m, Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
|
|
Mask128<float16_t, N> m,
|
|
Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// ------------------------------ MaskedDivOr
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m,
|
|
Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no,
|
|
Mask128<double, N> m, Vec128<double, N> a,
|
|
Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
|
|
Mask128<float16_t, N> m,
|
|
Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) {
|
|
return IfThenElse(m, Div(a, b), no);
|
|
}
|
|
|
|
// ------------------------------ MaskedModOr
|
|
// Generic for all vector lengths
|
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) {
|
|
return IfThenElse(m, Mod(a, b), no);
|
|
}
|
|
|
|
// ------------------------------ MaskedSatAddOr
|
|
|
|
template <typename T, size_t N, HWY_IF_I8(T)>
|
|
HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U8(T)>
|
|
HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_I16(T)>
|
|
HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U16(T)>
|
|
HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ MaskedSatSubOr
|
|
|
|
template <typename T, size_t N, HWY_IF_I8(T)>
|
|
HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U8(T)>
|
|
HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_I16(T)>
|
|
HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_U16(T)>
|
|
HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
|
|
Vec128<T, N> a, Vec128<T, N> b) {
|
|
return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ Floating-point multiply-add variants
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> MulAdd(Vec128<float16_t, N> mul,
|
|
Vec128<float16_t, N> x,
|
|
Vec128<float16_t, N> add) {
|
|
return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> NegMulAdd(Vec128<float16_t, N> mul,
|
|
Vec128<float16_t, N> x,
|
|
Vec128<float16_t, N> add) {
|
|
return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> MulSub(Vec128<float16_t, N> mul,
|
|
Vec128<float16_t, N> x,
|
|
Vec128<float16_t, N> sub) {
|
|
return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul,
|
|
Vec128<float16_t, N> x,
|
|
Vec128<float16_t, N> sub) {
|
|
return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)};
|
|
}
|
|
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
|
|
Vec128<float, N> add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return mul * x + add;
|
|
#else
|
|
return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
|
|
Vec128<double, N> add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return mul * x + add;
|
|
#else
|
|
return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Returns add - mul * x
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
|
|
Vec128<float, N> add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return add - mul * x;
|
|
#else
|
|
return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
|
|
Vec128<double, N> add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return add - mul * x;
|
|
#else
|
|
return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Returns mul * x - sub
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
|
|
Vec128<float, N> sub) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return mul * x - sub;
|
|
#else
|
|
return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
|
|
Vec128<double, N> sub) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return mul * x - sub;
|
|
#else
|
|
return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Returns -mul * x - sub
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
|
|
Vec128<float, N> sub) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return Neg(mul) * x - sub;
|
|
#else
|
|
return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
|
|
Vec128<double, N> sub) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return Neg(mul) * x - sub;
|
|
#else
|
|
return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N, HWY_IF_LANES_GT(N, 1)>
|
|
HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
|
|
Vec128<float16_t, N> x,
|
|
Vec128<float16_t, N> sub_or_add) {
|
|
return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <size_t N, HWY_IF_LANES_GT(N, 1)>
|
|
HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x,
|
|
Vec128<float, N> sub_or_add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return AddSub(mul * x, sub_or_add);
|
|
#else
|
|
return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x,
|
|
Vec128<double> sub_or_add) {
|
|
#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
|
|
return AddSub(mul * x, sub_or_add);
|
|
#else
|
|
return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
|
|
#endif
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_SSSE3
|
|
|
|
// ------------------------------ Floating-point square root
|
|
|
|
// Full precision square root
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) {
|
|
return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
|
|
}
|
|
HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) {
|
|
return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) {
|
|
return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
|
|
}
|
|
HWY_API Vec64<double> Sqrt(Vec64<double> v) {
|
|
return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
|
|
}
|
|
|
|
// Approximate reciprocal square root
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
|
|
return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
|
|
}
|
|
HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) {
|
|
return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
|
|
#undef HWY_NATIVE_F64_APPROX_RSQRT
|
|
#else
|
|
#define HWY_NATIVE_F64_APPROX_RSQRT
|
|
#endif
|
|
|
|
HWY_API Vec64<double> ApproximateReciprocalSqrt(Vec64<double> v) {
|
|
return Vec64<double>{_mm_rsqrt14_sd(v.raw, v.raw)};
|
|
}
|
|
HWY_API Vec128<double> ApproximateReciprocalSqrt(Vec128<double> v) {
|
|
#if HWY_COMPILER_MSVC
|
|
const DFromV<decltype(v)> d;
|
|
return Vec128<double>{_mm_mask_rsqrt14_pd(
|
|
Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)};
|
|
#else
|
|
return Vec128<double>{_mm_rsqrt14_pd(v.raw)};
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
// ------------------------------ Min (Gt, IfThenElse)
|
|
|
|
namespace detail {
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a,
|
|
const Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
|
|
const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
|
|
return IfThenElse(gt, b, a);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return detail::MinU(a, b);
|
|
#else
|
|
return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return detail::MinU(a, b);
|
|
#else
|
|
return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
|
|
#else
|
|
return detail::MinU(a, b);
|
|
#endif
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return IfThenElse(a < b, a, b);
|
|
#else
|
|
return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return IfThenElse(a < b, a, b);
|
|
#else
|
|
return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
|
|
#else
|
|
return IfThenElse(a < b, a, b);
|
|
#endif
|
|
}
|
|
|
|
// Float
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Min(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ------------------------------ Max (Gt, IfThenElse)
|
|
|
|
namespace detail {
|
|
template <typename T, size_t N>
|
|
HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a,
|
|
const Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
|
|
const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
|
|
return IfThenElse(gt, a, b);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Unsigned
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
|
|
return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return detail::MaxU(a, b);
|
|
#else
|
|
return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return detail::MaxU(a, b);
|
|
#else
|
|
return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
|
|
#else
|
|
return detail::MaxU(a, b);
|
|
#endif
|
|
}
|
|
|
|
// Signed
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return IfThenElse(a < b, b, a);
|
|
#else
|
|
return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return IfThenElse(a < b, b, a);
|
|
#else
|
|
return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
|
|
#else
|
|
return IfThenElse(a < b, b, a);
|
|
#endif
|
|
}
|
|
|
|
// Float
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Max(Vec128<float16_t, N> a,
|
|
Vec128<float16_t, N> b) {
|
|
return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
|
|
return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
|
|
return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// ================================================== MEMORY (3)
|
|
|
|
// ------------------------------ Non-temporal stores
|
|
|
|
// On clang6, we see incorrect code generated for _mm_stream_pi, so
|
|
// round even partial vectors up to 16 bytes.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
_mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) {
|
|
_mm_stream_ps(aligned, v.raw);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) {
|
|
_mm_stream_pd(aligned, v.raw);
|
|
}
|
|
|
|
// ------------------------------ Scatter
|
|
|
|
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
|
|
// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
|
|
using GatherIndex64 = long long int; // NOLINT(runtime/int)
|
|
static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
#ifdef HWY_NATIVE_SCATTER
|
|
#undef HWY_NATIVE_SCATTER
|
|
#else
|
|
#define HWY_NATIVE_SCATTER
|
|
#endif
|
|
|
|
namespace detail {
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
|
|
HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
VI index) {
|
|
if (d.MaxBytes() == 16) {
|
|
_mm_i32scatter_epi32(base, index.raw, v.raw, kScale);
|
|
} else {
|
|
const __mmask8 mask = (1u << MaxLanes(d)) - 1;
|
|
_mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale);
|
|
}
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
|
|
HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
VI index) {
|
|
if (d.MaxBytes() == 16) {
|
|
_mm_i64scatter_epi64(base, index.raw, v.raw, kScale);
|
|
} else {
|
|
const __mmask8 mask = (1u << MaxLanes(d)) - 1;
|
|
_mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale);
|
|
}
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
|
|
HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base,
|
|
VI index) {
|
|
if (d.MaxBytes() == 16) {
|
|
_mm_i32scatter_ps(base, index.raw, v.raw, kScale);
|
|
} else {
|
|
const __mmask8 mask = (1u << MaxLanes(d)) - 1;
|
|
_mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale);
|
|
}
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
|
|
HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base,
|
|
VI index) {
|
|
if (d.MaxBytes() == 16) {
|
|
_mm_i64scatter_pd(base, index.raw, v.raw, kScale);
|
|
} else {
|
|
const __mmask8 mask = (1u << MaxLanes(d)) - 1;
|
|
_mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale);
|
|
}
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
|
|
HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT base, VI index) {
|
|
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
|
|
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
|
|
_mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale);
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
|
|
HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT base, VI index) {
|
|
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
|
|
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
|
|
_mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale);
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
|
|
HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
|
|
float* HWY_RESTRICT base, VI index) {
|
|
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
|
|
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
|
|
_mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale);
|
|
}
|
|
|
|
template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
|
|
HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
|
|
double* HWY_RESTRICT base, VI index) {
|
|
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
|
|
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
|
|
_mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API void ScatterOffset(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> offset) {
|
|
return detail::NativeScatter128<1>(v, d, base, offset);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> index) {
|
|
return detail::NativeScatter128<sizeof(TFromD<D>)>(v, d, base, index);
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> index) {
|
|
return detail::NativeMaskedScatter128<sizeof(TFromD<D>)>(v, m, d, base,
|
|
index);
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ Gather (Load/Store)
|
|
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
|
|
#ifdef HWY_NATIVE_GATHER
|
|
#undef HWY_NATIVE_GATHER
|
|
#else
|
|
#define HWY_NATIVE_GATHER
|
|
#endif
|
|
|
|
namespace detail {
|
|
|
|
template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
|
|
Vec128<int32_t, N> indices) {
|
|
return Vec128<T, N>{_mm_i32gather_epi32(
|
|
reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
|
|
}
|
|
|
|
template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
|
|
Vec128<int64_t, N> indices) {
|
|
return Vec128<T, N>{_mm_i64gather_epi64(
|
|
reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
|
|
}
|
|
|
|
template <int kScale, size_t N>
|
|
HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base,
|
|
Vec128<int32_t, N> indices) {
|
|
return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)};
|
|
}
|
|
|
|
template <int kScale, size_t N>
|
|
HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base,
|
|
Vec128<int64_t, N> indices) {
|
|
return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)};
|
|
}
|
|
|
|
template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
|
|
Mask128<T, N> m,
|
|
const T* HWY_RESTRICT base,
|
|
Vec128<int32_t, N> indices) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<T, N>{_mm_mmask_i32gather_epi32(
|
|
no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
|
|
kScale)};
|
|
#else
|
|
return Vec128<T, N>{
|
|
_mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base),
|
|
indices.raw, m.raw, kScale)};
|
|
#endif
|
|
}
|
|
|
|
template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
|
|
Mask128<T, N> m,
|
|
const T* HWY_RESTRICT base,
|
|
Vec128<int64_t, N> indices) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<T, N>{_mm_mmask_i64gather_epi64(
|
|
no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
|
|
kScale)};
|
|
#else
|
|
return Vec128<T, N>{_mm_mask_i64gather_epi64(
|
|
no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw,
|
|
kScale)};
|
|
#endif
|
|
}
|
|
|
|
template <int kScale, size_t N>
|
|
HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128(
|
|
Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base,
|
|
Vec128<int32_t, N> indices) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<float, N>{
|
|
_mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
|
|
#else
|
|
return Vec128<float, N>{
|
|
_mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
|
|
#endif
|
|
}
|
|
|
|
template <int kScale, size_t N>
|
|
HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
|
|
Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base,
|
|
Vec128<int64_t, N> indices) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<double, N>{
|
|
_mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
|
|
#else
|
|
return Vec128<double, N>{
|
|
_mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
|
|
#endif
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> offsets) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
(void)di; // for HWY_DASSERT
|
|
HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
|
|
return detail::NativeGather128<1>(base, offsets);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
|
|
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> indices) {
|
|
const RebindToSigned<decltype(d)> di;
|
|
(void)di; // for HWY_DASSERT
|
|
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
return detail::NativeGather128<sizeof(T)>(base, indices);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
|
|
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
|
|
const T* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> indices) {
|
|
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
|
|
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
|
|
|
|
const RebindToSigned<decltype(d)> di;
|
|
(void)di; // for HWY_DASSERT
|
|
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
|
|
const TFromD<D>* HWY_RESTRICT base,
|
|
VFromD<RebindToSigned<D>> indices) {
|
|
return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX2
|
|
|
|
HWY_DIAGNOSTICS(pop)
|
|
|
|
// ================================================== SWIZZLE (2)
|
|
|
|
// ------------------------------ LowerHalf
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
|
|
return VFromD<D>{v.raw};
|
|
}
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
|
|
return Vec128<T, N / 2>{v.raw};
|
|
}
|
|
|
|
// ------------------------------ ShiftLeftBytes
|
|
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return BitCast(
|
|
d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)});
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <int kBytes, class V>
|
|
HWY_API V ShiftLeftBytes(const V v) {
|
|
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
}
|
|
|
|
// ------------------------------ ShiftLeftLanes
|
|
|
|
// Generic for all vector lengths.
|
|
template <int kLanes, class D>
|
|
HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <int kLanes, class V>
|
|
HWY_API V ShiftLeftLanes(const V v) {
|
|
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
|
|
}
|
|
|
|
// ------------------------------ ShiftRightBytes
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
|
|
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
// For partial vectors, clear upper lanes so we shift in zeros.
|
|
if (d.MaxBytes() != 16) {
|
|
const Full128<TFromD<D>> dfull;
|
|
const VFromD<decltype(dfull)> vfull{v.raw};
|
|
v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
|
|
}
|
|
return BitCast(
|
|
d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)});
|
|
}
|
|
|
|
// ------------------------------ ShiftRightLanes
|
|
// Generic for all vector lengths.
|
|
template <int kLanes, class D>
|
|
HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
|
|
return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
|
|
}
|
|
|
|
// ------------------------------ UpperHalf (ShiftRightBytes)
|
|
|
|
// Full input: copy hi into lo (smaller instruction encoding than shifts).
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
const Twice<RebindToUnsigned<decltype(d)>> dut;
|
|
using VUT = VFromD<decltype(dut)>; // for float16_t
|
|
const VUT vut = BitCast(dut, v);
|
|
return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)}));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
|
|
return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
|
|
HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
|
|
return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
|
|
}
|
|
|
|
// Partial
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
|
|
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
|
|
}
|
|
|
|
// ------------------------------ ExtractLane (UpperHalf)
|
|
|
|
namespace detail {
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const int pair = _mm_extract_epi16(v.raw, kLane / 2);
|
|
constexpr int kShift = kLane & 1 ? 8 : 0;
|
|
return static_cast<T>((pair >> kShift) & 0xFF);
|
|
#else
|
|
return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const uint16_t lane = static_cast<uint16_t>(
|
|
_mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF);
|
|
return BitCastScalar<T>(lane);
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return static_cast<T>(_mm_cvtsi128_si32(
|
|
(kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane)));
|
|
#else
|
|
return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_ARCH_X86_32
|
|
alignas(16) T lanes[2];
|
|
Store(v, DFromV<decltype(v)>(), lanes);
|
|
return lanes[kLane];
|
|
#elif HWY_TARGET >= HWY_SSSE3
|
|
return static_cast<T>(
|
|
_mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE)));
|
|
#else
|
|
return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, size_t N>
|
|
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return _mm_cvtss_f32((kLane == 0) ? v.raw
|
|
: _mm_shuffle_ps(v.raw, v.raw, kLane));
|
|
#else
|
|
// Bug in the intrinsic, returns int but should be float.
|
|
const int32_t bits = _mm_extract_ps(v.raw, kLane);
|
|
return BitCastScalar<float>(bits);
|
|
#endif
|
|
}
|
|
|
|
// There is no extract_pd; two overloads because there is no UpperHalf for N=1.
|
|
template <size_t kLane>
|
|
HWY_INLINE double ExtractLane(const Vec64<double> v) {
|
|
static_assert(kLane == 0, "Lane index out of bounds");
|
|
return GetLane(v);
|
|
}
|
|
|
|
template <size_t kLane>
|
|
HWY_INLINE double ExtractLane(const Vec128<double> v) {
|
|
static_assert(kLane < 2, "Lane index out of bounds");
|
|
const Half<DFromV<decltype(v)>> dh;
|
|
return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Requires one overload per vector length because ExtractLane<3> may be a
|
|
// compile error if it calls _mm_extract_epi64.
|
|
template <typename T>
|
|
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
|
|
HWY_DASSERT(i == 0);
|
|
(void)i;
|
|
return GetLane(v);
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::ExtractLane<0>(v);
|
|
case 1:
|
|
return detail::ExtractLane<1>(v);
|
|
}
|
|
}
|
|
#endif
|
|
alignas(16) T lanes[2];
|
|
Store(v, DFromV<decltype(v)>(), lanes);
|
|
return lanes[i];
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::ExtractLane<0>(v);
|
|
case 1:
|
|
return detail::ExtractLane<1>(v);
|
|
case 2:
|
|
return detail::ExtractLane<2>(v);
|
|
case 3:
|
|
return detail::ExtractLane<3>(v);
|
|
}
|
|
}
|
|
#endif
|
|
alignas(16) T lanes[4];
|
|
Store(v, DFromV<decltype(v)>(), lanes);
|
|
return lanes[i];
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::ExtractLane<0>(v);
|
|
case 1:
|
|
return detail::ExtractLane<1>(v);
|
|
case 2:
|
|
return detail::ExtractLane<2>(v);
|
|
case 3:
|
|
return detail::ExtractLane<3>(v);
|
|
case 4:
|
|
return detail::ExtractLane<4>(v);
|
|
case 5:
|
|
return detail::ExtractLane<5>(v);
|
|
case 6:
|
|
return detail::ExtractLane<6>(v);
|
|
case 7:
|
|
return detail::ExtractLane<7>(v);
|
|
}
|
|
}
|
|
#endif
|
|
alignas(16) T lanes[8];
|
|
Store(v, DFromV<decltype(v)>(), lanes);
|
|
return lanes[i];
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::ExtractLane<0>(v);
|
|
case 1:
|
|
return detail::ExtractLane<1>(v);
|
|
case 2:
|
|
return detail::ExtractLane<2>(v);
|
|
case 3:
|
|
return detail::ExtractLane<3>(v);
|
|
case 4:
|
|
return detail::ExtractLane<4>(v);
|
|
case 5:
|
|
return detail::ExtractLane<5>(v);
|
|
case 6:
|
|
return detail::ExtractLane<6>(v);
|
|
case 7:
|
|
return detail::ExtractLane<7>(v);
|
|
case 8:
|
|
return detail::ExtractLane<8>(v);
|
|
case 9:
|
|
return detail::ExtractLane<9>(v);
|
|
case 10:
|
|
return detail::ExtractLane<10>(v);
|
|
case 11:
|
|
return detail::ExtractLane<11>(v);
|
|
case 12:
|
|
return detail::ExtractLane<12>(v);
|
|
case 13:
|
|
return detail::ExtractLane<13>(v);
|
|
case 14:
|
|
return detail::ExtractLane<14>(v);
|
|
case 15:
|
|
return detail::ExtractLane<15>(v);
|
|
}
|
|
}
|
|
#endif
|
|
alignas(16) T lanes[16];
|
|
Store(v, DFromV<decltype(v)>(), lanes);
|
|
return lanes[i];
|
|
}
|
|
|
|
// ------------------------------ InsertLane (UpperHalf)
|
|
|
|
namespace detail {
|
|
|
|
template <class V>
|
|
HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) {
|
|
const DFromV<decltype(v)> d;
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw);
|
|
const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)};
|
|
#else
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using TU = TFromD<decltype(du)>;
|
|
const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i)));
|
|
#endif
|
|
|
|
return IfThenElse(mask, Set(d, t), v);
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
|
|
#else
|
|
return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const uint16_t bits = BitCastScalar<uint16_t>(t);
|
|
return BitCast(d, VFromD<decltype(du)>{
|
|
_mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
|
|
#else
|
|
const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
|
|
return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
const auto vt = BitCast(df, Set(d, t));
|
|
if (kLane == 0) {
|
|
return BitCast(
|
|
d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)});
|
|
}
|
|
return BitCast(
|
|
d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)});
|
|
#else
|
|
const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
|
|
return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
|
|
#endif
|
|
}
|
|
|
|
template <size_t kLane, size_t N>
|
|
HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
|
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
|
|
#else
|
|
return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
|
|
#endif
|
|
}
|
|
|
|
// There is no insert_pd; two overloads because there is no UpperHalf for N=1.
|
|
template <size_t kLane>
|
|
HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
|
|
static_assert(kLane == 0, "Lane index out of bounds");
|
|
return Set(DFromV<decltype(v)>(), t);
|
|
}
|
|
|
|
template <size_t kLane>
|
|
HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
|
|
static_assert(kLane < 2, "Lane index out of bounds");
|
|
const DFromV<decltype(v)> d;
|
|
const Vec128<double> vt = Set(d, t);
|
|
if (kLane == 0) {
|
|
return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
|
|
}
|
|
return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Requires one overload per vector length because InsertLane<3> may be a
|
|
// compile error if it calls _mm_insert_epi64.
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
|
|
HWY_DASSERT(i == 0);
|
|
(void)i;
|
|
return Set(DFromV<decltype(v)>(), t);
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::InsertLane<0>(v, t);
|
|
case 1:
|
|
return detail::InsertLane<1>(v, t);
|
|
}
|
|
}
|
|
#endif
|
|
return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::InsertLane<0>(v, t);
|
|
case 1:
|
|
return detail::InsertLane<1>(v, t);
|
|
case 2:
|
|
return detail::InsertLane<2>(v, t);
|
|
case 3:
|
|
return detail::InsertLane<3>(v, t);
|
|
}
|
|
}
|
|
#endif
|
|
return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::InsertLane<0>(v, t);
|
|
case 1:
|
|
return detail::InsertLane<1>(v, t);
|
|
case 2:
|
|
return detail::InsertLane<2>(v, t);
|
|
case 3:
|
|
return detail::InsertLane<3>(v, t);
|
|
case 4:
|
|
return detail::InsertLane<4>(v, t);
|
|
case 5:
|
|
return detail::InsertLane<5>(v, t);
|
|
case 6:
|
|
return detail::InsertLane<6>(v, t);
|
|
case 7:
|
|
return detail::InsertLane<7>(v, t);
|
|
}
|
|
}
|
|
#endif
|
|
return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(i)) {
|
|
switch (i) {
|
|
case 0:
|
|
return detail::InsertLane<0>(v, t);
|
|
case 1:
|
|
return detail::InsertLane<1>(v, t);
|
|
case 2:
|
|
return detail::InsertLane<2>(v, t);
|
|
case 3:
|
|
return detail::InsertLane<3>(v, t);
|
|
case 4:
|
|
return detail::InsertLane<4>(v, t);
|
|
case 5:
|
|
return detail::InsertLane<5>(v, t);
|
|
case 6:
|
|
return detail::InsertLane<6>(v, t);
|
|
case 7:
|
|
return detail::InsertLane<7>(v, t);
|
|
case 8:
|
|
return detail::InsertLane<8>(v, t);
|
|
case 9:
|
|
return detail::InsertLane<9>(v, t);
|
|
case 10:
|
|
return detail::InsertLane<10>(v, t);
|
|
case 11:
|
|
return detail::InsertLane<11>(v, t);
|
|
case 12:
|
|
return detail::InsertLane<12>(v, t);
|
|
case 13:
|
|
return detail::InsertLane<13>(v, t);
|
|
case 14:
|
|
return detail::InsertLane<14>(v, t);
|
|
case 15:
|
|
return detail::InsertLane<15>(v, t);
|
|
}
|
|
}
|
|
#endif
|
|
return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
|
|
}
|
|
|
|
// ------------------------------ CombineShiftRightBytes
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
static_assert(0 < kBytes && kBytes < 16, "kBytes invalid");
|
|
return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
|
|
}
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
constexpr size_t kSize = d.MaxBytes();
|
|
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
|
|
|
|
const Twice<decltype(d)> dt;
|
|
return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw};
|
|
}
|
|
#else
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
|
|
BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
|
|
}
|
|
|
|
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
constexpr size_t kSize = d.MaxBytes();
|
|
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
using V8 = Vec128<uint8_t>;
|
|
const DFromV<V8> dfull8;
|
|
const Repartition<TFromD<D>, decltype(dfull8)> dfull;
|
|
const V8 hi8{BitCast(d8, hi).raw};
|
|
// Move into most-significant bytes
|
|
const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
|
|
const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
|
|
return VFromD<D>{BitCast(dfull, r).raw};
|
|
}
|
|
#endif
|
|
|
|
// ------------------------------ Broadcast/splat any lane
|
|
|
|
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const VU vu = BitCast(du, v); // for float16_t
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
if (kLane < 4) {
|
|
const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
|
|
return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)});
|
|
} else {
|
|
const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
|
|
return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)});
|
|
}
|
|
}
|
|
|
|
template <int kLane, typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
|
|
}
|
|
|
|
template <int kLane, typename T, size_t N, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
|
|
}
|
|
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
|
|
}
|
|
|
|
template <int kLane, size_t N>
|
|
HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
|
|
}
|
|
|
|
// ------------------------------ TableLookupLanes (Shuffle01)
|
|
|
|
// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
|
|
template <typename T, size_t N = 16 / sizeof(T)>
|
|
struct Indices128 {
|
|
__m128i raw;
|
|
};
|
|
|
|
template <class D, typename T = TFromD<D>, typename TI, size_t kN,
|
|
HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
|
|
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
|
|
#if HWY_IS_DEBUG_BUILD
|
|
const Rebind<TI, decltype(d)> di;
|
|
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
|
|
AllTrue(di, Lt(vec, Set(di, kN * 2))));
|
|
#endif
|
|
|
|
// No change as byte indices are always used for 8-bit lane types
|
|
(void)d;
|
|
return Indices128<T, kN>{vec.raw};
|
|
}
|
|
|
|
template <class D, typename T = TFromD<D>, typename TI, size_t kN,
|
|
HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
|
|
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
|
|
#if HWY_IS_DEBUG_BUILD
|
|
const Rebind<TI, decltype(d)> di;
|
|
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
|
|
AllTrue(di, Lt(vec, Set(di, kN * 2))));
|
|
#endif
|
|
|
|
#if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
|
|
(void)d;
|
|
return Indices128<T, kN>{vec.raw};
|
|
#else // SSSE3, SSE4, or AVX2
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
using V8 = VFromD<decltype(d8)>;
|
|
alignas(16) static constexpr uint8_t kByteOffsets[16] = {
|
|
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
|
|
|
|
// Broadcast each lane index to all 4 bytes of T
|
|
alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
|
|
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
|
|
const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
|
|
|
|
// Shift to bytes
|
|
const Repartition<uint16_t, decltype(d)> d16;
|
|
const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices)));
|
|
|
|
return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
|
|
#endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
|
|
}
|
|
|
|
template <class D, typename T = TFromD<D>, typename TI, size_t kN,
|
|
HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
|
|
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
|
|
#if HWY_IS_DEBUG_BUILD
|
|
const Rebind<TI, decltype(d)> di;
|
|
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
|
|
AllTrue(di, Lt(vec, Set(di, kN * 2))));
|
|
#endif
|
|
|
|
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
|
|
(void)d;
|
|
return Indices128<T, kN>{vec.raw};
|
|
#else
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
using V8 = VFromD<decltype(d8)>;
|
|
alignas(16) static constexpr uint8_t kByteOffsets[16] = {
|
|
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
|
|
|
|
// Broadcast each lane index to all 4 bytes of T
|
|
alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
|
|
0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
|
|
const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
|
|
|
|
// Shift to bytes
|
|
const Repartition<uint16_t, decltype(d)> d16;
|
|
const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
|
|
|
|
return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
|
|
#endif
|
|
}
|
|
|
|
template <class D, typename T = TFromD<D>, typename TI, size_t kN,
|
|
HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
|
|
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
|
|
#if HWY_IS_DEBUG_BUILD
|
|
const Rebind<TI, decltype(d)> di;
|
|
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
|
|
AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
// No change - even without AVX3, we can shuffle+blend.
|
|
return Indices128<T, kN>{vec.raw};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
|
|
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
|
|
D d, const TI* idx) {
|
|
static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
|
|
const Rebind<TI, decltype(d)> di;
|
|
return IndicesFromVec(d, LoadU(di, idx));
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
|
|
return TableLookupBytes(v, Vec128<T, N>{idx.raw});
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI16(T)>
|
|
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return {_mm_permutexvar_epi16(idx.raw, v.raw)};
|
|
#elif HWY_TARGET == HWY_SSE2
|
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16)));
|
|
return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
|
|
__builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw),
|
|
reinterpret_cast<GccU16RawVectType>(idx.raw)))};
|
|
#else
|
|
const Full128<T> d_full;
|
|
alignas(16) T src_lanes[8];
|
|
alignas(16) uint16_t indices[8];
|
|
alignas(16) T result_lanes[8];
|
|
|
|
Store(Vec128<T>{v.raw}, d_full, src_lanes);
|
|
_mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
|
|
|
|
for (int i = 0; i < 8; i++) {
|
|
result_lanes[i] = src_lanes[indices[i] & 7u];
|
|
}
|
|
|
|
return Vec128<T, N>{Load(d_full, result_lanes).raw};
|
|
#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
#else
|
|
return TableLookupBytes(v, Vec128<T, N>{idx.raw});
|
|
#endif
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 2)>
|
|
HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v,
|
|
Indices128<float16_t, N> idx) {
|
|
return {_mm_permutexvar_ph(idx.raw, v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
|
|
return BitCast(d, perm);
|
|
#elif HWY_TARGET == HWY_SSE2
|
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
|
|
return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
|
|
__builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v.raw),
|
|
reinterpret_cast<GccU32RawVectType>(idx.raw)))};
|
|
#else
|
|
const Full128<T> d_full;
|
|
alignas(16) T src_lanes[4];
|
|
alignas(16) uint32_t indices[4];
|
|
alignas(16) T result_lanes[4];
|
|
|
|
Store(Vec128<T>{v.raw}, d_full, src_lanes);
|
|
_mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
result_lanes[i] = src_lanes[indices[i] & 3u];
|
|
}
|
|
|
|
return Vec128<T, N>{Load(d_full, result_lanes).raw};
|
|
#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
#else // SSSE3 or SSE4
|
|
return TableLookupBytes(v, Vec128<T, N>{idx.raw});
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
|
|
HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
|
|
Indices128<float, N> idx) {
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
|
|
#else // SSSE3 or SSE4
|
|
const DFromV<decltype(v)> df;
|
|
const RebindToSigned<decltype(df)> di;
|
|
return BitCast(df,
|
|
TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
|
|
#endif // HWY_TARGET <= HWY_AVX2
|
|
}
|
|
#endif // HWY_TARGET <= HWY_SSSE3
|
|
|
|
// Single lane: no change
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
|
|
Indices128<T, 1> /* idx */) {
|
|
return v;
|
|
}
|
|
|
|
template <typename T, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
|
|
const DFromV<decltype(v)> d;
|
|
Vec128<int64_t> vidx{idx.raw};
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
// There is no _mm_permute[x]var_epi64.
|
|
vidx += vidx; // bit1 is the decider (unusual)
|
|
const RebindToFloat<decltype(d)> df;
|
|
return BitCast(
|
|
d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
|
|
#else
|
|
// Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
|
|
// comparison (expensive on SSSE3), just invert the upper lane and subtract 1
|
|
// to obtain an all-zero or all-one mask.
|
|
const RebindToSigned<decltype(d)> di;
|
|
const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
|
|
const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
|
|
return IfThenElse(mask_same, v, Shuffle01(v));
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
|
|
Indices128<double> idx) {
|
|
Vec128<int64_t> vidx{idx.raw};
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
vidx += vidx; // bit1 is the decider (unusual)
|
|
return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
|
|
#else
|
|
// Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
|
|
// comparison (expensive on SSSE3), just invert the upper lane and subtract 1
|
|
// to obtain an all-zero or all-one mask.
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
|
|
const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
|
|
return IfThenElse(mask_same, v, Shuffle01(v));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ ReverseBlocks
|
|
|
|
// Single block: no change
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
|
|
return v;
|
|
}
|
|
|
|
// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
|
|
|
|
// Single lane: no change
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
|
|
return v;
|
|
}
|
|
|
|
// 32-bit x2: shuffle
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
|
|
return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw};
|
|
}
|
|
|
|
// 64-bit x2: shuffle
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
|
|
return Shuffle01(v);
|
|
}
|
|
|
|
// 32-bit x4: shuffle
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
|
|
return Shuffle0123(v);
|
|
}
|
|
|
|
// 16-bit
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
|
|
HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const VU vu = BitCast(du, v); // for float16_t
|
|
constexpr size_t kN = MaxLanes(d);
|
|
if (kN == 1) return v;
|
|
if (kN == 2) {
|
|
return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))});
|
|
}
|
|
if (kN == 4) {
|
|
return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
|
|
}
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const VU rev4{
|
|
_mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
|
|
_MM_SHUFFLE(0, 1, 2, 3))};
|
|
return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
|
|
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1),
|
|
HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
|
|
constexpr int kN = static_cast<int>(MaxLanes(d));
|
|
if (kN == 1) return v;
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
// NOTE: Lanes with negative shuffle control mask values are set to zero.
|
|
alignas(16) static constexpr int8_t kReverse[16] = {
|
|
kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8,
|
|
kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
|
|
const RebindToSigned<decltype(d)> di;
|
|
const VFromD<decltype(di)> idx = Load(di, kReverse);
|
|
return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)};
|
|
#else
|
|
const RepartitionToWide<decltype(d)> d16;
|
|
return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v))));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Reverse2
|
|
|
|
// Single lane: no change
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
|
|
return v;
|
|
}
|
|
|
|
// Generic for all vector lengths (128-bit sufficient if SSE2).
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
return BitCast(d, RotateRight<16>(BitCast(du32, v)));
|
|
#elif HWY_TARGET == HWY_SSE2
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const VU vu = BitCast(du, v); // for float16_t
|
|
constexpr size_t kN = MaxLanes(d);
|
|
__m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1));
|
|
if (kN > 4) {
|
|
shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1));
|
|
}
|
|
return BitCast(d, VU{shuf_result});
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
|
|
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
#endif
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
|
|
return Shuffle2301(v);
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
|
|
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
|
|
return Shuffle01(v);
|
|
}
|
|
|
|
// ------------------------------ Reverse4
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const VU vu = BitCast(du, v); // for float16_t
|
|
// 4x 16-bit: a single shufflelo suffices.
|
|
constexpr size_t kN = MaxLanes(d);
|
|
if (kN <= 4) {
|
|
return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
|
|
}
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
return BitCast(d, VU{_mm_shufflehi_epi16(
|
|
_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
|
|
_MM_SHUFFLE(0, 1, 2, 3))});
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
|
|
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
#endif
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
|
|
return Shuffle0123(v);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
|
|
HWY_ASSERT(0); // don't have 4 u64 lanes
|
|
}
|
|
|
|
// ------------------------------ Reverse8
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
|
|
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
|
|
HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
|
|
HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit
|
|
}
|
|
|
|
// ------------------------------ ReverseBits in x86_512
|
|
|
|
// ------------------------------ InterleaveUpper (UpperHalf)
|
|
|
|
// Full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using VU = VFromD<decltype(du)>; // for float16_t
|
|
return BitCast(
|
|
d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)};
|
|
}
|
|
|
|
// Partial
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
|
|
const Half<decltype(d)> d2;
|
|
return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
|
|
VFromD<D>{UpperHalf(d2, b).raw});
|
|
}
|
|
|
|
// -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper)
|
|
|
|
template <int kLane, class T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
|
|
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
const DFromV<decltype(v)> d;
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Full128<T> d_full;
|
|
const Vec128<T> v_full{v.raw};
|
|
const auto v_interleaved = (kLane < 8)
|
|
? InterleaveLower(d_full, v_full, v_full)
|
|
: InterleaveUpper(d_full, v_full, v_full);
|
|
return ResizeBitCast(
|
|
d, Broadcast<kLane & 7>(BitCast(Full128<uint16_t>(), v_interleaved)));
|
|
#else
|
|
return TableLookupBytes(v, Set(d, static_cast<T>(kLane)));
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
|
|
|
|
// Same as Interleave*, except that the return lanes are double-width integers;
|
|
// this is necessary because the single-lane scalar cannot return two values.
|
|
// Generic for all vector lengths.
|
|
template <class V, class DW = RepartitionToWide<DFromV<V>>>
|
|
HWY_API VFromD<DW> ZipLower(V a, V b) {
|
|
return BitCast(DW(), InterleaveLower(a, b));
|
|
}
|
|
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
|
|
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
|
|
return BitCast(dw, InterleaveLower(D(), a, b));
|
|
}
|
|
|
|
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
|
|
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
|
|
return BitCast(dw, InterleaveUpper(D(), a, b));
|
|
}
|
|
|
|
// ------------------------------ Per4LaneBlockShuffle
|
|
namespace detail {
|
|
|
|
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
|
|
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
|
|
#else
|
|
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
|
|
#endif
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
|
|
const uint32_t x2,
|
|
const uint32_t x1,
|
|
const uint32_t x0) {
|
|
return ResizeBitCast(
|
|
d, Vec128<uint32_t>{_mm_set_epi32(
|
|
static_cast<int32_t>(x3), static_cast<int32_t>(x2),
|
|
static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
|
|
}
|
|
|
|
template <size_t kIdx3210, class V>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
|
|
hwy::SizeTag<2> /*lane_size_tag*/,
|
|
hwy::SizeTag<8> /*vect_size_tag*/, V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d,
|
|
VFromD<decltype(du)>{_mm_shufflelo_epi16(
|
|
BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
|
|
}
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
template <size_t kIdx3210, class V>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
|
|
hwy::SizeTag<2> /*lane_size_tag*/,
|
|
hwy::SizeTag<16> /*vect_size_tag*/, V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
|
|
return BitCast(
|
|
d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
|
|
_mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)});
|
|
}
|
|
|
|
template <size_t kIdx3210, size_t kVectSize, class V,
|
|
hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* = nullptr>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
|
|
hwy::SizeTag<1> /*lane_size_tag*/,
|
|
hwy::SizeTag<kVectSize> /*vect_size_tag*/,
|
|
V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
|
|
const auto vu16 = PromoteTo(du16, BitCast(du, v));
|
|
const auto shuf16_result = Per4LaneBlockShuffle(
|
|
idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<kVectSize * 2>(), vu16);
|
|
return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result)));
|
|
}
|
|
|
|
template <size_t kIdx3210, size_t kVectSize, class V>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
|
|
hwy::SizeTag<1> /*lane_size_tag*/,
|
|
hwy::SizeTag<16> /*vect_size_tag*/, V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
|
|
const auto zero = Zero(d);
|
|
const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero));
|
|
const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero));
|
|
|
|
const auto lo_shuf_result = Per4LaneBlockShuffle(
|
|
idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16);
|
|
const auto hi_shuf_result = Per4LaneBlockShuffle(
|
|
idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16);
|
|
|
|
return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result),
|
|
BitCast(di16, hi_shuf_result)));
|
|
}
|
|
#endif
|
|
|
|
template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
|
|
hwy::SizeTag<4> /*lane_size_tag*/,
|
|
hwy::SizeTag<16> /*vect_size_tag*/, V v) {
|
|
return V{_mm_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
|
|
}
|
|
|
|
template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
|
|
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
|
|
hwy::SizeTag<4> /*lane_size_tag*/,
|
|
hwy::SizeTag<16> /*vect_size_tag*/, V v) {
|
|
return V{_mm_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// ------------------------------ SlideUpLanes
|
|
|
|
namespace detail {
|
|
|
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Full64<uint64_t> du64;
|
|
const auto vu64 = ResizeBitCast(du64, v);
|
|
return ResizeBitCast(
|
|
d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
const auto idx =
|
|
Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
|
|
return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
|
|
}
|
|
#else
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const Repartition<uint64_t, decltype(d)> du64;
|
|
constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
|
|
|
|
const auto vu64 = BitCast(du64, v);
|
|
const auto v_hi = IfVecThenElse(
|
|
BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
|
|
BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64);
|
|
const auto v_lo = ShiftLeftBytes<8>(du64, v_hi);
|
|
|
|
const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
|
|
return BitCast(
|
|
d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt)));
|
|
}
|
|
#endif
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
|
|
return v;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
|
|
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftLeftLanes<1>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideUpLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
|
|
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftLeftLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftLeftLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftLeftLanes<3>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideUpLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
|
|
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftLeftLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftLeftLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftLeftLanes<3>(d, v);
|
|
case 4:
|
|
return ShiftLeftLanes<4>(d, v);
|
|
case 5:
|
|
return ShiftLeftLanes<5>(d, v);
|
|
case 6:
|
|
return ShiftLeftLanes<6>(d, v);
|
|
case 7:
|
|
return ShiftLeftLanes<7>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideUpLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
|
|
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftLeftLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftLeftLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftLeftLanes<3>(d, v);
|
|
case 4:
|
|
return ShiftLeftLanes<4>(d, v);
|
|
case 5:
|
|
return ShiftLeftLanes<5>(d, v);
|
|
case 6:
|
|
return ShiftLeftLanes<6>(d, v);
|
|
case 7:
|
|
return ShiftLeftLanes<7>(d, v);
|
|
case 8:
|
|
return ShiftLeftLanes<8>(d, v);
|
|
case 9:
|
|
return ShiftLeftLanes<9>(d, v);
|
|
case 10:
|
|
return ShiftLeftLanes<10>(d, v);
|
|
case 11:
|
|
return ShiftLeftLanes<11>(d, v);
|
|
case 12:
|
|
return ShiftLeftLanes<12>(d, v);
|
|
case 13:
|
|
return ShiftLeftLanes<13>(d, v);
|
|
case 14:
|
|
return ShiftLeftLanes<14>(d, v);
|
|
case 15:
|
|
return ShiftLeftLanes<15>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideUpLanes(v, amt);
|
|
}
|
|
|
|
// ------------------------------ SlideDownLanes
|
|
|
|
namespace detail {
|
|
|
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
|
|
return BitCast(d,
|
|
ShiftRightSame(BitCast(dv, v),
|
|
static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<int8_t, decltype(d)> di8;
|
|
auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
|
|
idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
|
|
return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
|
|
}
|
|
#else
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
|
|
const DFromV<decltype(v)> d;
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const Repartition<uint64_t, decltype(d)> du64;
|
|
constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
|
|
|
|
const auto vu64 = BitCast(du64, v);
|
|
const auto v_lo = IfVecThenElse(
|
|
BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
|
|
BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64);
|
|
const auto v_hi = ShiftRightBytes<8>(du64, v_lo);
|
|
|
|
const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
|
|
return BitCast(
|
|
d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt)));
|
|
}
|
|
#endif
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
|
|
return v;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
|
|
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftRightLanes<1>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideDownLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
|
|
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftRightLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftRightLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftRightLanes<3>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideDownLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
|
|
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftRightLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftRightLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftRightLanes<3>(d, v);
|
|
case 4:
|
|
return ShiftRightLanes<4>(d, v);
|
|
case 5:
|
|
return ShiftRightLanes<5>(d, v);
|
|
case 6:
|
|
return ShiftRightLanes<6>(d, v);
|
|
case 7:
|
|
return ShiftRightLanes<7>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideDownLanes(v, amt);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
|
|
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
|
|
if (__builtin_constant_p(amt)) {
|
|
switch (amt) {
|
|
case 0:
|
|
return v;
|
|
case 1:
|
|
return ShiftRightLanes<1>(d, v);
|
|
case 2:
|
|
return ShiftRightLanes<2>(d, v);
|
|
case 3:
|
|
return ShiftRightLanes<3>(d, v);
|
|
case 4:
|
|
return ShiftRightLanes<4>(d, v);
|
|
case 5:
|
|
return ShiftRightLanes<5>(d, v);
|
|
case 6:
|
|
return ShiftRightLanes<6>(d, v);
|
|
case 7:
|
|
return ShiftRightLanes<7>(d, v);
|
|
case 8:
|
|
return ShiftRightLanes<8>(d, v);
|
|
case 9:
|
|
return ShiftRightLanes<9>(d, v);
|
|
case 10:
|
|
return ShiftRightLanes<10>(d, v);
|
|
case 11:
|
|
return ShiftRightLanes<11>(d, v);
|
|
case 12:
|
|
return ShiftRightLanes<12>(d, v);
|
|
case 13:
|
|
return ShiftRightLanes<13>(d, v);
|
|
case 14:
|
|
return ShiftRightLanes<14>(d, v);
|
|
case 15:
|
|
return ShiftRightLanes<15>(d, v);
|
|
}
|
|
}
|
|
#else
|
|
(void)d;
|
|
#endif
|
|
|
|
return detail::SlideDownLanes(v, amt);
|
|
}
|
|
|
|
// ================================================== MEMORY (4)
|
|
|
|
// ------------------------------ StoreN (ExtractLane)
|
|
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
|
|
#ifdef HWY_NATIVE_STORE_N
|
|
#undef HWY_NATIVE_STORE_N
|
|
#else
|
|
#define HWY_NATIVE_STORE_N
|
|
#endif
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(
|
|
D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
|
|
(1 << 4) | (1 << 8))>
|
|
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
size_t max_lanes_to_store) {
|
|
const size_t num_lanes_to_store =
|
|
HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
|
|
|
|
#if HWY_COMPILER_MSVC
|
|
// Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
|
|
HWY_FENCE;
|
|
#endif
|
|
|
|
BlendedStore(v, FirstN(d, num_lanes_to_store), d, p);
|
|
|
|
#if HWY_COMPILER_MSVC
|
|
// Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
|
|
HWY_FENCE;
|
|
#endif
|
|
|
|
detail::MaybeUnpoison(p, num_lanes_to_store);
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
HWY_IF_LANES_D(D, 1)>
|
|
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
size_t max_lanes_to_store) {
|
|
if (max_lanes_to_store > 0) {
|
|
StoreU(v, d, p);
|
|
}
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
HWY_IF_LANES_D(D, 2)>
|
|
HWY_API void StoreN(VFromD<D> v, D /*d*/, TFromD<D>* HWY_RESTRICT p,
|
|
size_t max_lanes_to_store) {
|
|
if (max_lanes_to_store >= 1) {
|
|
p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v);
|
|
p[0] = GetLane(v);
|
|
}
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
|
|
TFromD<D>* HWY_RESTRICT p,
|
|
size_t num_lanes_to_store) {
|
|
// AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if
|
|
// (num_lanes_to_store & 3) != 0 is true
|
|
const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
|
|
if ((num_lanes_to_store & 2) != 0) {
|
|
const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
|
|
p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
|
|
CopyBytes<sizeof(uint16_t)>(&u16_bits,
|
|
p + (num_lanes_to_store & ~size_t{3}));
|
|
} else {
|
|
p[num_lanes_to_store - 1] = GetLane(v_full128);
|
|
}
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
|
|
TFromD<D>* p,
|
|
size_t num_lanes_to_store) {
|
|
// AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
|
|
// vector if (num_lanes_to_store & 1) == 1 is true
|
|
p[num_lanes_to_store - 1] = GetLane(v_trailing);
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
HWY_IF_LANES_GT_D(D, 2)>
|
|
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) {
|
|
const size_t num_lanes_to_store =
|
|
HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
|
|
|
|
const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
|
|
d_full;
|
|
const RebindToUnsigned<decltype(d_full)> du_full;
|
|
const Repartition<int32_t, decltype(d_full)> di32_full;
|
|
|
|
const auto i32_store_mask = BitCast(
|
|
di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store)));
|
|
const auto vi32 = ResizeBitCast(di32_full, v);
|
|
|
|
#if HWY_COMPILER_MSVC
|
|
// Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
|
|
HWY_FENCE;
|
|
#endif
|
|
|
|
BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full,
|
|
reinterpret_cast<int32_t*>(p));
|
|
|
|
constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
|
|
constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
|
|
const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);
|
|
|
|
if (trailing_n != 0) {
|
|
const VFromD<D> v_trailing = ResizeBitCast(
|
|
d, SlideDownLanes(di32_full, vi32,
|
|
num_lanes_to_store / kNumOfLanesPerI32));
|
|
detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
|
|
}
|
|
|
|
#if HWY_COMPILER_MSVC
|
|
// Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
|
|
HWY_FENCE;
|
|
#endif
|
|
|
|
detail::MaybeUnpoison(p, num_lanes_to_store);
|
|
}
|
|
#endif // HWY_TARGET > HWY_AVX3
|
|
#endif // HWY_TARGET <= HWY_AVX2
|
|
|
|
// ================================================== COMBINE
|
|
|
|
// ------------------------------ Combine (InterleaveLower)
|
|
|
|
// N = N/2 + N/2 (upper half undefined)
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
|
|
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
|
|
const Half<decltype(d)> dh;
|
|
const RebindToUnsigned<decltype(dh)> duh;
|
|
// Treat half-width input as one lane, and expand to two lanes.
|
|
using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
|
|
const VU lo{BitCast(duh, lo_half).raw};
|
|
const VU hi{BitCast(duh, hi_half).raw};
|
|
return BitCast(d, InterleaveLower(lo, hi));
|
|
}
|
|
|
|
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
|
|
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Half<decltype(du)> duh;
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)});
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
|
|
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
|
|
const Half<D> dh;
|
|
return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Half<decltype(du)> duh;
|
|
return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
|
|
}
|
|
#endif
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_X86_IF_EMULATED_D(D)>
|
|
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Half<decltype(du)> duh;
|
|
return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
|
|
}
|
|
|
|
// ------------------------------ Concat full (InterleaveLower)
|
|
|
|
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<uint64_t, decltype(d)> d64;
|
|
return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<uint64_t, decltype(d)> d64;
|
|
return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
return CombineShiftRightBytes<8>(d, hi, lo);
|
|
}
|
|
|
|
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
|
|
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<double, decltype(d)> dd;
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return BitCast(
|
|
d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
|
|
_MM_SHUFFLE2(1, 0))});
|
|
#else
|
|
// _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
|
|
return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
|
|
BitCast(dd, lo).raw, 1)});
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi,
|
|
Vec128<float> lo) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
(void)d;
|
|
return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
|
|
#else
|
|
// _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
|
|
const RepartitionToWide<decltype(d)> dd;
|
|
return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
|
|
BitCast(dd, lo).raw, 1)});
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi,
|
|
Vec128<double> lo) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
|
|
#else
|
|
// _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
|
|
return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ Concat partial (Combine, LowerHalf)
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Half<decltype(d)> d2;
|
|
return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Half<decltype(d)> d2;
|
|
return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
|
|
const VFromD<D> lo) {
|
|
const Half<decltype(d)> d2;
|
|
return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Half<decltype(d)> d2;
|
|
return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
|
|
}
|
|
|
|
// ------------------------------ ConcatOdd
|
|
|
|
// 8-bit full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
// Right-shift 8 bits per u16 so we can pack.
|
|
const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
|
|
const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
|
|
return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
|
|
}
|
|
|
|
// 8-bit x8
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
// Right-shift 8 bits per u16 so we can pack.
|
|
const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
|
|
const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
#else
|
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
|
|
#endif
|
|
}
|
|
|
|
// 8-bit x4
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
const Twice<decltype(dw)> dw_2;
|
|
// Right-shift 8 bits per u16 so we can pack.
|
|
const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
|
|
const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
|
|
const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
|
|
return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
|
|
#else
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
// Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
|
|
// 0xFFFF8000, which correctly saturates to 0x8000.
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Repartition<int32_t, decltype(d)> dw;
|
|
const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
|
|
const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});
|
|
}
|
|
|
|
// 16-bit x4
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
// Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
|
|
// 0xFFFF8000, which correctly saturates to 0x8000.
|
|
const Repartition<int32_t, decltype(d)> dw;
|
|
const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
|
|
const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
#else
|
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
|
|
#endif
|
|
}
|
|
|
|
// 32-bit full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const RebindToFloat<decltype(d)> df;
|
|
return BitCast(
|
|
d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
|
|
_MM_SHUFFLE(3, 1, 3, 1))});
|
|
}
|
|
|
|
// Any type x2
|
|
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
return InterleaveUpper(d, lo, hi);
|
|
}
|
|
|
|
// ------------------------------ ConcatEven (InterleaveLower)
|
|
|
|
// 8-bit full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
// Isolate lower 8 bits per u16 so we can pack.
|
|
const Vec128<uint16_t> mask = Set(dw, 0x00FF);
|
|
const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
|
|
const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
|
|
return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
|
|
}
|
|
|
|
// 8-bit x8
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
// Isolate lower 8 bits per u16 so we can pack.
|
|
const Vec64<uint16_t> mask = Set(dw, 0x00FF);
|
|
const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask);
|
|
const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask);
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
#else
|
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
|
|
#endif
|
|
}
|
|
|
|
// 8-bit x4
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
const Twice<decltype(dw)> dw_2;
|
|
// Isolate lower 8 bits per u16 so we can pack.
|
|
const Vec32<uint16_t> mask = Set(dw, 0x00FF);
|
|
const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask);
|
|
const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask);
|
|
const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
|
|
return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
|
|
#else
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
|
|
#endif
|
|
}
|
|
|
|
// 16-bit full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
// Isolate lower 16 bits per u32 so we can pack.
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
const Repartition<uint32_t, decltype(d)> dw;
|
|
const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
|
|
const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
|
|
const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
|
|
#elif HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint32_t, decltype(d)> dw;
|
|
return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
|
|
BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
|
|
#else
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
// packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
|
|
// inputs, then concatenate them.
|
|
alignas(16)
|
|
const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
|
|
const VFromD<D> shuf = BitCast(d, Load(du, kCompactEvenU16));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return ConcatLowerLower(d, H, L);
|
|
#endif
|
|
}
|
|
|
|
// 16-bit x4
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Repartition<uint32_t, decltype(d)> dw;
|
|
return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
|
|
BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
|
|
#else
|
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
// Don't care about upper half, no need to zero.
|
|
alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
|
|
const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
|
|
const VFromD<D> L = TableLookupBytes(lo, shuf);
|
|
const VFromD<D> H = TableLookupBytes(hi, shuf);
|
|
return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
|
|
#endif
|
|
}
|
|
|
|
// 32-bit full
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
const RebindToFloat<decltype(d)> df;
|
|
return BitCast(
|
|
d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
|
|
_MM_SHUFFLE(2, 0, 2, 0))});
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> ConcatEven(D /* d */, VFromD<D> hi, VFromD<D> lo) {
|
|
return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
|
|
}
|
|
|
|
// Any T x2
|
|
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
return InterleaveLower(d, lo, hi);
|
|
}
|
|
|
|
// ------------------------------ DupEven (InterleaveLower)
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) {
|
|
return v;
|
|
}
|
|
|
|
template <typename T>
|
|
HWY_API Vec128<T, 2> DupEven(const Vec128<T, 2> v) {
|
|
return InterleaveLower(DFromV<decltype(v)>(), v, v);
|
|
}
|
|
|
|
template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)>
|
|
HWY_API V DupEven(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
|
|
du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
|
|
return TableLookupBytes(v, BitCast(d, shuffle));
|
|
#else
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})),
|
|
BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v);
|
|
#endif
|
|
}
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec64<T> DupEven(const Vec64<T> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
|
|
BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class V, HWY_IF_T_SIZE_V(V, 2)>
|
|
HWY_API V DupEven(const V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
|
|
du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
|
|
return TableLookupBytes(v, BitCast(d, shuffle));
|
|
#else
|
|
return BitCast(
|
|
d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
|
|
_mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)),
|
|
_MM_SHUFFLE(2, 2, 0, 0))});
|
|
#endif
|
|
}
|
|
|
|
template <typename T, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T> DupEven(Vec128<T> v) {
|
|
return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
|
|
}
|
|
|
|
HWY_API Vec128<float> DupEven(Vec128<float> v) {
|
|
return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
|
|
}
|
|
|
|
// ------------------------------ DupOdd (InterleaveUpper)
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) {
|
|
return v;
|
|
}
|
|
|
|
template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)>
|
|
HWY_API V DupOdd(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
|
|
du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
|
|
return TableLookupBytes(v, BitCast(d, shuffle));
|
|
#else
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})),
|
|
BitCast(d, ShiftRight<8>(BitCast(du16, v))), v);
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
|
|
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
|
|
BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))});
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)>
|
|
HWY_API V DupOdd(V v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
|
|
du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
|
|
return TableLookupBytes(v, BitCast(d, shuffle));
|
|
#else
|
|
return BitCast(
|
|
d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
|
|
_mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)),
|
|
_MM_SHUFFLE(3, 3, 1, 1))});
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
|
|
return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
|
|
return Vec128<float, N>{
|
|
_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
|
|
return InterleaveUpper(DFromV<decltype(v)>(), v, v);
|
|
}
|
|
|
|
// ------------------------------ TwoTablesLookupLanes (DupEven)
|
|
|
|
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
|
|
Indices128<T, N> idx) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
// TableLookupLanes currently requires table and index vectors to be the same
|
|
// size, though a half-length index vector would be sufficient here.
|
|
#if HWY_IS_MSAN
|
|
const Vec128<T, N> idx_vec{idx.raw};
|
|
const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
|
|
#else
|
|
  // Only the lower half of idx contains valid indices; this is fine because we
  // only keep the LowerHalf of the lookup result.
|
|
const Indices128<T, N * 2> idx2{idx.raw};
|
|
#endif
|
|
return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
|
|
}
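
// Behavior sketch for the small-vector overload above (illustrative only):
// indices in [0, N) select lanes of `a` and indices in [N, 2N) select lanes of
// `b`, as if both tables were concatenated.
//   // a = {10,11,12,13}, b = {20,21,22,23}, idx = {0, 5, 3, 6}
//   // TwoTablesLookupLanes(a, b, idx) == {10, 21, 13, 22}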
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
|
|
Indices128<T> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)};
|
|
#else // AVX3 or below
|
|
const DFromV<decltype(a)> d;
|
|
const Vec128<T> idx_vec{idx.raw};
|
|
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Repartition<uint16_t, decltype(d)> du16;
|
|
const auto sel_hi_mask =
|
|
MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec))));
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto sel_hi_mask =
|
|
RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15}));
|
|
#endif
|
|
|
|
const auto lo_lookup_result = TableLookupBytes(a, idx_vec);
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const Vec128<T> lookup_result{_mm_mask_shuffle_epi8(
|
|
lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
|
|
return lookup_result;
|
|
#else
|
|
const auto hi_lookup_result = TableLookupBytes(b, idx_vec);
|
|
return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
}
|
|
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
|
|
Indices128<T> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)};
|
|
#elif HWY_TARGET == HWY_SSE2
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const Vec128<T> idx_vec{idx.raw};
|
|
const auto sel_hi_mask =
|
|
RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7}));
|
|
const auto lo_lookup_result = TableLookupLanes(a, idx);
|
|
const auto hi_lookup_result = TableLookupLanes(b, idx);
|
|
return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
|
|
#else
|
|
const DFromV<decltype(a)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
|
|
Indices128<uint8_t>{idx.raw}));
|
|
#endif
|
|
}
|
|
|
|
template <typename T, HWY_IF_UI32(T)>
|
|
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
|
|
Indices128<T> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)};
|
|
#else // AVX2 or below
|
|
const DFromV<decltype(a)> d;
|
|
|
|
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
|
|
const Vec128<T> idx_vec{idx.raw};
|
|
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
const RebindToFloat<decltype(d)> d_sel;
|
|
const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec)));
|
|
#else
|
|
const RebindToSigned<decltype(d)> d_sel;
|
|
const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3});
|
|
#endif
|
|
|
|
const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx));
|
|
const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx));
|
|
return BitCast(d,
|
|
IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
|
|
#else // SSSE3 or SSE4
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
|
|
Indices128<uint8_t>{idx.raw}));
|
|
#endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
}
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
HWY_API Vec128<float16_t> TwoTablesLookupLanes(Vec128<float16_t> a,
|
|
Vec128<float16_t> b,
|
|
Indices128<float16_t> idx) {
|
|
return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b,
|
|
Indices128<float> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)};
|
|
#elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
|
|
const DFromV<decltype(a)> d;
|
|
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
const auto sel_hi_mask =
|
|
MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw})));
|
|
#else
|
|
const RebindToSigned<decltype(d)> di;
|
|
const auto sel_hi_mask =
|
|
RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3}));
|
|
#endif
|
|
|
|
const auto lo_lookup_result = TableLookupLanes(a, idx);
|
|
const auto hi_lookup_result = TableLookupLanes(b, idx);
|
|
return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
|
|
#else // SSSE3 or SSE4
|
|
const DFromV<decltype(a)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
|
|
Indices128<uint8_t>{idx.raw}));
|
|
#endif
|
|
}
|
|
|
|
template <typename T, HWY_IF_UI64(T)>
|
|
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
|
|
Indices128<T> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)};
|
|
#else
|
|
const DFromV<decltype(a)> d;
|
|
const Vec128<T> idx_vec{idx.raw};
|
|
const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw};
|
|
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const RebindToFloat<decltype(d)> d_sel;
|
|
const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec)));
|
|
#else // SSE2 or SSSE3
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const RebindToSigned<decltype(d)> d_sel;
|
|
const auto sel_hi_mask = MaskFromVec(
|
|
BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
|
|
Set(di32, int32_t{1}))));
|
|
#endif // HWY_TARGET <= HWY_SSE4
|
|
|
|
const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod));
|
|
const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod));
|
|
return BitCast(d,
|
|
IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
}
|
|
|
|
HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b,
|
|
Indices128<double> idx) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)};
|
|
#else
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const Vec128<int64_t> idx_vec{idx.raw};
|
|
const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw};
|
|
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec)));
|
|
#else // SSE2 or SSSE3
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const auto sel_hi_mask =
|
|
MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
|
|
Set(di32, int32_t{1}))));
|
|
#endif // HWY_TARGET <= HWY_SSE4
|
|
|
|
const auto lo_lookup_result = TableLookupLanes(a, idx_mod);
|
|
const auto hi_lookup_result = TableLookupLanes(b, idx_mod);
|
|
return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
}
|
|
|
|
// ------------------------------ OddEven (IfThenElse)
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
|
|
HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
alignas(16) static constexpr uint8_t mask[16] = {
|
|
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
|
|
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
alignas(16) static constexpr uint8_t mask[16] = {
|
|
0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
|
|
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
|
|
#else
|
|
const RebindToUnsigned<decltype(d)> du; // for float16_t
|
|
return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16(
|
|
BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_UI32(T)>
|
|
HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
|
|
const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
|
|
return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
|
|
#else
|
|
// _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToFloat<decltype(d)> df;
|
|
return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
|
|
BitCast(df, b).raw, 5)});
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
// Same as ConcatUpperLower for full vectors; do not call that because this
|
|
// is more efficient for 64x1 vectors.
|
|
const DFromV<decltype(a)> d;
|
|
const RebindToFloat<decltype(d)> dd;
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return BitCast(
|
|
d, Vec128<double, N>{_mm_shuffle_pd(
|
|
BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
|
|
#else
|
|
// _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
|
|
return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
|
|
BitCast(dd, b).raw, 1)});
|
|
#endif
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
// SHUFPS must fill the lower half of the output from one input, so we
|
|
// need another shuffle. Unpack avoids another immediate byte.
|
|
const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
|
|
const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
|
|
return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
|
|
#else
|
|
return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ OddEvenBlocks
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
return even;
|
|
}
|
|
|
|
// ------------------------------ SwapAdjacentBlocks
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
|
|
return v;
|
|
}
|
|
|
|
// ------------------------------ Shl (ZipLower, Mul)
|
|
|
|
// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
// two obtained by loading float exponents, which is considerably faster
// (according to LLVM-MCA) than scalar or testing bits:
// https://gcc.godbolt.org/z/9G7Y9v.
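
// Worked example of the power-of-two trick (informal, single u16 lane): to
// shift left by k, place 0x3F80 + (k << 7) in the upper 16 bits of a binary32
// whose lower bits are zero; its exponent field is then 127 + k, so the float
// equals 2^k. For k = 3 the upper half is 0x4100, i.e. 8.0f, and converting to
// integer and multiplying shifts by 3.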
|
|
|
|
namespace detail {
|
|
#if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly
|
|
template <class V>
|
|
HWY_API V AVX2ShlU16Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
|
|
}
|
|
#elif HWY_TARGET > HWY_AVX2
|
|
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
|
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const RepartitionToWide<decltype(d)> dw;
|
|
const Rebind<float, decltype(dw)> df;
|
|
const auto zero = Zero(d);
|
|
// Move into exponent (this u16 will become the upper half of an f32)
|
|
const auto exp = ShiftLeft<23 - 16>(v);
|
|
const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
|
|
// Insert 0 into lower halves for reinterpreting as binary32.
|
|
const auto f0 = ZipLower(dw, zero, upper);
|
|
const auto f1 = ZipUpper(dw, zero, upper);
|
|
// See cvtps comment below.
|
|
const VFromD<decltype(dw)> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
|
|
const VFromD<decltype(dw)> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};
|
|
#else
|
|
return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0));
|
|
#endif
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
|
|
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const Twice<decltype(du)> dt_u;
|
|
const RepartitionToWide<decltype(dt_u)> dt_w;
|
|
const RebindToFloat<decltype(dt_w)> dt_f;
|
|
// Move into exponent (this u16 will become the upper half of an f32)
|
|
const auto exp = ShiftLeft<23 - 16>(v);
|
|
const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
|
|
// Insert 0 into lower halves for reinterpreting as binary32.
|
|
const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper));
|
|
// See cvtps comment below.
|
|
const VFromD<decltype(dt_w)> bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)};
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
|
|
#elif HWY_TARGET == HWY_SSSE3
|
|
  alignas(16)
      const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
  return TableLookupBytes(bits0, Load(du, kCompactEvenU16));
|
|
#else
|
|
const RebindToSigned<decltype(dt_w)> dt_i32;
|
|
const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0)));
|
|
return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Same, for 32-bit shifts.
|
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
|
|
const DFromV<decltype(v)> d;
|
|
const auto exp = ShiftLeft<23>(v);
|
|
const auto f = exp + Set(d, 0x3F800000); // 1.0f
|
|
// Do not use ConvertTo because we rely on the native 0x80..00 overflow
|
|
// behavior.
|
|
return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
|
|
}
|
|
|
|
#endif // HWY_TARGET > HWY_AVX2
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
|
|
Vec128<uint16_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
|
|
#elif HWY_TARGET == HWY_AVX2
|
|
return AVX2ShlU16Vec128(v, bits);
|
|
#else
|
|
return v * Pow2(bits);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
HWY_API Vec16<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec16<uint16_t> v,
|
|
Vec16<uint16_t> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
|
|
#else
|
|
const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
|
|
#endif
|
|
return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)};
|
|
}
|
|
#endif
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class V>
|
|
HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits));
|
|
}
|
|
#elif HWY_TARGET <= HWY_AVX2
|
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Half<decltype(d)> dh;
|
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
const Rebind<uint32_t, decltype(dh)> dh_u32;
|
|
|
|
const VFromD<decltype(dh_u32)> lo_shl_result =
|
|
PromoteTo(dh_u32, LowerHalf(dh, v))
|
|
<< PromoteTo(dh_u32, LowerHalf(dh, bits));
|
|
const VFromD<decltype(dh_u32)> hi_shl_result =
|
|
PromoteTo(dh_u32, UpperHalf(dh, v))
|
|
<< PromoteTo(dh_u32, UpperHalf(dh, bits));
|
|
const VFromD<decltype(du16)> u16_shl_result = ConcatEven(
|
|
du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result));
|
|
return TruncateTo(d, u16_shl_result);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// 8-bit: may use the Shl overload for uint16_t.
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v,
|
|
Vec128<uint8_t, N> bits) {
|
|
const DFromV<decltype(v)> d;
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
(void)tag;
|
|
  // kMasks[i] = 0xFF >> i
  alignas(16) static constexpr uint8_t kMasks[16] = {
      0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
  // kShl[i] = 1 << i
  alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10,
                                                   0x20, 0x40, 0x80, 0x00};
|
|
v = And(v, TableLookupBytes(Load(Full64<uint8_t>(), kMasks), bits));
|
|
const VFromD<decltype(d)> mul =
|
|
TableLookupBytes(Load(Full64<uint8_t>(), kShl), bits);
|
|
return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)};
|
|
#elif HWY_TARGET <= HWY_AVX2
|
|
(void)tag;
|
|
(void)d;
|
|
return AVX2ShlU8Vec128(v, bits);
|
|
#else
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
using VW = VFromD<decltype(dw)>;
|
|
const VW even_mask = Set(dw, 0x00FF);
|
|
const VW odd_mask = Set(dw, 0xFF00);
|
|
const VW vw = BitCast(dw, v);
|
|
const VW bits16 = BitCast(dw, bits);
|
|
// Shift even lanes in-place
|
|
const VW evens = Shl(tag, vw, And(bits16, even_mask));
|
|
const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));
|
|
return OddEven(BitCast(d, odds), BitCast(d, evens));
|
|
#endif
|
|
}
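
// Note on the AVX3_DL path above (informal): _mm_gf2p8mul_epi8 multiplies
// bytes as polynomials over GF(2), with reduction only once the product
// exceeds 8 bits, so multiplying by kShl[i] = 1 << i is exactly a left shift
// by i provided no bits overflow. The preceding And with kMasks[i] = 0xFF >> i
// discards the bits that would overflow, matching an ordinary 8-bit shift.
// Example: v = 0x93, bits = 3: (0x93 & 0x1F) "gf-multiplied" by 8 is 0x98,
// which equals (0x93 << 3) & 0xFF.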
|
|
HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v,
|
|
Vec128<uint8_t, 1> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
|
|
#else
|
|
const Vec16<uint16_t> bits8 =
|
|
And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
|
|
#endif
|
|
return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
|
|
Vec128<uint32_t, N> bits) {
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
return v * Pow2(bits);
|
|
#else
|
|
return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
HWY_API Vec32<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec32<uint32_t> v,
|
|
const Vec32<uint32_t> bits) {
|
|
#if HWY_TARGET == HWY_SSE4
|
|
const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
|
|
#else
|
|
const auto bits32 =
|
|
Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
|
|
#endif
|
|
return Vec32<uint32_t>{_mm_sll_epi32(v.raw, bits32.raw)};
|
|
}
|
|
#endif
|
|
|
|
HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
|
|
Vec128<uint64_t> bits) {
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
const DFromV<decltype(v)> d;
|
|
// Individual shifts and combine
|
|
const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
|
|
const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
|
|
const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
|
|
return ConcatUpperLower(d, out1, out0);
|
|
#else
|
|
return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
|
|
#endif
|
|
}
|
|
HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
|
|
Vec64<uint64_t> bits) {
|
|
return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
|
|
}
|
|
|
|
// Signed left shift is the same as unsigned.
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
|
|
Vec128<T, N> bits) {
|
|
const DFromV<decltype(v)> di;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
return BitCast(di,
|
|
Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
return detail::Shl(hwy::TypeTag<T>(), v, bits);
|
|
}
|
|
|
|
// ------------------------------ Shr (mul, mask, BroadcastSignBit)
|
|
|
|
// Use AVX2+ variable shifts except for SSSE3/SSE4. There, we use
|
|
// widening multiplication by powers of two obtained by loading float exponents,
|
|
// followed by a constant right-shift. This is still faster than a scalar or
|
|
// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
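
// Worked example of the multiply-based right shift used below (informal, for
// 16-bit lanes): v >> k == MulHigh(v, 2^(16 - k)), e.g. v = 0x8000, k = 3
// gives MulHigh(0x8000, 0x2000) = 0x1000 = v >> 3. k = 0 would require a
// multiplier of 2^16, which does not fit in 16 bits, so that case is patched
// afterwards via IfThenElse(bits == Zero(d), in, out).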
|
|
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
namespace detail {
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class V>
|
|
HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
return DemoteTo(d,
|
|
BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits)));
|
|
}
|
|
#else // AVX2
|
|
template <class V>
|
|
HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
return DemoteTo(d,
|
|
BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
return DemoteTo(d,
|
|
BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Half<decltype(d)> dh;
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
const Rebind<int32_t, decltype(dh)> dh_i32;
|
|
const Rebind<uint32_t, decltype(dh)> dh_u32;
|
|
|
|
const auto lo_shr_result =
|
|
BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >>
|
|
PromoteTo(dh_u32, LowerHalf(dh, bits)));
|
|
const auto hi_shr_result =
|
|
BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >>
|
|
PromoteTo(dh_u32, UpperHalf(dh, bits)));
|
|
const auto i16_shr_result =
|
|
BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result));
|
|
return DemoteTo(d, i16_shr_result);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
} // namespace detail
|
|
#endif // HWY_TARGET <= HWY_AVX2
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> in,
|
|
const Vec128<uint16_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
|
|
#elif HWY_TARGET <= HWY_AVX2
|
|
return detail::AVX2ShrU16Vec128(in, bits);
|
|
#else
|
|
const DFromV<decltype(in)> d;
|
|
// For bits=0, we cannot mul by 2^16, so fix the result later.
|
|
const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
|
|
// Replace output with input where bits == 0.
|
|
return IfThenElse(bits == Zero(d), in, out);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in,
|
|
const Vec16<uint16_t> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
|
|
#else
|
|
const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
|
|
#endif
|
|
return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)};
|
|
}
|
|
#endif
|
|
|
|
// 8-bit uses 16-bit shifts.
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> in,
|
|
const Vec128<uint8_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
return detail::AVX2ShrU8Vec128(in, bits);
|
|
#else
|
|
const DFromV<decltype(in)> d;
|
|
const Repartition<uint16_t, decltype(d)> dw;
|
|
using VW = VFromD<decltype(dw)>;
|
|
const VW mask = Set(dw, 0x00FF);
|
|
const VW vw = BitCast(dw, in);
|
|
const VW bits16 = BitCast(dw, bits);
|
|
const VW evens = And(vw, mask) >> And(bits16, mask);
|
|
// Shift odd lanes in-place
|
|
const VW odds = vw >> ShiftRight<8>(bits16);
|
|
return OddEven(BitCast(d, odds), BitCast(d, evens));
|
|
#endif
|
|
}
|
|
HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in,
|
|
const Vec128<uint8_t, 1> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<uint16_t> in8{_mm_cvtepu8_epi16(in.raw)};
|
|
const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
|
|
#else
|
|
const Vec16<uint16_t> mask{_mm_set_epi64x(0, 0xFF)};
|
|
const Vec16<uint16_t> in8 = And(Vec16<uint16_t>{in.raw}, mask);
|
|
const Vec16<uint16_t> bits8 = And(Vec16<uint16_t>{bits.raw}, mask);
|
|
#endif
|
|
return Vec128<uint8_t, 1>{_mm_srl_epi16(in8.raw, bits8.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
|
|
const Vec128<uint32_t, N> bits) {
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
// 32x32 -> 64 bit mul, then shift right by 32.
|
|
const DFromV<decltype(in)> d32;
|
|
// Move odd lanes into position for the second mul. Shuffle more gracefully
|
|
// handles N=1 than repartitioning to u64 and shifting 32 bits right.
|
|
const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
|
|
// For bits=0, we cannot mul by 2^32, so fix the result later.
|
|
const auto mul = detail::Pow2(Set(d32, 32) - bits);
|
|
const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
|
|
const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
|
|
// No need to shift right, already in the correct position.
|
|
const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
|
|
const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
|
|
// Replace output with input where bits == 0.
|
|
return IfThenElse(bits == Zero(d32), in, out);
|
|
#else
|
|
return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
|
|
const Vec128<uint32_t, 1> bits) {
|
|
#if HWY_TARGET == HWY_SSE4
|
|
const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
|
|
#else
|
|
const auto bits32 =
|
|
Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
|
|
#endif
|
|
return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits32.raw)};
|
|
}
|
|
#endif
|
|
|
|
HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
|
|
const Vec128<uint64_t> bits) {
|
|
#if HWY_TARGET >= HWY_SSE4
|
|
const DFromV<decltype(v)> d;
|
|
// Individual shifts and combine
|
|
const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
|
|
const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
|
|
const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
|
|
return ConcatUpperLower(d, out1, out0);
|
|
#else
|
|
return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
|
|
#endif
|
|
}
|
|
HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
|
|
const Vec64<uint64_t> bits) {
|
|
return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class V>
|
|
HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits));
|
|
}
|
|
#elif HWY_TARGET <= HWY_AVX2 // AVX2
|
|
template <class V>
|
|
HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<int32_t, decltype(d)> di32;
|
|
return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<int32_t, decltype(d)> di32;
|
|
return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
|
|
}
|
|
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const Half<decltype(d)> dh;
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
const Rebind<int32_t, decltype(dh)> dh_i32;
|
|
|
|
const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >>
|
|
PromoteTo(dh_i32, LowerHalf(dh, bits));
|
|
const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >>
|
|
PromoteTo(dh_i32, UpperHalf(dh, bits));
|
|
const auto i16_shr_result =
|
|
OrderedDemote2To(di16, lo_shr_result, hi_shr_result);
|
|
return DemoteTo(d, i16_shr_result);
|
|
}
|
|
#endif
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
// Also used in x86_256-inl.h.
|
|
template <class DI, class V>
|
|
HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
|
|
const RebindToUnsigned<DI> du;
|
|
const auto count = BitCast(du, count_i); // same type as value to shift
|
|
// Clear sign and restore afterwards. This is preferable to shifting the MSB
|
|
// downwards because Shr is somewhat more expensive than Shl.
|
|
const auto sign = BroadcastSignBit(v);
|
|
const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
|
|
return BitCast(di, abs >> count) ^ sign;
|
|
}
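
// Worked example for SignedShr (illustrative, 16-bit lanes): for v = -6
// (0xFFFA) and count = 1, sign = 0xFFFF and v ^ sign = 0x0005 (the bitwise
// complement, hence "off by one"). The unsigned shift yields 0x0002, and the
// final ^ sign restores 0xFFFD = -3, the arithmetic result of -6 >> 1.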
|
|
#endif
|
|
|
|
} // namespace detail
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v,
|
|
Vec128<int16_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
|
|
#elif HWY_TARGET <= HWY_AVX2
|
|
return detail::AVX2ShrI16Vec128(v, bits);
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
return detail::SignedShr(d, v, bits);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
|
|
#else
|
|
const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)});
|
|
#endif
|
|
return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)};
|
|
}
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v,
|
|
Vec128<int8_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
return detail::AVX2ShrI8Vec128(v, bits);
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
return detail::SignedShr(d, v, bits);
|
|
#endif
|
|
}
|
|
HWY_API Vec128<int8_t, 1> operator>>(Vec128<int8_t, 1> v,
|
|
Vec128<int8_t, 1> bits) {
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const Vec16<int16_t> vi16{_mm_cvtepi8_epi16(v.raw)};
|
|
const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
const Rebind<int16_t, decltype(d)> di16;
|
|
const Twice<decltype(d)> dt;
|
|
|
|
const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v)));
|
|
const Vec16<uint16_t> bits8 =
|
|
And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
|
|
#endif
|
|
return Vec128<int8_t, 1>{_mm_sra_epi16(vi16.raw, bits8.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v,
|
|
Vec128<int32_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX2
|
|
return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
return detail::SignedShr(d, v, bits);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX2
|
|
HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) {
|
|
#if HWY_TARGET == HWY_SSE4
|
|
const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
|
|
#else
|
|
const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits);
|
|
#endif
|
|
return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)};
|
|
}
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v,
|
|
Vec128<int64_t, N> bits) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
|
|
#else
|
|
const DFromV<decltype(v)> d;
|
|
return detail::SignedShr(d, v, bits);
|
|
#endif
|
|
}
|
|
|
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
|
|
HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
|
|
const DFromV<decltype(a)> d;
|
|
alignas(16) uint64_t mul[2];
|
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
return Load(d, mul);
|
|
}
|
|
|
|
HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Half<decltype(d)> d2;
|
|
alignas(16) uint64_t mul[2];
|
|
const uint64_t a1 = GetLane(UpperHalf(d2, a));
|
|
const uint64_t b1 = GetLane(UpperHalf(d2, b));
|
|
mul[0] = Mul128(a1, b1, &mul[1]);
|
|
return Load(d, mul);
|
|
}
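
// Usage sketch (illustrative lane values): MulEven/MulOdd on u64 vectors
// return the full 128-bit product of lane 0 resp. lane 1 as a {low, high}
// pair of u64 lanes, computed via the scalar Mul128 helper.
//   // a = {3, 1 << 40}, b = {5, 1 << 40}
//   // MulEven(a, b) == {15, 0};  MulOdd(a, b) == {0, 1 << 16}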
|
|
|
|
// ------------------------------ WidenMulPairwiseAdd
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D32, HWY_IF_F32_D(D32),
|
|
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
|
|
// TODO(janwas): _mm_dpbf16_ps when available
|
|
const RebindToUnsigned<decltype(df32)> du32;
|
|
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
using VU32 = VFromD<decltype(du32)>;
|
|
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
const VU32 ao = And(BitCast(du32, a), odd);
|
|
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
const VU32 bo = And(BitCast(du32, b), odd);
|
|
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
}
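
// Informal recap of the trick above: bf16 shares its bit layout with the upper
// 16 bits of a binary32, so ShiftLeft<16> turns the even (lower) bf16 lane of
// each 32-bit pair into a float, and And with 0xFFFF0000 does the same for the
// odd lane. Each f32 output lane is thus a[2i]*b[2i] + a[2i+1]*b[2i+1],
// without any lane-crossing promotion.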
|
|
|
|
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
|
|
template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
|
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
|
|
return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
// Generic for all vector lengths.
|
|
template <class DU32, HWY_IF_U32_D(DU32),
|
|
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
|
|
const auto p_lo = a * b;
|
|
const auto p_hi = MulHigh(a, b);
|
|
|
|
const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo));
|
|
const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)),
|
|
ShiftRight<16>(BitCast(du32, p_lo)));
|
|
return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1));
|
|
}
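
// Derivation for the u16 path above (informal): write each full 32-bit product
// as p = p_hi * 2^16 + p_lo. Within the u32 output lane covering inputs 2i and
// 2i+1, OddEven(p_hi, p_lo) contributes p_lo[2i] + p_hi[2i+1] * 2^16 and the
// shift/Or term contributes p_lo[2i+1] + p_hi[2i] * 2^16; their sum is
// p[2i] + p[2i+1], the pairwise sum of full products (mod 2^32).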
|
|
|
|
// ------------------------------ SatWidenMulPairwiseAdd
|
|
|
|
#if HWY_TARGET <= HWY_SSSE3
|
|
|
|
#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
|
|
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
|
|
#else
|
|
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
|
|
#endif
|
|
|
|
// Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16
|
|
// is safe.
|
|
template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)>
|
|
HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
|
|
DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
|
|
VFromD<Repartition<int8_t, DI16>> b) {
|
|
return VFromD<DI16>{_mm_maddubs_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
#endif
|
|
|
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft)
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D32, HWY_IF_F32_D(D32),
|
|
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
const VFromD<D32> sum0,
|
|
VFromD<D32>& sum1) {
|
|
// TODO(janwas): _mm_dpbf16_ps when available
|
|
const RebindToUnsigned<decltype(df32)> du32;
|
|
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
using VU32 = VFromD<decltype(du32)>;
|
|
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
const VU32 ao = And(BitCast(du32, a), odd);
|
|
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
const VU32 bo = And(BitCast(du32, b), odd);
|
|
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
}
|
|
|
|
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
|
|
template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
|
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
|
|
const VFromD<D32> sum0,
|
|
VFromD<D32>& /*sum1*/) {
|
|
(void)d;
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
|
|
#else
|
|
return sum0 + WidenMulPairwiseAdd(d, a, b);
|
|
#endif
|
|
}
|
|
|
|
template <class DU32, HWY_IF_U32_D(DU32),
|
|
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
|
|
const VFromD<DU32> sum0,
|
|
VFromD<DU32>& /*sum1*/) {
|
|
(void)d;
|
|
return sum0 + WidenMulPairwiseAdd(d, a, b);
|
|
}
|
|
|
|
// ------------------------------ RearrangeToOddPlusEven
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0,
|
|
Vec128<int32_t, N> /*sum1*/) {
|
|
return sum0; // invariant already holds
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
|
|
const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
|
|
return sum0; // invariant already holds
|
|
}
|
|
|
|
template <class VW>
|
|
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
|
|
return Add(sum0, sum1);
|
|
}
|
|
|
|
// ------------------------------ SumOfMulQuadAccumulate
|
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
|
|
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
|
|
#else
|
|
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
|
|
#endif
|
|
|
|
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
|
|
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
|
|
DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
|
|
VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
|
|
return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
|
|
}
|
|
|
|
#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
|
|
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
|
|
#else
|
|
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
|
|
#endif
|
|
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
|
|
VFromD<Repartition<int8_t, DI32>> a,
|
|
VFromD<Repartition<int8_t, DI32>> b,
|
|
VFromD<DI32> sum) {
|
|
// TODO(janwas): AVX-VNNI-INT8 has dpbssd.
|
|
const Repartition<uint8_t, decltype(di32)> du8;
|
|
|
|
const auto a_u = BitCast(du8, a);
|
|
const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum);
|
|
const auto result_sum_1 = ShiftLeft<8>(
|
|
SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32)));
|
|
return result_sum_0 - result_sum_1;
|
|
}
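
// Derivation for the signed*signed case above (informal): reinterpreting `a`
// as unsigned adds 256 to every negative byte, so the u8*i8 dot product
// overshoots the true sum by 256 * sum(b[j] over negative a[j]). That excess
// is recomputed via ShiftRight<7>(a_u), which is 1 exactly for the negative
// bytes, scaled by ShiftLeft<8> and subtracted.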
|
|
|
|
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
|
|
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
|
|
#else
|
|
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
|
|
#endif
|
|
template <class DU32, HWY_IF_U32_D(DU32)>
|
|
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
|
|
DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
|
|
VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
|
|
// TODO(janwas): AVX-VNNI-INT8 has dpbuud.
|
|
const Repartition<uint8_t, decltype(du32)> du8;
|
|
const RebindToSigned<decltype(du8)> di8;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
|
|
const auto b_i = BitCast(di8, b);
|
|
const auto result_sum_0 =
|
|
SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum));
|
|
const auto result_sum_1 = ShiftLeft<8>(
|
|
SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32)));
|
|
|
|
return BitCast(du32, result_sum_0 - result_sum_1);
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
// ================================================== CONVERT
|
|
|
|
// ------------------------------ Promotions (part w/ narrow lanes -> full)
|
|
|
|
// Unsigned: zero-extend.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const __m128i zero = _mm_setzero_si128();
|
|
return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
|
|
#else
|
|
return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
|
|
#else
|
|
return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
|
|
#else
|
|
return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const __m128i zero = _mm_setzero_si128();
|
|
const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
|
|
return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
|
|
#else
|
|
return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
|
|
#if HWY_TARGET > HWY_SSSE3
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
return PromoteTo(d, PromoteTo(du32, v));
|
|
#elif HWY_TARGET == HWY_SSSE3
|
|
alignas(16) static constexpr int8_t kShuffle[16] = {
|
|
0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
|
|
const Repartition<int8_t, decltype(d)> di8;
|
|
return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
|
|
#else
|
|
(void)d;
|
|
return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
|
|
#if HWY_TARGET > HWY_SSSE3
|
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
return PromoteTo(d, PromoteTo(du32, v));
|
|
#elif HWY_TARGET == HWY_SSSE3
|
|
alignas(16) static constexpr int8_t kShuffle[16] = {
|
|
0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
|
|
const Repartition<int8_t, decltype(d)> di8;
|
|
return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
|
|
#else
|
|
(void)d;
|
|
return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
|
|
// Unsigned to signed: same plus cast.
|
|
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
|
|
HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
|
|
HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
|
|
HWY_API VFromD<D> PromoteTo(D di, V v) {
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
return BitCast(di, PromoteTo(du, v));
|
|
}
|
|
|
|
// Signed: replicate sign bit.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
|
|
#else
|
|
return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
|
|
#else
|
|
return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
|
|
#else
|
|
return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
|
|
const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
|
|
return ShiftRight<24>(VFromD<D>{x4});
|
|
#else
|
|
return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const Half<decltype(di32)> dh_i32;
|
|
const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
|
|
const VFromD<decltype(di32)> s4{
|
|
_mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
return ZipLower(d, x4, s4);
|
|
#else
|
|
(void)d;
|
|
return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
const Half<decltype(di32)> dh_i32;
|
|
const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
|
|
const VFromD<decltype(di32)> s2{
|
|
_mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
return ZipLower(d, x2, s2);
|
|
#else
|
|
(void)d;
|
|
return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
|
|
|
|
// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
|
|
#ifdef HWY_NATIVE_F16C
|
|
#undef HWY_NATIVE_F16C
|
|
#else
|
|
#define HWY_NATIVE_F16C
|
|
#endif
|
|
|
|
// Workaround for origin tracking bug in Clang msan prior to 11.0
|
|
// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
|
|
#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
|
|
#define HWY_INLINE_F16 HWY_NOINLINE
|
|
#else
|
|
#define HWY_INLINE_F16 HWY_INLINE
|
|
#endif
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
|
|
#if HWY_HAVE_FLOAT16
|
|
const RebindToUnsigned<DFromV<decltype(v)>> du16;
|
|
return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
|
|
#else
|
|
return VFromD<D>{_mm_cvtph_ps(v.raw)};
|
|
#endif
|
|
}
|
|
|
|
#endif // HWY_NATIVE_F16C
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
|
|
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
#else
|
|
#define HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
#endif
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtph_pd(v.raw)};
|
|
}
|
|
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
|
|
const Rebind<uint16_t, decltype(df32)> du16;
|
|
const RebindToSigned<decltype(df32)> di32;
|
|
return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
return VFromD<D>{_mm_cvtps_pd(v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
|
|
}
|
|
#else
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class D, HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
|
|
const Rebind<int32_t, decltype(df64)> di32;
|
|
const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
|
|
return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
|
|
Set(df64, 4294967296.0),
|
|
Zero(df64));
|
|
}
|
|
#endif
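
// Worked example for the fallback above (illustrative): PromoteTo(df64, i32)
// treats u32 inputs >= 2^31 as negative, so 0xFFFFFFFF first becomes -1.0.
// Adding 2^32 = 4294967296.0 exactly when the intermediate is negative
// restores the intended 4294967295.0; inputs below 2^31 are already correct
// and receive +0.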
|
|
|
|
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
namespace detail {
|
|
|
|
// I32->I64 PromoteEvenTo/PromoteOddTo
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
Vec64<int32_t> v) {
|
|
return PromoteLowerTo(d_to, v);
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
Vec128<int32_t> v) {
|
|
const Repartition<int32_t, D> d_from;
|
|
return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
|
|
}
|
|
|
|
template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
V v) {
|
|
const Repartition<int32_t, D> d_from;
|
|
return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
|
|
}
|
|
|
|
} // namespace detail
|
|
#endif
|
|
|
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Rebind<int32_t, D> di32;
|
|
const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
|
|
const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
|
|
const auto clamped = Or(zero_if_neg, too_big);
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Rebind<uint16_t, decltype(di32)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
|
|
#else
|
|
const Repartition<uint16_t, decltype(di32)> du16;
|
|
// Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
|
|
  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
|
|
const auto lo2 = Load(du16, kLower2Bytes);
|
|
return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
|
|
#endif
|
|
#else
|
|
return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) {
|
|
const DFromV<decltype(v)> du32;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const auto too_big =
|
|
VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32)));
|
|
const auto clamped = Or(BitCast(di32, v), too_big);
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
|
|
#else
|
|
(void)du16;
|
|
const Repartition<uint16_t, decltype(di32)> du16_full;
|
|
// Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
|
|
  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
|
|
const auto lo2 = Load(du16_full, kLower2Bytes);
|
|
return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw};
|
|
#endif
|
|
#else
|
|
return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF))));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
|
|
return VFromD<D>{_mm_packus_epi16(i16, i16)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
|
|
return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
|
|
return VFromD<D>{_mm_packs_epi16(i16, i16)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
|
|
return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
// NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned
|
|
// integers to 8-bit unsigned integers
|
|
(void)du8;
|
|
return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)};
|
|
#else
|
|
const DFromV<decltype(v)> du32;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
const auto max_i32 = Set(du32, 0x7FFFFFFFu);
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
  // On SSE2/SSSE3, clamp u32 values to the i32 range using the u8 Min
  // operation, which is a single instruction on these targets.

  // Taking the per-byte Min with 0x7FFFFFFF leaves the lower 24 bits of each
  // 32-bit lane unchanged, and leaves any value that is less than or equal to
  // 0x7FFFFFFF entirely unchanged. For values greater than or equal to
  // 0x80000000, it forces the upper 8 bits to 0x7F, yielding a value in the
  // range [0x7F000000, 0x7FFFFFFF], which the subsequent i32->u8 demotion
  // saturates to 0xFF as required.
|
|
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
const auto clamped = BitCast(
|
|
di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32)));
|
|
#else
|
|
const auto clamped = BitCast(di32, Min(v, max_i32));
|
|
#endif
|
|
|
|
return DemoteTo(du8, clamped);
|
|
#endif
|
|
}
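// Worked example (comment only, illustrative): for v = 0x80000001 on
// SSE2/SSSE3, the per-byte Min against 0x7FFFFFFF forces the upper byte 0x80
// to 0x7F, giving 0x7F000001. That still exceeds 0xFF, so the i32->u8
// demotion saturates it to 0xFF, which is also the correct u32->u8 result for
// the original value. Small values pass through both steps unchanged:
//   const Full128<uint32_t> d32;
//   const Rebind<uint8_t, decltype(d32)> d8;
//   DemoteTo(d8, Set(d32, 0x80000001u));  // all lanes 0xFF
//   DemoteTo(d8, Set(d32, 200u));         // all lanes 200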
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
|
|
const DFromV<decltype(v)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
const auto max_i16 = Set(du16, 0x7FFF);
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
// On SSE2/SSSE3, clamp u16 values to the i16 range via the u8 Min operation,
// which is a single instruction there. The per-byte Min leaves the lower
// 8 bits of each 16-bit lane unchanged, so values <= 0x7FFF pass through
// unmodified, while values >= 0x8000 have their upper byte forced to 0x7F,
// i.e. land in [0x7F00, 0x7FFF]. The subsequent i16->u8 demotion then
// saturates those to 0xFF, as required.
|
|
const Repartition<uint8_t, decltype(du16)> du16_as_du8;
|
|
const auto clamped = BitCast(
|
|
di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16)));
|
|
#else
|
|
const auto clamped = BitCast(di16, Min(v, max_i16));
|
|
#endif
|
|
|
|
return DemoteTo(du8, clamped);
|
|
}
|
|
|
|
#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
|
|
|
|
// HWY_NATIVE_F16C was already toggled above.
|
|
|
|
// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
|
|
// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
|
|
HWY_DIAGNOSTICS(push)
|
|
HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
|
|
const RebindToUnsigned<decltype(df16)> du16;
|
|
return BitCast(
|
|
df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
|
|
}
|
|
|
|
HWY_DIAGNOSTICS(pop)
|
|
|
|
#endif // F16C
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
|
|
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
#else
|
|
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
#endif
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
|
|
return VFromD<D>{_mm_cvtpd_ph(v.raw)};
|
|
}
|
|
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
// TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
|
|
const Rebind<int32_t, decltype(dbf16)> di32;
|
|
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
|
|
const Rebind<uint16_t, decltype(dbf16)> du16;
|
|
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
|
|
return BitCast(dbf16, DemoteTo(du16, bits_in_32));
|
|
}
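// Illustrative note (comment only): this emulated f32->bf16 path keeps the
// upper 16 bits of the binary32 representation, i.e. it truncates instead of
// rounding. For example 1.0f = 0x3F800000 becomes the bf16 bit pattern
// 0x3F80, and 1.005859375f = 0x3F80C000 also truncates to 0x3F80, whereas
// round-to-nearest would give 0x3F81.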
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
|
|
class V32 = VFromD<Repartition<float, D>>>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
|
|
// TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
|
|
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
const Repartition<uint32_t, decltype(dbf16)> du32;
|
|
const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
|
|
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
|
}
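// Note on the "Reorder" in the name (comment only, illustrative): OddEven
// takes odd lanes from its first argument and even lanes from its second, so
// the bf16 lanes of a land in the odd u16 output lanes and those of b in the
// even lanes, i.e. {b0, a0, b1, a1, ...} rather than all of a followed by all
// of b. Callers needing the in-order layout should use OrderedDemote2To.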
|
|
|
|
// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int32_t> a,
|
|
Vec64<int32_t> b) {
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
|
|
Vec128<int32_t> b) {
|
|
return VFromD<D>{_mm_packs_epi32(a.raw, b.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
#else
|
|
(void)dn;
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
#endif
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<int32_t> a, Vec128<int32_t> b) {
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Half<decltype(dn)> dnh;
|
|
const auto u16_a = DemoteTo(dnh, a);
|
|
const auto u16_b = DemoteTo(dnh, b);
|
|
return Combine(dn, u16_b, u16_a);
|
|
#else
|
|
(void)dn;
|
|
return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)};
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a,
|
|
Vec128<uint32_t> b) {
|
|
const DFromV<decltype(a)> du32;
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
const auto max_i32 = Set(du32, 0x7FFFFFFFu);
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
// On SSE2/SSSE3, clamp a and b using u8 Min operation
|
|
const auto clamped_a = BitCast(
|
|
di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32)));
|
|
const auto clamped_b = BitCast(
|
|
di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32)));
|
|
#else
|
|
const auto clamped_a = BitCast(di32, Min(a, max_i32));
|
|
const auto clamped_b = BitCast(di32, Min(b, max_i32));
|
|
#endif
|
|
|
|
return ReorderDemote2To(dn, clamped_a, clamped_b);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a,
|
|
VFromD<Repartition<uint32_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
|
|
// Specializations for partial vectors because packs_epi16 sets lanes above 2*N.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
|
|
VFromD<Repartition<int16_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
|
|
Vec64<int16_t> b) {
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
|
|
Vec128<int16_t> b) {
|
|
return VFromD<D>{_mm_packs_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
|
|
VFromD<Repartition<int16_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
|
|
Vec64<int16_t> b) {
|
|
return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw),
|
|
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
|
|
Vec128<int16_t> b) {
|
|
return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a,
|
|
Vec128<uint16_t> b) {
|
|
const DFromV<decltype(a)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
const auto max_i16 = Set(du16, 0x7FFFu);
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
const Repartition<uint8_t, decltype(du16)> du16_as_du8;
|
|
// On SSE2/SSSE3, clamp a and b using u8 Min operation
|
|
const auto clamped_a = BitCast(
|
|
di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16)));
|
|
const auto clamped_b = BitCast(
|
|
di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16)));
|
|
#else
|
|
const auto clamped_a = BitCast(di16, Min(a, max_i16));
|
|
const auto clamped_b = BitCast(di16, Min(b, max_i16));
|
|
#endif
|
|
|
|
return ReorderDemote2To(dn, clamped_a, clamped_b);
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a,
|
|
VFromD<Repartition<uint16_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
|
|
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
|
|
HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
|
|
HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
|
|
HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
return ReorderDemote2To(d, a, b);
|
|
}
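// Illustrative note (comment only): for the integer types handled by this
// overload, the 128-bit (and smaller) ReorderDemote2To implementations above
// already place all lanes of a before all lanes of b, so OrderedDemote2To can
// simply forward to them. For example:
//   const Full128<int32_t> d32;
//   const Repartition<int16_t, decltype(d32)> d16;
//   OrderedDemote2To(d16, Iota(d32, 0), Iota(d32, 4));  // lanes {0, ..., 7}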
|
|
|
|
template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
|
|
HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
|
|
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
return VFromD<D>{_mm_cvtpd_ps(v.raw)};
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D>
|
|
HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) {
|
|
// The max can be exactly represented in binary64, so clamping beforehand
|
|
// prevents x86 conversion from raising an exception and returning 80..00.
|
|
return Min(v, Set(d, 2147483647.0));
|
|
}
|
|
|
|
// For ConvertTo float->int of same size, clamping before conversion would
|
|
// change the result because the max integer value is not exactly representable.
|
|
// Instead detect the overflow result after conversion and fix it.
|
|
// Generic for all vector lengths.
|
|
template <class DI>
|
|
HWY_INLINE VFromD<DI> FixConversionOverflow(DI di,
|
|
VFromD<RebindToFloat<DI>> original,
|
|
VFromD<DI> converted) {
|
|
// Combinations of original and output sign:
|
|
// --: normal <0 or -huge_val to 80..00: OK
|
|
// -+: -0 to 0 : OK
|
|
// +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
|
|
// ++: normal >0 : OK
|
|
const VFromD<DI> sign_wrong = AndNot(BitCast(di, original), converted);
|
|
#if HWY_COMPILER_GCC_ACTUAL
|
|
// Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor (and
// likewise an Add() if that is used instead). Work around with one more
// instruction.
|
|
const RebindToUnsigned<DI> du;
|
|
const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
|
|
const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
|
|
return IfVecThenElse(mask, max, converted);
|
|
#else
|
|
return Xor(converted, BroadcastSignBit(sign_wrong));
|
|
#endif
|
|
}
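// Worked example (comment only): converting a too-large positive input such
// as 3e9f yields the x86 "integer indefinite" value 0x80000000. Then
// sign_wrong = AndNot(original, converted) has its sign bit set (original is
// positive, converted appears negative), BroadcastSignBit turns that into
// all-ones, and the Xor flips 0x80000000 into 0x7FFFFFFF = LimitsMax.
// Negative overflow keeps 0x80000000, and in-range inputs have
// sign_wrong == 0, so they pass through unchanged.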
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D),
|
|
class DF = Rebind<double, D>>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<DF> v) {
|
|
const VFromD<DF> clamped = detail::ClampF64ToI32Max(DF(), v);
|
|
return VFromD<D>{_mm_cvttpd_epi32(clamped.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
(void)du32;
|
|
return VFromD<D>{
|
|
_mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
|
|
#else // AVX2 or earlier
|
|
const Rebind<double, decltype(du32)> df64;
|
|
const RebindToUnsigned<decltype(df64)> du64;
|
|
|
|
// Clamp v[i] to a value between 0 and 4294967295
|
|
const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
|
|
|
|
const auto k2_31 = Set(df64, 2147483648.0);
|
|
const auto clamped_is_ge_k2_31 = (clamped >= k2_31);
|
|
const auto clamped_lo31_f64 =
|
|
clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31);
|
|
const VFromD<D> clamped_lo31_u32{_mm_cvttpd_epi32(clamped_lo31_f64.raw)};
|
|
const auto clamped_u32_msb = ShiftLeft<31>(
|
|
TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31))));
|
|
return Or(clamped_lo31_u32, clamped_u32_msb);
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepi64_ps(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepu64_ps(v.raw)};
|
|
}
|
|
#else
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class D, HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
|
|
const Rebind<double, decltype(df32)> df64;
|
|
const RebindToUnsigned<decltype(df64)> du64;
|
|
const RebindToSigned<decltype(df32)> di32;
|
|
const RebindToUnsigned<decltype(df32)> du32;
|
|
|
|
const auto k2p64_63 = Set(df64, 27670116110564327424.0);
|
|
const auto f64_hi52 =
|
|
Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
|
|
const auto f64_lo12 =
|
|
PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
|
|
Set(du32, uint32_t{0x00000FFF}))));
|
|
|
|
const auto f64_sum = f64_hi52 + f64_lo12;
|
|
const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
|
|
|
|
const auto f64_sum_is_inexact =
|
|
ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
|
|
const auto f64_bits_decrement =
|
|
And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
|
|
f64_sum_is_inexact);
|
|
|
|
const auto adj_f64_val = BitCast(
|
|
df64,
|
|
Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
|
|
|
|
return DemoteTo(df32, adj_f64_val);
|
|
}
|
|
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class D, HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
|
|
const Rebind<double, decltype(df32)> df64;
|
|
const RebindToUnsigned<decltype(df64)> du64;
|
|
const RebindToSigned<decltype(df32)> di32;
|
|
const RebindToUnsigned<decltype(df32)> du32;
|
|
|
|
const auto k2p64 = Set(df64, 18446744073709551616.0);
|
|
const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
|
|
const auto f64_lo12 =
|
|
PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
|
|
Set(du32, uint32_t{0x00000FFF}))));
|
|
|
|
const auto f64_sum = f64_hi52 + f64_lo12;
|
|
const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
|
|
const auto f64_sum_is_inexact =
|
|
ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
|
|
|
|
const auto adj_f64_val = BitCast(
|
|
df64,
|
|
Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
|
|
f64_sum_is_inexact));
|
|
|
|
return DemoteTo(df32, adj_f64_val);
|
|
}
|
|
#endif
|
|
|
|
// For already range-limited input [0, 255].
|
|
template <size_t N>
|
|
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RebindToSigned<DFromV<decltype(v)>> di32;
|
|
const Rebind<uint8_t, decltype(di32)> du8;
|
|
return DemoteTo(du8, BitCast(di32, v));
|
|
#else
|
|
const DFromV<decltype(v)> d32;
|
|
const Repartition<uint8_t, decltype(d32)> d8;
|
|
alignas(16) static constexpr uint32_t k8From32[4] = {
|
|
0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
|
|
// Also replicate bytes into all 32-bit lanes for safety.
|
|
const auto quad = TableLookupBytes(v, Load(d32, k8From32));
|
|
return LowerHalf(LowerHalf(BitCast(d8, quad)));
|
|
#endif
|
|
}
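// Illustrative usage sketch (comment only): U8FromU32 assumes every lane is
// already within [0, 255] and simply gathers the low byte of each 32-bit
// lane:
//   const Full128<uint32_t> d32;
//   U8FromU32(Iota(d32, 7));  // u8 lanes {7, 8, 9, 10}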
|
|
|
|
// ------------------------------ F32->UI64 PromoteTo
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
const Rebind<float, decltype(di64)> df32;
|
|
const RebindToFloat<decltype(di64)> df64;
|
|
const Twice<decltype(df32)> dt_f32;
|
|
|
|
return detail::FixConversionOverflow(
|
|
di64,
|
|
BitCast(df64, InterleaveLower(ResizeBitCast(dt_f32, v),
|
|
ResizeBitCast(dt_f32, v))),
|
|
VFromD<D>{_mm_cvttps_epi64(v.raw)});
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
return VFromD<D>{
|
|
_mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
|
|
}
|
|
#else // AVX2 or below
|
|
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class D, HWY_IF_I64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
const Rebind<int32_t, decltype(di64)> di32;
|
|
const RebindToFloat<decltype(di32)> df32;
|
|
const RebindToUnsigned<decltype(di32)> du32;
|
|
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
|
|
const auto exponent_adj = BitCast(
|
|
du32,
|
|
Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
|
|
BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
|
|
BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
|
|
const auto adj_v =
|
|
BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
|
|
|
|
const auto f32_to_i32_result = ConvertTo(di32, adj_v);
|
|
const auto lo64_or_mask = PromoteTo(
|
|
di64,
|
|
BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
|
|
Set(di32, LimitsMax<int32_t>())))));
|
|
|
|
return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
|
|
<< PromoteTo(di64, exponent_adj),
|
|
lo64_or_mask);
|
|
}
|
|
|
|
namespace detail {
|
|
|
|
template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
|
|
HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
|
|
DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
|
|
const Rebind<int32_t, decltype(du64)> di32;
|
|
const Twice<decltype(di32)> dt_i32;
|
|
|
|
const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask);
|
|
return BitCast(du64,
|
|
InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask));
|
|
}
|
|
|
|
template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)>
|
|
HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
|
|
DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
|
|
const RebindToSigned<decltype(du64)> di64;
|
|
return BitCast(du64, PromoteTo(di64, i32_overflow_mask));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class D, HWY_IF_U64_D(D)>
|
|
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
|
|
const Rebind<int32_t, decltype(du64)> di32;
|
|
const RebindToFloat<decltype(di32)> df32;
|
|
const RebindToUnsigned<decltype(di32)> du32;
|
|
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
|
|
const auto non_neg_v = ZeroIfNegative(v);
|
|
|
|
const auto exponent_adj = BitCast(
|
|
du32, Min(SaturatedSub(BitCast(du32_as_du8,
|
|
ShiftRight<23>(BitCast(du32, non_neg_v))),
|
|
BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
|
|
BitCast(du32_as_du8, Set(du32, uint32_t{33}))));
|
|
|
|
const auto adj_v =
|
|
BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
|
|
const VFromD<decltype(di32)> f32_to_i32_result{_mm_cvttps_epi32(adj_v.raw)};
|
|
|
|
const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result);
|
|
const auto overflow_result =
|
|
detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask);
|
|
|
|
return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result))
|
|
<< PromoteTo(du64, exponent_adj),
|
|
overflow_result);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ MulFixedPoint15
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a,
|
|
const Vec128<int16_t> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
|
|
auto lo_product = a * b;
|
|
auto hi_product = MulHigh(a, b);
|
|
|
|
const VFromD<decltype(di32)> i32_product_lo{
|
|
_mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
|
|
const VFromD<decltype(di32)> i32_product_hi{
|
|
_mm_unpackhi_epi16(lo_product.raw, hi_product.raw)};
|
|
|
|
const auto round_up_incr = Set(di32, 0x4000);
|
|
return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr),
|
|
ShiftRight<15>(i32_product_hi + round_up_incr));
|
|
}
|
|
|
|
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
|
|
HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Rebind<int32_t, decltype(d)> di32;
|
|
|
|
const auto lo_product = a * b;
|
|
const auto hi_product = MulHigh(a, b);
|
|
const VFromD<decltype(di32)> i32_product{
|
|
_mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
|
|
|
|
return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000)));
|
|
}
|
|
#else
|
|
template <size_t N>
|
|
HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
|
|
const Vec128<int16_t, N> b) {
|
|
return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
|
|
}
|
|
#endif
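// Worked example (comment only): MulFixedPoint15 multiplies Q1.15 fixed-point
// values, computing (a * b + 16384) >> 15 per lane (both the SSE2 emulation
// above and _mm_mulhrs_epi16 produce this result). With a = b = 16384, i.e.
// 0.5 in Q1.15, a * b = 2^28 and the rounded shift yields 8192 = 0.25:
//   const Full128<int16_t> d;
//   MulFixedPoint15(Set(d, 16384), Set(d, 16384));  // all lanes 8192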
|
|
|
|
// ------------------------------ Truncations
|
|
|
|
template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
|
|
HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
|
|
// BitCast requires the same size; DTo might be u8x1 and v u16x1.
|
|
const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
|
|
return VFromD<DTo>{BitCast(dto, v).raw};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Vec128<uint8_t, 1> lo{v.raw};
|
|
const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
|
|
return Combine(d, hi, lo);
|
|
#else
|
|
const Repartition<uint8_t, DFromV<decltype(v)>> d8;
|
|
(void)d;
|
|
alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8,
|
|
0, 8, 0, 8, 0, 8, 0, 8};
|
|
const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx));
|
|
return LowerHalf(LowerHalf(LowerHalf(v8)));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const Vec128<uint16_t, 1> lo{v.raw};
|
|
const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
|
|
return Combine(d, hi, lo);
|
|
#else
|
|
(void)d;
|
|
const Repartition<uint16_t, DFromV<decltype(v)>> d16;
|
|
alignas(16) static constexpr uint16_t kIdx[8] = {
|
|
0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
|
|
const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx));
|
|
return LowerHalf(LowerHalf(v16));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
|
|
return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
|
|
const DFromV<decltype(v)> du32;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
const Rebind<uint8_t, decltype(di32)> du8;
|
|
return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v))));
|
|
#else
|
|
const Repartition<uint8_t, decltype(du32)> d;
|
|
alignas(16) static constexpr uint8_t kIdx[16] = {
|
|
0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
|
|
0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
|
|
return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx))));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
|
|
const DFromV<decltype(v)> du32;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
const Rebind<uint16_t, decltype(di32)> du16;
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
return BitCast(
|
|
du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v)))));
|
|
#else
|
|
const Repartition<uint16_t, decltype(du32)> d;
|
|
return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
|
|
const DFromV<decltype(v)> du16;
|
|
#if HWY_TARGET == HWY_SSE2
|
|
const RebindToSigned<decltype(du16)> di16;
|
|
const Rebind<uint8_t, decltype(di16)> du8;
|
|
const RebindToSigned<decltype(du8)> di8;
|
|
return BitCast(du8,
|
|
DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v)))));
|
|
#else
|
|
const Repartition<uint8_t, decltype(du16)> d;
|
|
return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
|
|
#endif
|
|
}
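// Illustrative contrast with DemoteTo (comment only): TruncateTo keeps only
// the low bits of each lane instead of saturating.
//   const Full128<uint32_t> d32;
//   const Rebind<uint8_t, decltype(d32)> d8;
//   TruncateTo(d8, Set(d32, 0x12345678u));  // all lanes 0x78
//   DemoteTo(d8, Set(d32, 0x12345678u));    // all lanes 0xFF (saturated)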
|
|
|
|
// ------------------------------ Demotions to/from i64
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
|
|
return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
|
|
return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
|
|
return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
|
|
}
|
|
#else // AVX2 or below
|
|
namespace detail {
|
|
template <class D, HWY_IF_UNSIGNED_D(D)>
|
|
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
|
|
D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
|
|
return v;
|
|
}
|
|
|
|
template <class D, HWY_IF_SIGNED_D(D)>
|
|
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
|
|
D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
|
|
const DFromV<decltype(v)> du64;
|
|
return And(v,
|
|
Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>())));
|
|
}
|
|
|
|
template <class D>
|
|
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate(
|
|
D dn, VFromD<Rebind<uint64_t, D>> v) {
|
|
const Rebind<uint64_t, D> du64;
|
|
const RebindToSigned<decltype(du64)> di64;
|
|
constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
|
|
static_cast<int>(hwy::IsSigned<TFromD<D>>());
|
|
|
|
const auto too_big = BitCast(
|
|
du64, VecFromMask(
|
|
di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
|
|
return DemoteFromU64MaskOutResult(dn, Or(v, too_big));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V>
|
|
HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) {
|
|
return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
HWY_IF_SIGNED_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
|
|
const DFromV<decltype(v)> di64;
|
|
const RebindToUnsigned<decltype(di64)> du64;
|
|
const RebindToUnsigned<decltype(dn)> dn_u;
|
|
|
|
// Negative values are saturated by first saturating their bitwise inverse
|
|
// and then inverting the saturation result
|
|
const auto invert_mask = BitCast(du64, BroadcastSignBit(v));
|
|
const auto saturated_vals = Xor(
|
|
invert_mask,
|
|
detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
|
|
return BitCast(dn, TruncateTo(dn_u, saturated_vals));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
HWY_IF_UNSIGNED_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
|
|
const DFromV<decltype(v)> di64;
|
|
const RebindToUnsigned<decltype(di64)> du64;
|
|
|
|
const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v));
|
|
return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
HWY_IF_UNSIGNED_D(D)>
|
|
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
|
|
return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2),
|
|
HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
|
|
VFromD<Repartition<int64_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)>
|
|
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
|
|
VFromD<Repartition<uint64_t, D>> b) {
|
|
const DFromV<decltype(a)> d;
|
|
const Twice<decltype(d)> dt;
|
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
}
|
|
|
|
#if HWY_TARGET > HWY_AVX2
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
|
|
HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
|
|
Vec128<int64_t> b) {
|
|
const DFromV<decltype(a)> di64;
|
|
const RebindToUnsigned<decltype(di64)> du64;
|
|
const Half<decltype(dn)> dnh;
|
|
|
|
// Negative values are saturated by first saturating their bitwise inverse
|
|
// and then inverting the saturation result
|
|
const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
|
|
const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
|
|
const auto saturated_a = Xor(
|
|
invert_mask_a,
|
|
detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
|
|
const auto saturated_b = Xor(
|
|
invert_mask_b,
|
|
detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
|
|
|
|
return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
|
|
Vec128<int64_t> b) {
|
|
const DFromV<decltype(a)> di64;
|
|
const RebindToUnsigned<decltype(di64)> du64;
|
|
const Half<decltype(dn)> dnh;
|
|
|
|
const auto saturated_a = detail::DemoteFromU64Saturate(
|
|
dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
|
|
const auto saturated_b = detail::DemoteFromU64Saturate(
|
|
dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));
|
|
|
|
return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a,
|
|
Vec128<uint64_t> b) {
|
|
const Half<decltype(dn)> dnh;
|
|
|
|
const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
|
|
const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
|
|
|
|
return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
|
|
}
|
|
#endif // HWY_TARGET > HWY_AVX2
|
|
|
|
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepu16_ph(v.raw)};
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepi16_ph(v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepi32_ps(v.raw)};
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepu32_ps(v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepi64_pd(v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) {
|
|
return VFromD<D>{_mm_cvtepu64_pd(v.raw)};
|
|
}
|
|
#else // AVX2 or below
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_F32_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) {
|
|
// Based on wim's approach (https://stackoverflow.com/questions/34066228/)
|
|
const RebindToUnsigned<decltype(df)> du32;
|
|
const RebindToSigned<decltype(df)> d32;
|
|
|
|
const auto msk_lo = Set(du32, 0xFFFF);
|
|
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
|
|
|
|
// Extract the 16 lowest/highest significant bits of v and cast to signed int
|
|
const auto v_lo = BitCast(d32, And(v, msk_lo));
|
|
const auto v_hi = BitCast(d32, ShiftRight<16>(v));
|
|
return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
|
|
}
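// Worked example (comment only): the decomposition v = hi * 65536 + lo feeds
// two i32->f32 conversions whose inputs lie in [0, 65535] and are therefore
// exact. E.g. 0xFFFFFFFF splits into hi = 0xFFFF and lo = 0xFFFF, so MulAdd
// computes 65535 * 65536 + 65535 = 4294967295, which rounds to the nearest
// float, 4294967296.0f.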
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) {
|
|
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
|
|
const Repartition<uint32_t, decltype(dd)> d32;
|
|
const Repartition<uint64_t, decltype(dd)> d64;
|
|
|
|
// Toggle MSB of lower 32 bits and insert exponent for 2^84 + 2^63
|
|
const auto k84_63 = Set(d64, 0x4530000080000000ULL);
|
|
const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
|
|
|
|
// Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
|
|
const auto k52 = Set(d32, 0x43300000);
|
|
const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
|
|
|
|
const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
|
|
return (v_upper - k84_63_52) + v_lower; // order matters!
|
|
}
|
|
|
|
namespace detail {
|
|
template <class VW>
|
|
HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) {
|
|
const DFromV<decltype(w)> d64;
|
|
const RebindToFloat<decltype(d64)> dd;
|
|
const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52
|
|
return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl;
|
|
}
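// Worked example (comment only): for w = 7, OR-ing with the bit pattern of
// 2^52 (0x4330000000000000) produces the double 2^52 + 7 exactly, because any
// w below 2^52 occupies only mantissa bits; subtracting 2^52 then returns 7.0.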
|
|
} // namespace detail
|
|
|
|
// Generic for all vector lengths.
|
|
template <class D, HWY_IF_F64_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
|
|
// Based on wim's approach (https://stackoverflow.com/questions/41144668/)
|
|
const RebindToUnsigned<decltype(dd)> d64;
|
|
using VU = VFromD<decltype(d64)>;
|
|
|
|
const VU msk_lo = Set(d64, 0xFFFFFFFF);
|
|
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
|
|
|
|
// Extract the 32 lowest/highest significant bits of v
|
|
const VU v_lo = And(v, msk_lo);
|
|
const VU v_hi = ShiftRight<32>(v);
|
|
|
|
const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
|
|
return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// Truncates (rounds toward zero).
|
|
|
|
#if HWY_HAVE_FLOAT16
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
|
|
return detail::FixConversionOverflow(
|
|
di, v, VFromD<RebindToSigned<D>>{_mm_cvttph_epi16(v.raw)});
|
|
}
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
return VFromD<D>{
|
|
_mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
|
|
HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
|
|
return detail::FixConversionOverflow(
|
|
di, v, VFromD<RebindToSigned<D>>{_mm_cvttps_epi32(v.raw)});
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
|
|
HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
|
|
return detail::FixConversionOverflow(di, v,
|
|
VFromD<DI>{_mm_cvttpd_epi64(v.raw)});
|
|
}
|
|
|
|
template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
|
|
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
return VFromD<DU>{
|
|
_mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
|
|
}
|
|
|
|
template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
|
|
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
return VFromD<DU>{
|
|
_mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
|
|
}
|
|
|
|
#else // AVX2 or below
|
|
|
|
template <class DU32, HWY_IF_V_SIZE_LE_D(DU32, 16), HWY_IF_U32_D(DU32)>
|
|
HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
|
|
const RebindToSigned<decltype(du32)> di32;
|
|
const RebindToFloat<decltype(du32)> df32;
|
|
|
|
const auto non_neg_v = ZeroIfNegative(v);
|
|
const auto exp_diff = Set(di32, int32_t{158}) -
|
|
BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v)));
|
|
const auto scale_down_f32_val_mask =
|
|
BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32))));
|
|
|
|
const auto v_scaled = BitCast(
|
|
df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask));
|
|
const VFromD<decltype(du32)> f32_to_u32_result{
|
|
_mm_cvttps_epi32(v_scaled.raw)};
|
|
|
|
return Or(
|
|
BitCast(du32, BroadcastSignBit(exp_diff)),
|
|
f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask));
|
|
}
|
|
|
|
#if HWY_ARCH_X86_64
|
|
template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
|
|
HWY_API VFromD<DI> ConvertTo(DI di, Vec64<double> v) {
|
|
const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
|
|
return detail::FixConversionOverflow(di, v, i0);
|
|
}
|
|
template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
|
|
HWY_API VFromD<DI> ConvertTo(DI di, Vec128<double> v) {
|
|
const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
|
|
const Full64<double> dd2;
|
|
const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
|
|
return detail::FixConversionOverflow(
|
|
di, v, Vec128<int64_t>{_mm_unpacklo_epi64(i0, i1)});
|
|
}
|
|
#endif // HWY_ARCH_X86_64
|
|
|
|
#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
|
|
template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
|
|
HWY_IF_I64_D(DI)>
|
|
HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
|
|
using VI = VFromD<decltype(di)>;
|
|
const RebindToUnsigned<decltype(di)> du;
|
|
using VU = VFromD<decltype(du)>;
|
|
const Repartition<uint16_t, decltype(di)> du16;
|
|
const VI k1075 = Set(di, 1075); /* biased exponent of 2^52 */
|
|
|
|
// Exponent indicates whether the number can be represented as int64_t.
|
|
const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF);
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const auto in_range = BitCast(di, biased_exp) < Set(di, 1086);
|
|
#else
|
|
const Repartition<int32_t, decltype(di)> di32;
|
|
const auto in_range = MaskFromVec(BitCast(
|
|
di,
|
|
VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
|
|
#endif
|
|
|
|
// If we were to cap the exponent at 51 and add 2^52, the number would be in
|
|
// [2^52, 2^53) and mantissa bits could be read out directly. We need to
|
|
// round-to-0 (truncate), but changing rounding mode in MXCSR hits a
|
|
// compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
|
|
// manually shift the mantissa into place (we already have many of the
|
|
// inputs anyway).
|
|
|
|
// Use 16-bit saturated unsigned subtraction to compute shift_mnt and
|
|
// shift_int since biased_exp[i] is a non-negative integer that is less than
|
|
// or equal to 2047.
|
|
|
|
// 16-bit saturated unsigned subtraction is also more efficient than a
|
|
// 64-bit subtraction followed by a 64-bit signed Max operation on
|
|
// SSE2/SSSE3/SSE4/AVX2.
|
|
|
|
// The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
|
|
// zero as the upper 48 bits of both k1075 and biased_exp are zero.
|
|
|
|
const VU shift_mnt = BitCast(
|
|
du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
|
|
const VU shift_int = BitCast(
|
|
du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
|
|
const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1);
|
|
// Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
|
|
// returning zero in that case.
|
|
const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
|
|
|
|
// For inputs larger than 2^53 - 1, insert zeros at the bottom.
|
|
|
|
// For inputs less than 2^63, the implicit 1-bit is guaranteed not to be
|
|
// shifted out of the left shift result below as shift_int[i] <= 10 is true
|
|
// for any inputs that are less than 2^63.
|
|
|
|
const VU shifted = int53 << shift_int;
|
|
|
|
// Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
|
|
const VI sign_mask = BroadcastSignBit(BitCast(di, v));
|
|
const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
|
|
const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit);
|
|
|
|
// If the input was negative, negate the integer (two's complement).
|
|
return (magnitude ^ sign_mask) - sign_mask;
|
|
}
|
|
#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
|
|
|
|
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
template <class DU, HWY_IF_U64_D(DU)>
|
|
HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
|
|
const RebindToSigned<decltype(du)> di;
|
|
using VU = VFromD<decltype(du)>;
|
|
const Repartition<uint16_t, decltype(di)> du16;
|
|
const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */
|
|
|
|
const auto non_neg_v = ZeroIfNegative(v);
|
|
|
|
// Exponent indicates whether the number can be represented as int64_t.
|
|
const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
|
|
#if HWY_TARGET <= HWY_SSE4
|
|
const VU out_of_range =
|
|
BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
|
|
#else
|
|
const Repartition<int32_t, decltype(di)> di32;
|
|
const VU out_of_range = BitCast(
|
|
du,
|
|
VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086)));
|
|
#endif
|
|
|
|
// If we were to cap the exponent at 51 and add 2^52, the number would be in
|
|
// [2^52, 2^53) and mantissa bits could be read out directly. We need to
|
|
// round-to-0 (truncate), but changing rounding mode in MXCSR hits a
|
|
// compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
|
|
// manually shift the mantissa into place (we already have many of the
|
|
// inputs anyway).
|
|
|
|
// Use 16-bit saturated unsigned subtraction to compute shift_mnt and
|
|
// shift_int since biased_exp[i] is a non-negative integer that is less than
|
|
// or equal to 2047.
|
|
|
|
// 16-bit saturated unsigned subtraction is also more efficient than a
|
|
// 64-bit subtraction followed by a 64-bit signed Max operation on
|
|
// SSE2/SSSE3/SSE4/AVX2.
|
|
|
|
// The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
|
|
// zero as the upper 48 bits of both k1075 and biased_exp are zero.
|
|
|
|
const VU shift_mnt = BitCast(
|
|
du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
|
|
const VU shift_int = BitCast(
|
|
du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
|
|
const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1);
|
|
// Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
|
|
// returning zero in that case.
|
|
const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
|
|
|
|
// For inputs larger than 2^53 - 1, insert zeros at the bottom.
|
|
|
|
// For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
|
|
// shifted out of the left shift result below as shift_int[i] <= 11 is true
|
|
// for any inputs that are less than 2^64.
|
|
|
|
const VU shifted = int53 << shift_int;
|
|
return (shifted | out_of_range);
|
|
}
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
|
|
const RebindToSigned<DFromV<decltype(v)>> di;
|
|
return detail::FixConversionOverflow(
|
|
di, v, VFromD<decltype(di)>{_mm_cvtps_epi32(v.raw)});
|
|
}
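// Illustrative sketch (comment only): NearestInt uses the current MXCSR
// rounding mode, which by default is round-to-nearest with ties to even:
//   const Full128<float> df;
//   NearestInt(Set(df, 2.5f));   // all lanes 2 (tie rounds to even)
//   NearestInt(Set(df, 3.5f));   // all lanes 4
//   NearestInt(Set(df, -1.7f));  // all lanes -2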
|
|
|
|
// ------------------------------ Floating-point rounding (ConvertTo)
|
|
|
|
#if HWY_TARGET >= HWY_SSSE3
|
|
|
|
// Toward nearest integer, ties to even
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
// Rely on rounding after addition with a large value such that no mantissa
|
|
// bits remain (assuming the current mode is nearest-even). We may need a
|
|
// compiler flag for precise floating-point to prevent "optimizing" this out.
|
|
const DFromV<decltype(v)> df;
|
|
const auto max = Set(df, MantissaEnd<T>());
|
|
const auto large = CopySignToAbs(max, v);
|
|
const auto added = large + v;
|
|
const auto rounded = added - large;
|
|
// Keep original if NaN or the magnitude is large (already an int).
|
|
return IfThenElse(Abs(v) < max, rounded, v);
|
|
}
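// Worked example (comment only), assuming the default round-to-nearest mode:
// for v = 1.25f, large = 2^23 = 8388608.0f. Adding gives 8388609.25, which is
// not representable because float spacing is 1.0 at that magnitude, so it
// rounds to 8388609.0; subtracting large then yields the correctly rounded
// 1.0f. Inputs with |v| >= 2^23 (or NaN) fail the Abs(v) < max test and are
// returned unchanged.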
|
|
|
|
namespace detail {
|
|
|
|
// Truncating to integer and converting back to float is correct except when
// the input magnitude is large, in which case the input was already an
// integer (the mantissa shifted right by the exponent is zero, i.e. there are
// no fractional bits).
|
|
template <typename T, size_t N>
|
|
HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
const DFromV<decltype(v)> d;
|
|
return Abs(v) < Set(d, MantissaEnd<T>());
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Toward zero, aka truncate
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
const DFromV<decltype(v)> df;
|
|
const RebindToSigned<decltype(df)> di;
|
|
|
|
const auto integer = ConvertTo(di, v); // round toward 0
|
|
const auto int_f = ConvertTo(df, integer);
|
|
|
|
return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
|
|
}
|
|
|
|
// Toward +infinity, aka ceiling
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
const DFromV<decltype(v)> df;
|
|
const RebindToSigned<decltype(df)> di;
|
|
|
|
const auto integer = ConvertTo(di, v); // round toward 0
|
|
const auto int_f = ConvertTo(df, integer);
|
|
|
|
// Truncating a positive non-integer ends up smaller; if so, add 1.
|
|
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
|
|
|
|
return IfThenElse(detail::UseInt(v), int_f - neg1, v);
|
|
}
|
|
|
|
// Toward -infinity, aka floor
|
|
template <typename T, size_t N>
|
|
HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
const DFromV<decltype(v)> df;
|
|
const RebindToSigned<decltype(df)> di;
|
|
|
|
const auto integer = ConvertTo(di, v); // round toward 0
|
|
const auto int_f = ConvertTo(df, integer);
|
|
|
|
// Truncating a negative non-integer ends up larger; if so, subtract 1.
|
|
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
|
|
|
|
return IfThenElse(detail::UseInt(v), int_f + neg1, v);
|
|
}
|
|
|
|
#else
|
|
|
|
// Toward nearest integer, ties to even
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{
|
|
_mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
|
|
return Vec128<float, N>{
|
|
_mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
|
|
return Vec128<double, N>{
|
|
_mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
// Toward zero, aka truncate
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{
|
|
_mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
|
|
return Vec128<float, N>{
|
|
_mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
|
|
return Vec128<double, N>{
|
|
_mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
// Toward +infinity, aka ceiling
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{
|
|
_mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
|
|
return Vec128<float, N>{
|
|
_mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
|
|
return Vec128<double, N>{
|
|
_mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
// Toward -infinity, aka floor
|
|
#if HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) {
|
|
return Vec128<float16_t, N>{
|
|
_mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
#endif // HWY_HAVE_FLOAT16
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
|
|
return Vec128<float, N>{
|
|
_mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
|
|
return Vec128<double, N>{
|
|
_mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
|
|
}
|
|
|
|
#endif  // HWY_TARGET >= HWY_SSSE3
|
|
|
|
// ------------------------------ Floating-point classification
|
|
|
|
#define HWY_X86_FPCLASS_QNAN 0x01
|
|
#define HWY_X86_FPCLASS_POS0 0x02
|
|
#define HWY_X86_FPCLASS_NEG0 0x04
|
|
#define HWY_X86_FPCLASS_POS_INF 0x08
|
|
#define HWY_X86_FPCLASS_NEG_INF 0x10
|
|
#define HWY_X86_FPCLASS_SUBNORMAL 0x20
|
|
#define HWY_X86_FPCLASS_NEG 0x40
|
|
#define HWY_X86_FPCLASS_SNAN 0x80
|
|
|
|
#if HWY_HAVE_FLOAT16 || HWY_IDE
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
|
|
return Mask128<float16_t, N>{
|
|
_mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
|
|
return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
|
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) {
|
|
// fpclass doesn't have a flag for positive, so we have to check for inf/NaN
|
|
// and negate the mask.
|
|
return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask(
|
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
|
|
HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
|
|
}
|
|
|
|
#endif // HWY_HAVE_FLOAT16
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Mask128<float, N>{
|
|
_mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
#else
|
|
return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
|
|
#endif
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return Mask128<double, N>{
|
|
_mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
#else
|
|
return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
|
|
#endif
|
|
}
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
|
|
// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
|
|
#ifdef HWY_NATIVE_ISINF
|
|
#undef HWY_NATIVE_ISINF
|
|
#else
|
|
#define HWY_NATIVE_ISINF
|
|
#endif
|
|
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
|
|
return Mask128<float, N>{_mm_fpclass_ps_mask(
|
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
|
|
return Mask128<double, N>{_mm_fpclass_pd_mask(
|
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
}
|
|
|
|
// Returns whether normal/subnormal/zero.
|
|
template <size_t N>
|
|
HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
|
|
// fpclass doesn't have a flag for positive, so we have to check for inf/NaN
|
|
// and negate the mask.
|
|
return Not(Mask128<float, N>{_mm_fpclass_ps_mask(
|
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
|
|
HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
|
|
}
|
|
template <size_t N>
|
|
HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
|
|
return Not(Mask128<double, N>{_mm_fpclass_pd_mask(
|
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
|
|
HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3

// ================================================== CRYPTO

#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4

// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
}

HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
}

HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)};
}

HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)};
}

HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
                                        Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)};
}

template <uint8_t kRcon>
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)};
}

template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
}

template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
}

#endif  // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
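
// Example (sketch only, assuming HWY_NATIVE_AES is active on this target):
// AESRound performs ShiftRows+SubBytes+MixColumns followed by the round-key
// XOR, so a full AES-128 encryption is the initial AddRoundKey (a plain Xor),
// nine AESRound calls and one AESLastRound. `EncryptBlockAES128` and
// `round_keys` are hypothetical names; key expansion is not shown.
//
//   Vec128<uint8_t> EncryptBlockAES128(
//       Vec128<uint8_t> block, const Vec128<uint8_t> (&round_keys)[11]) {
//     Vec128<uint8_t> state = Xor(block, round_keys[0]);
//     for (int i = 1; i < 10; ++i) {
//       state = AESRound(state, round_keys[i]);
//     }
//     return AESLastRound(state, round_keys[10]);
//   }
//
// Similarly, CLMulLower/CLMulUpper multiply the lower/upper 64-bit halves as
// polynomials over GF(2), the building block for GHASH- and CRC-style hashes.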
|
|
|
|
// ================================================== MISC
|
|
|
|
// ------------------------------ LoadMaskBits (TestBit)
|
|
|
|
#if HWY_TARGET > HWY_AVX3
|
|
namespace detail {
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
// Easier than Set(), which would require an >8-bit type, which would not
|
|
// compile for T=uint8_t, kN=1.
|
|
const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
|
|
|
|
#if HWY_TARGET == HWY_SSE2
|
|
// {b0, b1, ...} ===> {b0, b0, b1, b1, ...}
|
|
__m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw);
|
|
// {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...}
|
|
unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits);
|
|
// {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==>
|
|
// {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1}
|
|
const VFromD<decltype(du)> rep8{
|
|
_mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)};
|
|
#else
|
|
// Replicate bytes 8x such that each byte contains the bit that governs it.
|
|
alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
|
|
1, 1, 1, 1, 1, 1, 1, 1};
|
|
const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
|
|
#endif
|
|
const VFromD<decltype(du)> bit = Dup128VecFromValues(
|
|
du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
|
|
return RebindMask(d, TestBit(rep8, bit));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
|
const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
|
|
return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
|
|
const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
|
|
return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
|
|
return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
|
|
}
|
|
|
|
} // namespace detail
|
|
#endif // HWY_TARGET > HWY_AVX3
|
|
|
|
// `p` points to at least 8 readable bytes, not all of which need be valid.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
|
|
constexpr size_t kN = MaxLanes(d);
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
(void)d;
|
|
uint64_t mask_bits = 0;
|
|
constexpr size_t kNumBytes = (kN + 7) / 8;
|
|
CopyBytes<kNumBytes>(bits, &mask_bits);
|
|
if (kN < 8) {
|
|
mask_bits &= (1ull << kN) - 1;
|
|
}
|
|
|
|
return MFromD<D>::FromBits(mask_bits);
|
|
#else
|
|
uint64_t mask_bits = 0;
|
|
constexpr size_t kNumBytes = (kN + 7) / 8;
|
|
CopyBytes<kNumBytes>(bits, &mask_bits);
|
|
if (kN < 8) {
|
|
mask_bits &= (1ull << kN) - 1;
|
|
}
|
|
|
|
return detail::LoadMaskBits128(d, mask_bits);
|
|
#endif
|
|
}
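
// Example (sketch): LoadMaskBits turns a packed bit array (LSB = lane 0) into
// a mask. With the hypothetical values below and a full 128-bit u16 vector
// (8 lanes), bit pattern 0b10110101 selects lanes 0, 2, 4, 5 and 7.
//
//   const Full128<uint16_t> d;
//   const uint8_t bits[8] = {0xB5};  // only the first (kN+7)/8 bytes are read
//   const auto m = LoadMaskBits(d, bits);
//   // CountTrue(d, m) == 5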
|
|
|
|
// ------------------------------ Dup128MaskFromMaskBits
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
constexpr size_t kN = MaxLanes(d);
|
|
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
return MFromD<D>::FromBits(mask_bits);
|
|
#else
|
|
return detail::LoadMaskBits128(d, mask_bits);
|
|
#endif
|
|
}
|
|
|
|
template <typename T>
|
|
struct CompressIsPartition {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
// AVX3 supports native compress, but a table-based approach allows
|
|
// 'partitioning' (also moving mask=false lanes to the top), which helps
|
|
// vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8
|
|
// on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
|
|
// u32x8 etc.).
|
|
enum { value = (sizeof(T) == 8) };
|
|
#else
|
|
// generic_ops-inl does not guarantee IsPartition for 8-bit.
|
|
enum { value = (sizeof(T) != 1) };
|
|
#endif
|
|
};

#if HWY_TARGET <= HWY_AVX3

// ------------------------------ StoreMaskBits

// `p` points to at least 8 writable bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  constexpr size_t kN = MaxLanes(d);
  constexpr size_t kNumBytes = (kN + 7) / 8;
  CopyBytes<kNumBytes>(&mask.raw, bits);

  // Non-full byte, need to clear the undefined upper bits.
  if (kN < 8) {
    const int mask_bits = (1 << kN) - 1;
    bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
  }

  return kNumBytes;
}

// ------------------------------ Mask testing

// Beware: the suffix indicates the number of mask bits, not lane size!

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  return PopCount(mask_bits);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                   : -1;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  return mask_bits == 0;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  // Cannot use _kortestc because we may have less than 8 mask bits.
  return mask_bits == (1ull << kN) - 1;
}
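
// Example (sketch): the mask-testing ops compose with comparisons. Values
// below are hypothetical.
//
//   const Full128<int32_t> d;
//   const auto v = Dup128VecFromValues(d, 3, -1, 7, -5);
//   const auto m = Lt(v, Zero(d));  // lanes {0, 1, 0, 1}
//   // CountTrue(d, m) == 2, FindFirstTrue(d, m) == 1, FindLastTrue(d, m) == 3,
//   // AllTrue(d, m) == false, AllFalse(d, m) == false.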

// ------------------------------ Compress

// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512.

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  HWY_DASSERT(mask.raw < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto index = Load(d8, u8_indices + 16 * mask.raw);
  return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
}

// ------------------------------ CompressNot (Compress)

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  // See CompressIsPartition, PrintCompressNot64x2NibbleTables
  alignas(16) static constexpr uint64_t packed_array[16] = {
      0x00000010, 0x00000001, 0x00000010, 0x00000010};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
  // _mm_permutexvar_epi64 will ignore the upper bits.
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(16) static constexpr uint64_t shifts[2] = {0, 4};
  const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// ------------------------------ CompressBlocksNot
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}

// ------------------------------ CompressStore (defined in x86_512)

// ------------------------------ CompressBlendedStore (CompressStore)
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  // AVX-512 already does the blending at no extra cost (latency 11,
  // rthroughput 2 - same as compress plus store).
  if (HWY_TARGET == HWY_AVX3_DL ||
      (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
    // We're relying on the mask to blend. Clear the undefined upper bits.
    constexpr size_t kN = MaxLanes(d);
    if (kN != 16 / sizeof(TFromD<D>)) {
      m = And(m, FirstN(d, kN));
    }
    return CompressStore(v, m, d, unaligned);
  } else {
    const size_t count = CountTrue(d, m);
    const VFromD<D> compressed = Compress(v, m);
#if HWY_MEM_OPS_MIGHT_FAULT
    // BlendedStore tests mask for each lane, but we know that the mask is
    // FirstN, so we can just copy.
    alignas(16) TFromD<D> buf[MaxLanes(d)];
    Store(compressed, d, buf);
    CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
#else
    BlendedStore(compressed, FirstN(d, count), d, unaligned);
#endif
    detail::MaybeUnpoison(unaligned, count);
    return count;
  }
}
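
// Example (sketch): CompressBlendedStore is the building block for stream
// filtering: it packs the selected lanes to the front and writes only `count`
// lanes, leaving the rest of the destination untouched. Names below are
// hypothetical.
//
//   // Copies only the non-negative elements of `in` to `out`; returns the
//   // number of elements written. Assumes len is a multiple of Lanes(d).
//   size_t CopyNonNegative(const float* in, size_t len, float* out) {
//     const ScalableTag<float> d;
//     size_t written = 0;
//     for (size_t i = 0; i < len; i += Lanes(d)) {
//       const auto v = LoadU(d, in + i);
//       written += CompressBlendedStore(v, Ge(v, Zero(d)), d, out + written);
//     }
//     return written;
//   }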

// ------------------------------ CompressBitsStore (defined in x86_512)

#else // AVX2 or below
|
|
|
|
// ------------------------------ StoreMaskBits
|
|
|
|
namespace detail {
|
|
|
|
constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
|
|
return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
|
|
const Mask128<T, N> mask) {
|
|
const Simd<T, N, 0> d;
|
|
const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
|
|
return U64FromInt(_mm_movemask_epi8(sign_bits));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
|
|
const Mask128<T, N> mask) {
|
|
// Remove useless lower half of each u16 while preserving the sign bit.
|
|
const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
|
|
return U64FromInt(_mm_movemask_epi8(sign_bits));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
const Simd<T, N, 0> d;
|
|
const Simd<float, N, 0> df;
|
|
const auto sign_bits = BitCast(df, VecFromMask(d, mask));
|
|
return U64FromInt(_mm_movemask_ps(sign_bits.raw));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
const Simd<T, N, 0> d;
|
|
const Simd<double, N, 0> df;
|
|
const auto sign_bits = BitCast(df, VecFromMask(d, mask));
|
|
return U64FromInt(_mm_movemask_pd(sign_bits.raw));
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
|
|
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// `p` points to at least 8 writable bytes.
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
|
|
constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
CopyBytes<kNumBytes>(&mask_bits, bits);
|
|
return kNumBytes;
|
|
}
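
// Example (sketch): StoreMaskBits and LoadMaskBits round-trip a mask through
// a packed bit array (LSB = lane 0), which is useful for serializing masks.
// Values are hypothetical.
//
//   const Full128<uint8_t> d;               // 16 lanes -> 2 bytes of bits
//   uint8_t bits[8] = {0};                  // destination: at least 8 bytes
//   const size_t num_bytes = StoreMaskBits(d, FirstN(d, 3), bits);
//   // num_bytes == 2, bits[0] == 0x07, bits[1] == 0x00
//   const auto m2 = LoadMaskBits(d, bits);  // same mask as FirstN(d, 3)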
|
|
|
|
// ------------------------------ Mask testing
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
|
|
// Cheaper than PTEST, which is 2 uop / 3L.
|
|
return detail::BitsFromMask(mask) == 0;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
|
|
return detail::BitsFromMask(mask) == kAllBits;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
|
|
return PopCount(detail::BitsFromMask(mask));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
|
|
return Num0BitsBelowLS1Bit_Nonzero32(
|
|
static_cast<uint32_t>(detail::BitsFromMask(mask)));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
|
|
const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
|
|
return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
|
|
return 31 - Num0BitsAboveMS1Bit_Nonzero32(
|
|
static_cast<uint32_t>(detail::BitsFromMask(mask)));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
|
|
const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
|
|
return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
|
|
: -1;
|
|
}
|
|
|
|
// ------------------------------ Compress, CompressBits
|
|
|
|
namespace detail {
|
|
|
|
// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 256);
|
|
const Rebind<uint8_t, decltype(d)> d8;
|
|
const Twice<decltype(d8)> d8t;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
|
|
// byte indices for PSHUFB (one vector's worth for each of 256 combinations of
|
|
// 8 mask bits). Loading them directly would require 4 KiB. We can instead
|
|
// store lane indices and convert to byte indices (2*lane + 0..1), with the
|
|
// doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
|
|
// indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
|
|
// Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
|
|
// is likely more costly than the higher cache footprint from storing bytes.
|
|
alignas(16) static constexpr uint8_t table[2048] = {
|
|
// PrintCompress16x8Tables
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
|
|
2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, //
|
|
2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, //
|
|
4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, //
|
|
2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, //
|
|
2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, //
|
|
4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, //
|
|
2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, //
|
|
6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, //
|
|
2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, //
|
|
4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, //
|
|
2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, //
|
|
2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, //
|
|
4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, //
|
|
2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, //
|
|
6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, //
|
|
2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, //
|
|
4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, //
|
|
2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, //
|
|
8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, //
|
|
2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, //
|
|
4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, //
|
|
2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, //
|
|
6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, //
|
|
2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, //
|
|
4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, //
|
|
2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, //
|
|
2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, //
|
|
4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, //
|
|
2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, //
|
|
6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, //
|
|
2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, //
|
|
4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, //
|
|
2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, //
|
|
8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, //
|
|
2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, //
|
|
4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, //
|
|
2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, //
|
|
6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, //
|
|
2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, //
|
|
4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, //
|
|
2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, //
|
|
10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, //
|
|
2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, //
|
|
4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, //
|
|
2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, //
|
|
6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, //
|
|
2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, //
|
|
4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, //
|
|
2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, //
|
|
8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, //
|
|
2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, //
|
|
4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, //
|
|
2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, //
|
|
6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, //
|
|
2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, //
|
|
4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, //
|
|
2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, //
|
|
2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, //
|
|
4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, //
|
|
2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, //
|
|
6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, //
|
|
2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, //
|
|
4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, //
|
|
2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, //
|
|
8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, //
|
|
2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, //
|
|
4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, //
|
|
2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, //
|
|
6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, //
|
|
2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, //
|
|
4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, //
|
|
2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, //
|
|
10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, //
|
|
2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, //
|
|
4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, //
|
|
2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, //
|
|
6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, //
|
|
2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, //
|
|
4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, //
|
|
2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, //
|
|
8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, //
|
|
2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, //
|
|
4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, //
|
|
2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, //
|
|
6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, //
|
|
2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, //
|
|
4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, //
|
|
2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, //
|
|
12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, //
|
|
2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, //
|
|
4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, //
|
|
2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, //
|
|
6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, //
|
|
2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, //
|
|
4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, //
|
|
2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, //
|
|
8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, //
|
|
2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, //
|
|
4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, //
|
|
2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, //
|
|
6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, //
|
|
2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, //
|
|
4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, //
|
|
2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, //
|
|
10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, //
|
|
2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, //
|
|
4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, //
|
|
2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, //
|
|
6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, //
|
|
2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, //
|
|
4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, //
|
|
2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, //
|
|
8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, //
|
|
2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, //
|
|
4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, //
|
|
2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, //
|
|
6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, //
|
|
2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, //
|
|
4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, //
|
|
2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
|
|
|
|
const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
|
|
return BitCast(d, pairs + Set(du, 0x0100));
|
|
}
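
// Worked example (exposition only): suppose mask_bits selects lanes {1, 3},
// i.e. mask_bits = 10. That table row begins with lane indices already
// doubled: {2, 6, 0, 4, 8, 10, 12, 14}. ZipLower duplicates each byte into a
// u16 (e.g. 0x0202), and adding 0x0100 turns it into the little-endian byte
// pair {2, 3}, i.e. the two bytes of 16-bit source lane 1. The resulting byte
// indices {2,3, 6,7, 0,1, 4,5, ...} feed TableLookupBytes, which moves lanes
// 1 and 3 to the front of the vector.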
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 256);
|
|
const Rebind<uint8_t, decltype(d)> d8;
|
|
const Twice<decltype(d8)> d8t;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
|
|
// byte indices for PSHUFB (one vector's worth for each of 256 combinations of
|
|
// 8 mask bits). Loading them directly would require 4 KiB. We can instead
|
|
// store lane indices and convert to byte indices (2*lane + 0..1), with the
|
|
// doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
|
|
// indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
|
|
// Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
|
|
// is likely more costly than the higher cache footprint from storing bytes.
|
|
alignas(16) static constexpr uint8_t table[2048] = {
|
|
// PrintCompressNot16x8Tables
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
|
|
0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
|
|
0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
|
|
0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
|
|
0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
|
|
0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
|
|
0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
|
|
0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
|
|
0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
|
|
0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
|
|
0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
|
|
0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
|
|
0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
|
|
0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
|
|
0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
|
|
0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
|
|
0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
|
|
0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
|
|
0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
|
|
0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
|
|
0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
|
|
0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
|
|
0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
|
|
0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
|
|
0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
|
|
0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
|
|
0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
|
|
0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
|
|
0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
|
|
0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
|
|
0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
|
|
0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
|
|
0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
|
|
0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
|
|
0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
|
|
0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
|
|
0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
|
|
0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
|
|
0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
|
|
0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
|
|
0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
|
|
0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
|
|
0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
|
|
0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
|
|
0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
|
|
0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
|
|
0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
|
|
0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
|
|
0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
|
|
0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
|
|
0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
|
|
0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
|
|
0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
|
|
0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
|
|
0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
|
|
0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
|
|
0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
|
|
0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
|
|
0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
|
|
0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
|
|
0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
|
|
0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
|
|
0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
|
|
0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
|
|
0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
|
|
0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
|
|
0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
|
|
0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
|
|
0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
|
|
0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
|
|
0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
|
|
0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
|
|
0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
|
|
0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
|
|
0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
|
|
0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
|
|
0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
|
|
0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
|
|
0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
|
|
0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
|
|
0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
|
|
0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
|
|
0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
|
|
0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
|
|
0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
|
|
0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
|
|
0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
|
|
0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
|
|
0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
|
|
0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
|
|
0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
|
|
0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
|
|
0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
|
|
0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
|
|
0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
|
|
0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
|
|
0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
|
|
0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
|
|
0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
|
|
0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
|
|
0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
|
|
0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
|
|
0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
|
|
0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
|
|
0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
|
|
0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
|
|
0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
|
|
0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
|
|
0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
|
|
0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
|
|
0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
|
|
0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
|
|
0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
|
|
0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
|
|
0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
|
|
0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
|
|
0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
|
|
0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
|
|
0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
|
|
0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
|
|
0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
|
|
|
|
const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
|
|
return BitCast(d, pairs + Set(du, 0x0100));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
// There are only 4 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[256] = {
|
|
// PrintCompress32x4Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
|
|
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
|
|
0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
|
|
4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
|
|
0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
|
|
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
// There are only 4 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[256] = {
|
|
// PrintCompressNot32x4Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
|
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
|
|
14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
|
|
2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
|
|
2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
|
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 4);
|
|
|
|
// There are only 2 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[64] = {
|
|
// PrintCompress64x2Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 4);
|
|
|
|
// There are only 2 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[64] = {
|
|
// PrintCompressNot64x2Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
HWY_DASSERT(mask_bits < (1ull << N));
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
HWY_DASSERT(mask_bits < (1ull << N));
|
|
const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
|
|
return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Single lane: no-op
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
|
|
return v;
|
|
}
|
|
|
|
// Two lanes: conditional swap
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
|
|
// If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
|
|
const DFromV<decltype(v)> d;
|
|
const Vec128<T> m = VecFromMask(d, mask);
|
|
const Vec128<T> maskL = DupEven(m);
|
|
const Vec128<T> maskH = DupOdd(m);
|
|
const Vec128<T> swap = AndNot(maskL, maskH);
|
|
return IfVecThenElse(swap, Shuffle01(v), v);
|
|
}
|
|
|
|
// General case, 2 or 4 bytes
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
return detail::CompressBits(v, detail::BitsFromMask(mask));
|
|
}
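
// Example (sketch): Compress left-packs the lanes whose mask bit is set; the
// first CountTrue(d, m) result lanes are the selected elements in order, and
// whether the remaining lanes hold the rejected elements is indicated by
// CompressIsPartition<T>. Values are hypothetical.
//
//   const Full128<uint32_t> d;
//   const auto v = Dup128VecFromValues(d, 10u, 20u, 30u, 40u);
//   const auto m = Dup128MaskFromMaskBits(d, 0x5);  // lanes 0 and 2
//   const auto packed = Compress(v, m);
//   // The first CountTrue(d, m) == 2 lanes of `packed` are {10, 30}.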
|
|
|
|
// ------------------------------ CompressNot
|
|
|
|
// Single lane: no-op
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
|
|
return v;
|
|
}
|
|
|
|
// Two lanes: conditional swap
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
|
|
// If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
|
|
const DFromV<decltype(v)> d;
|
|
const Vec128<T> m = VecFromMask(d, mask);
|
|
const Vec128<T> maskL = DupEven(m);
|
|
const Vec128<T> maskH = DupOdd(m);
|
|
const Vec128<T> swap = AndNot(maskH, maskL);
|
|
return IfVecThenElse(swap, Shuffle01(v), v);
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
// For partial vectors, we cannot pull the Not() into the table because
|
|
// BitsFromMask clears the upper bits.
|
|
if (N < 16 / sizeof(T)) {
|
|
return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
|
|
}
|
|
return detail::CompressNotBits(v, detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressBlocksNot
|
|
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
|
|
Mask128<uint64_t> /* m */) {
|
|
return v;
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
|
|
const uint8_t* HWY_RESTRICT bits) {
|
|
uint64_t mask_bits = 0;
|
|
constexpr size_t kNumBytes = (N + 7) / 8;
|
|
CopyBytes<kNumBytes>(bits, &mask_bits);
|
|
if (N < 8) {
|
|
mask_bits &= (1ull << N) - 1;
|
|
}
|
|
|
|
return detail::CompressBits(v, mask_bits);
|
|
}
|
|
|
|
// ------------------------------ CompressStore, CompressBitsStore
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
const uint64_t mask_bits = detail::BitsFromMask(m);
|
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
// Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
StoreU(compressed, d, unaligned);
|
|
detail::MaybeUnpoison(unaligned, count);
|
|
return count;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
const uint64_t mask_bits = detail::BitsFromMask(m);
|
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
// Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
BlendedStore(compressed, FirstN(d, count), d, unaligned);
|
|
detail::MaybeUnpoison(unaligned, count);
|
|
return count;
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
|
|
D d, TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
uint64_t mask_bits = 0;
|
|
constexpr size_t kN = MaxLanes(d);
|
|
constexpr size_t kNumBytes = (kN + 7) / 8;
|
|
CopyBytes<kNumBytes>(bits, &mask_bits);
|
|
if (kN < 8) {
|
|
mask_bits &= (1ull << kN) - 1;
|
|
}
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
// Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
StoreU(compressed, d, unaligned);
|
|
|
|
detail::MaybeUnpoison(unaligned, count);
|
|
return count;
|
|
}
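
// Example (sketch): CompressBitsStore is CompressStore with the mask already
// in packed-bit form (LSB = lane 0), e.g. bits written earlier by
// StoreMaskBits. With 8 lanes, each vector consumes exactly one byte of bits.
// Names and values are hypothetical.
//
//   const Full128<uint16_t> d;        // 8 lanes
//   const uint8_t bits[8] = {0x0F};   // keep lanes 0..3
//   uint16_t out[8];
//   const size_t n = CompressBitsStore(LoadU(d, in), bits, d, out);
//   // n == 4; the first four elements of `out` are lanes 0..3 of the input.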
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
|
|
// ------------------------------ Expand
|
|
|
|
// Otherwise, use the generic_ops-inl.h fallback.
|
|
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
|
|
|
|
// The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL),
|
|
// but we still want to override generic_ops-inl's table-based implementation
|
|
// whenever we have the 32-bit expand provided by AVX3.
|
|
#ifdef HWY_NATIVE_EXPAND
|
|
#undef HWY_NATIVE_EXPAND
|
|
#else
|
|
#define HWY_NATIVE_EXPAND
|
|
#endif
|
|
|
|
namespace detail {
|
|
|
|
#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v,
|
|
Mask128<uint8_t, N> mask) {
|
|
return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v,
|
|
Mask128<uint16_t, N> mask) {
|
|
return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
|
|
HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
|
|
const uint8_t* HWY_RESTRICT unaligned) {
|
|
return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
|
|
HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
|
|
const uint16_t* HWY_RESTRICT unaligned) {
|
|
return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)};
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v,
|
|
Mask128<uint32_t, N> mask) {
|
|
return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v,
|
|
Mask128<uint64_t, N> mask) {
|
|
return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
|
|
HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
|
|
const uint32_t* HWY_RESTRICT unaligned) {
|
|
return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)};
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
|
|
const uint64_t* HWY_RESTRICT unaligned) {
|
|
return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)};
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo.
|
|
#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const MFromD<decltype(du)> mu = RebindMask(du, mask);
|
|
return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
|
|
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const MFromD<decltype(du)> mu = RebindMask(du, mask);
|
|
return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
|
|
}
|
|
|
|
// ------------------------------ LoadExpand
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT unaligned) {
|
|
#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using TU = TFromD<decltype(du)>;
|
|
const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
|
|
const MFromD<decltype(du)> mu = RebindMask(du, mask);
|
|
return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
|
|
#else
|
|
return Expand(LoadU(d, unaligned), mask);
|
|
#endif
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
|
|
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
|
|
const TFromD<D>* HWY_RESTRICT unaligned) {
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using TU = TFromD<decltype(du)>;
|
|
const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
|
|
const MFromD<decltype(du)> mu = RebindMask(du, mask);
|
|
return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
|
|
#else
|
|
return Expand(LoadU(d, unaligned), mask);
|
|
#endif
|
|
}
|
|
|
|
#endif // HWY_TARGET <= HWY_AVX3
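
// Example (sketch): Expand is the inverse of Compress: the i-th packed input
// lane is moved to the i-th position whose mask bit is set; other lanes become
// zero. LoadExpand fuses the load, which on AVX3 targets maps to expandloadu.
// Values are hypothetical.
//
//   const Full128<uint32_t> d;
//   const auto v = Dup128VecFromValues(d, 1u, 2u, 3u, 4u);
//   const auto m = Dup128MaskFromMaskBits(d, 0x6);  // lanes 1 and 2
//   const auto e = Expand(v, m);
//   // e == {0, 1, 2, 0}: the first two input lanes land in lanes 1 and 2.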
|
|
|
|
// ------------------------------ StoreInterleaved2/3/4
|
|
|
|
// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
|
|
// generic_ops-inl.h.
|
|
|
|
// ------------------------------ Additional mask logical operations
|
|
|
|
#if HWY_TARGET <= HWY_AVX3
|
|
namespace detail {
|
|
|
|
template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
|
|
static HWY_INLINE uint32_t AVX3Blsi(T x) {
|
|
using TU = MakeUnsigned<T>;
|
|
const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
|
|
#if HWY_COMPILER_CLANGCL
|
|
return static_cast<uint32_t>(u32_val & (0u - u32_val));
|
|
#else
|
|
return static_cast<uint32_t>(_blsi_u32(u32_val));
|
|
#endif
|
|
}
|
|
template <class T, HWY_IF_T_SIZE(T, 8)>
|
|
static HWY_INLINE uint64_t AVX3Blsi(T x) {
|
|
const auto u64_val = static_cast<uint64_t>(x);
|
|
#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
|
|
return static_cast<uint64_t>(u64_val & (0ULL - u64_val));
|
|
#else
|
|
return static_cast<uint64_t>(_blsi_u64(u64_val));
|
|
#endif
|
|
}
|
|
|
|
template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
|
|
static HWY_INLINE uint32_t AVX3Blsmsk(T x) {
|
|
using TU = MakeUnsigned<T>;
|
|
const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
|
|
#if HWY_COMPILER_CLANGCL
|
|
return static_cast<uint32_t>(u32_val ^ (u32_val - 1u));
|
|
#else
|
|
return static_cast<uint32_t>(_blsmsk_u32(u32_val));
|
|
#endif
|
|
}
|
|
template <class T, HWY_IF_T_SIZE(T, 8)>
|
|
static HWY_INLINE uint64_t AVX3Blsmsk(T x) {
|
|
const auto u64_val = static_cast<uint64_t>(x);
|
|
#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
|
|
return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL));
|
|
#else
|
|
return static_cast<uint64_t>(_blsmsk_u64(u64_val));
|
|
#endif
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
template <class T, size_t N>
|
|
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
|
|
constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
|
|
return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
|
|
(0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)};
|
|
}
|
|
template <class T, size_t N>
|
|
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
|
|
constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
|
|
return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
|
|
(detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)};
|
|
}
|
|
template <class T, size_t N>
|
|
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
|
|
return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
|
|
detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)};
|
|
}
|
|
template <class T, size_t N>
|
|
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
|
|
return Mask128<T, N>{
|
|
static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))};
|
|
}
|
|
#else // AVX2 or below
|
|
template <class T>
|
|
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
|
|
return mask;
|
|
}
|
|
template <class T>
|
|
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
|
|
const FixedTag<T, 2> d;
|
|
const auto vmask = VecFromMask(d, mask);
|
|
return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
|
|
}
|
|
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
|
|
const Simd<T, N, 0> d;
|
|
const auto vmask = VecFromMask(d, mask);
|
|
const auto neg_vmask =
|
|
ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
|
|
return MaskFromVec(Or(vmask, neg_vmask));
|
|
}
|
|
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
|
|
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
|
|
const Full128<T> d;
|
|
const Repartition<int64_t, decltype(d)> di64;
|
|
const Repartition<float, decltype(d)> df32;
|
|
const Repartition<int32_t, decltype(d)> di32;
|
|
using VF = VFromD<decltype(df32)>;
|
|
|
|
auto vmask = BitCast(di64, VecFromMask(d, mask));
|
|
vmask = Or(vmask, Neg(vmask));
|
|
|
|
// Copy the sign bit of the first int64_t lane to the second int64_t lane
|
|
const auto vmask2 = BroadcastSignBit(
|
|
BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw,
|
|
_MM_SHUFFLE(1, 1, 0, 0))}));
|
|
return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2))));
|
|
}
|
|
|
|
template <class T, size_t N>
|
|
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
|
|
return Not(SetAtOrAfterFirst(mask));
|
|
}
|
|
|
|
template <class T>
|
|
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
|
|
return mask;
|
|
}
|
|
template <class T>
|
|
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
|
|
const FixedTag<T, 2> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
|
|
const auto vmask = BitCast(di, VecFromMask(d, mask));
|
|
const auto zero = Zero(di);
|
|
const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
|
|
return MaskFromVec(BitCast(d, And(vmask, vmask2)));
|
|
}
|
|
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
|
|
const Simd<T, N, 0> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
|
|
const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
|
|
const auto only_first_vmask =
|
|
BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
|
|
return MaskFromVec(only_first_vmask);
|
|
}
|
|
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
|
|
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
|
|
const Full128<T> d;
|
|
const RebindToSigned<decltype(d)> di;
|
|
const Repartition<int64_t, decltype(d)> di64;
|
|
|
|
const auto zero = Zero(di64);
|
|
const auto vmask = BitCast(di64, VecFromMask(d, mask));
|
|
const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
|
|
const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
|
|
return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
|
|
}

template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;

  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}
#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ Reductions

// Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum.

// We provide specializations of u8x8 and u8x16, so exclude those.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D)                                        \
  HWY_IF_LANES_GT_D(D, 1),                                              \
      hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() ||               \
                    (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
          nullptr

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
}
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> d64;
  VFromD<decltype(d64)> sums = SumsOf8(v);
  sums = SumOfLanes(d64, sums);
  return Broadcast<0>(BitCast(d, sums));
}
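
// SumsOf8 reduces each group of 8 consecutive u8 lanes to one u64 (on x86
// typically via the psadbw instruction against zero), so the 8-lane case
// reads a single u64 and the 16-lane case first adds the two partial u64
// sums. The result wraps modulo 256 as u8 addition would: e.g. sixteen lanes
// of 255 sum to 4080, which is stored as 4080 & 0xFF = 240 in every lane.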

#if HWY_TARGET <= HWY_SSE4
// We provide specializations of u8x8, u8x16, and u16x8, so exclude those.
#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D)                                        \
  HWY_IF_LANES_GT_D(D, 1),                                                 \
      hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() ||                 \
                     ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \
                    (!hwy::IsSame<TFromD<D>, uint16_t>() ||                \
                     (HWY_V_SIZE_D(D) != 16))>* = nullptr

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) {
  return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)});
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) {
  const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>());
  return max - MinOfLanes(d, max - v);
}
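
// SSE4 only provides a horizontal minimum for u16 (_mm_minpos_epu16, whose
// minimum ends up in lane 0, hence the Broadcast<0>). MaxOfLanes therefore
// uses the identity max(v) = 65535 - min(65535 - v): e.g. for lanes
// {1, 7, 3}, 65535 - v = {65534, 65528, 65532}, the minimum is 65528, and
// 65535 - 65528 recovers the maximum 7.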

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) {
  const Rebind<uint16_t, decltype(d)> d16;
  return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v)));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) {
  const Half<decltype(d)> dh;
  Vec64<uint8_t> result =
      Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v)));
  return Combine(d, result, result);
}
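
// There is no u8 horizontal minimum instruction: the u8x8 overload widens to
// u16 and reuses the u16 path, while the u8x16 overload reduces each 64-bit
// half separately and combines the two broadcast results with Min.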

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) {
  const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
  return m - MinOfLanes(d, m - v);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
  const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
  return m - MinOfLanes(d, m - v);
}

#endif  // HWY_TARGET <= HWY_SSE4

// ------------------------------ Lt128

namespace detail {

// Returns vector-mask for Lt128. Generic for all vector lengths.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) {
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const auto eqHL = Eq(a, b);
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
  const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL);
  return InterleaveUpper(d, vecHx, vecHx);
}
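
// Worked example (illustrative), with each 128-bit number stored as
// {lo, hi} u64 lanes: for a = {5, 1} and b = {2, 1}, the high halves are
// equal, so eqHL selects the shifted low-half comparison (5 < 2 is false)
// and the result is all-zero in both lanes. For a = {5, 0} and b = {2, 1},
// the high-half comparison 0 < 1 decides, and InterleaveUpper broadcasts its
// all-ones lane so both lanes of the result are set.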

// Returns vector-mask for Eq128. Generic for all vector lengths.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}
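
// Usage sketch (illustrative; names such as kA/kB are hypothetical):
//   const Full128<uint64_t> d;
//   const auto a = LoadU(d, kA);  // kA[0] = low 64 bits, kA[1] = high 64 bits
//   const auto b = LoadU(d, kB);
//   const auto lt = Lt128(d, a, b);  // both lanes true iff a < b as a u128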

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}
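
// Because Lt128Vec already replicates its all-ones/all-zero result to both
// lanes of each 128-bit block, Min128/Max128 can feed it straight into
// IfVecThenElse and select whole {lo, hi} pairs without a mask round trip.
// E.g. Min128(d, {5, 1}, {2, 1}) yields {2, 1}, the smaller 128-bit value.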

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#if HWY_TARGET <= HWY_AVX3

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi32(v.raw)};
}

template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi64(v.raw)};
}
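
// Note: VPLZCNTD/VPLZCNTQ define the count of a zero input as the lane width,
// so LeadingZeroCount(Zero(d)) is 32 per u32 lane and 64 per u64 lane, while
// LeadingZeroCount(Set(d, 1)) is 31 and 63 respectively.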

// HighestSetBitIndex and TrailingZeroCount are implemented in x86_512-inl.h
// for AVX3 targets.

#endif  // HWY_TARGET <= HWY_AVX3

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#undef HWY_X86_IF_EMULATED_D

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h
// includes - the warning seems to be issued at the call site of the
// intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)