// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>  // std::fill
#include <cmath>      // std::isfinite

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Skipped under MSAN: instrumentation causes a build timeout.
#if !HWY_IS_MSAN

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T, HWY_IF_FLOAT(T)>
bool IsFiniteT(T t) {
  return std::isfinite(t);
}
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
template <typename T, HWY_IF_NOT_FLOAT(T)>
bool IsFiniteT(T /*unused*/) {
  return true;
}

template <typename ToT>
struct TestDemoteTo {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);
    HWY_ASSERT(from && expected);

    // Narrower range in the wider type, for clamping before we cast
    const T min = static_cast<T>(IsSigned<T>() ? LimitsMin<ToT>() : ToT{0});
    const T max = LimitsMax<ToT>();
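    // Illustrative example: when demoting int16_t to uint8_t, an input of
    // 300 is expected to saturate to 255 and -5 to 0, which is exactly the
    // clamp applied to `expected` below.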

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }
      const auto in = Load(from_d, from.get());
      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
    }

    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(ToT)>(&bits, &expected[i]);  // not same size

        if (!IsSigned<T>() && IsSigned<ToT>()) {
          expected[i] &= static_cast<ToT>(max);
        }

        from[i] = static_cast<T>(expected[i]);
      }

      const auto in = Load(from_d, from.get());
      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToInt() {
  const ForDemoteVectors<TestDemoteTo<uint8_t>> from_i16_to_u8;
  from_i16_to_u8(int16_t());
  from_i16_to_u8(uint16_t());

  const ForDemoteVectors<TestDemoteTo<int8_t>> from_i16_to_i8;
  from_i16_to_i8(int16_t());
  from_i16_to_i8(uint16_t());

  const ForDemoteVectors<TestDemoteTo<uint8_t>, 2> from_i32_to_u8;
  from_i32_to_u8(int32_t());
  from_i32_to_u8(uint32_t());

  const ForDemoteVectors<TestDemoteTo<int8_t>, 2> from_i32_to_i8;
  from_i32_to_i8(int32_t());
  from_i32_to_i8(uint32_t());

#if HWY_HAVE_INTEGER64
  const ForDemoteVectors<TestDemoteTo<uint8_t>, 3> from_i64_to_u8;
  from_i64_to_u8(int64_t());
  from_i64_to_u8(uint64_t());

  const ForDemoteVectors<TestDemoteTo<int8_t>, 3> from_i64_to_i8;
  from_i64_to_i8(int64_t());
  from_i64_to_i8(uint64_t());
#endif

  const ForDemoteVectors<TestDemoteTo<uint16_t>> from_i32_to_u16;
  from_i32_to_u16(int32_t());
  from_i32_to_u16(uint32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>> from_i32_to_i16;
  from_i32_to_i16(int32_t());
  from_i32_to_i16(uint32_t());

#if HWY_HAVE_INTEGER64
  const ForDemoteVectors<TestDemoteTo<uint16_t>, 2> from_i64_to_u16;
  from_i64_to_u16(int64_t());
  from_i64_to_u16(uint64_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>, 2> from_i64_to_i16;
  from_i64_to_i16(int64_t());
  from_i64_to_i16(uint64_t());

  const ForDemoteVectors<TestDemoteTo<uint32_t>> from_i64_to_u32;
  from_i64_to_u32(int64_t());
  from_i64_to_u32(uint64_t());

  const ForDemoteVectors<TestDemoteTo<int32_t>> from_i64_to_i32;
  from_i64_to_i32(int64_t());
  from_i64_to_i32(uint64_t());
#endif
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

template <typename ToT>
struct TestDemoteToFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // For floats, we clamp differently and cannot call LimitsMin.
    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);
    HWY_ASSERT(from && expected);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
        } while (!IsFiniteT(from[i]));
        const T magn = std::abs(from[i]);
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
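        // For example (illustrative): a double such as 1e300 exceeds the
        // largest finite float, so the expected value here is
        // HighestValue<float>() rather than +inf, with the sign preserved by
        // copysign above.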
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
  to_float(double());
#endif
}

struct TestDemoteToBF16 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // The expected bf16 value is the upper 16 bits of the binary32
    // representation; the check below also tolerates round-to-nearest.
    static_assert(IsSame<T, float>(),
                  "TestDemoteToBF16 can only be called if T is float");
    const Rebind<bfloat16_t, D> to_d;
    const Rebind<uint32_t, D> du32;
    const Rebind<uint16_t, D> du16;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<bfloat16_t>(N);
    HWY_ASSERT(from && expected);

    const auto u16_zero_vect = Zero(du16);
    const auto u16_one_vect = Set(du16, 1);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
        } while (!IsFiniteT(from[i]));

        uint32_t fromBits;
        CopyBytes<sizeof(uint32_t)>(&from[i], &fromBits);

        uint16_t bf16Bits = static_cast<uint16_t>(fromBits >> 16);
        CopyBytes<sizeof(uint16_t)>(&bf16Bits, &expected[i]);
      }

      const auto in = Load(from_d, from.get());
      const auto actual = DemoteTo(to_d, in);

      // Adjust expected to account for any possible rounding that was
      // carried out by the DemoteTo operation.
      auto expected_vect = BitCast(du16, Load(to_d, expected.get()));

      const auto low_f32_bits = TruncateTo(du16, BitCast(du32, in));

      // max_diff_from_expected is equal to (low_f32_bits == 0 ? 0 : 1).
      const auto max_diff_from_expected =
          Add(VecFromMask(du16, Eq(low_f32_bits, u16_zero_vect)), u16_one_vect);

      // expected_adj is equal to (actual_bits - expected_bits == 1 &&
      // max_diff_from_expected != 0) ? 1 : 0, where actual_bits is the bits
      // of actual and expected_bits is the bits of expected.
      auto expected_adj =
          And(max_diff_from_expected,
              VecFromMask(du16, Eq(Sub(BitCast(du16, actual), expected_vect),
                                   u16_one_vect)));

      // Increment expected_vect by expected_adj.
      expected_vect = Add(expected_vect, expected_adj);
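
      // Worked example (illustrative): f32 bits 0x3FC08001 truncate to the
      // expected bf16 bits 0x3FC0, but a round-to-nearest DemoteTo may
      // return 0x3FC1; because the discarded low 16 bits (0x8001) are
      // nonzero, the adjustment above accepts either result.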

      // Store the adjusted expected_vect back into expected.
      Store(BitCast(to_d, expected_vect), to_d, expected.get());

      HWY_ASSERT_VEC_EQ(to_d, expected.get(), actual);
    }
  }
};

HWY_NOINLINE void TestAllDemoteToBF16() {
  const ForDemoteVectors<TestDemoteToBF16, 1> to_bf16;
  to_bf16(float());
}

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
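  // Example (illustrative): the 26 cases above with N = 4 lanes round up to
  // padded = 32, i.e. four iterations of two full vectors each.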
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  HWY_ASSERT(in && expected);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);
    HWY_ASSERT(temp16 && expected && actual);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected = ReduceSum(d32, Add(f0, f1));
      const auto sum_actual = ReduceSum(d32, Add(promoted0, promoted1));

      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_actual <= sum_expected + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
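      // (ReorderDemote2To may interleave the lanes of f0 and f1 in an
      // implementation-defined order, so we compare sorted values rather
      // than exact lane positions.)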
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else  // HWY_SCALAR
    (void)d32;
#endif
  }
};

class TestIntegerReorderDemote2To {
#if HWY_TARGET != HWY_SCALAR

 private:
  // In-place N^2 selection sort to avoid dependencies
  template <class T>
  static void Sort(T* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const T tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

  template <class T, class D, class DN>
  static void DoIntegerReorderDemote2ToTest(DN dn, T /* t */, D d) {
    using TN = TFromD<DN>;

    const size_t N = Lanes(d);
    const size_t twiceN = N * 2;
    auto from = AllocateAligned<T>(twiceN);
    auto expected = AllocateAligned<TN>(twiceN);
    auto actual = AllocateAligned<TN>(twiceN);
    HWY_ASSERT(from && expected && actual);

    // Narrower range in the wider type, for clamping before we cast
    const T min = static_cast<T>(IsSigned<T>() ? LimitsMin<TN>() : TN{0});
    const T max = LimitsMax<TN>();

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < twiceN; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
        expected[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      const auto in_1 = Load(d, from.get());
      const auto in_2 = Load(d, from.get() + N);
      const auto demoted_vect = ReorderDemote2To(dn, in_1, in_2);
      Store(demoted_vect, dn, actual.get());
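      // As above, the lane order produced by ReorderDemote2To is
      // implementation-defined, hence sort both buffers before comparing.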
      Sort(actual.get(), twiceN);
      Sort(expected.get(), twiceN);
      HWY_ASSERT_VEC_EQ(dn, expected.get(), Load(dn, actual.get()));
    }

    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < twiceN; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(TN)>(&bits, &expected[i]);  // not same size
        if (!IsSigned<T>() && IsSigned<TN>()) {
          expected[i] &= static_cast<TN>(max);
        }

        from[i] = static_cast<T>(expected[i]);
      }

      const auto in_1 = Load(d, from.get());
      const auto in_2 = Load(d, from.get() + N);
      const auto demoted_vect = ReorderDemote2To(dn, in_1, in_2);
      Store(demoted_vect, dn, actual.get());
      Sort(actual.get(), twiceN);
      Sort(expected.get(), twiceN);
      HWY_ASSERT_VEC_EQ(dn, expected.get(), Load(dn, actual.get()));
    }
  }
#endif

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*t*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const RepartitionToNarrow<D> dn;
    const RebindToSigned<decltype(dn)> dn_i;
    const RebindToUnsigned<decltype(dn)> dn_u;

    DoIntegerReorderDemote2ToTest(dn_i, T(), d);
    DoIntegerReorderDemote2ToTest(dn_u, T(), d);
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllReorderDemote2To() {
  ForUI163264(ForShrinkableVectors<TestIntegerReorderDemote2To>());
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestFloatOrderedDemote2To {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*t*/, DF df) {
#if HWY_TARGET != HWY_SCALAR
    const Repartition<bfloat16_t, decltype(df)> dbf16;
    const RebindToUnsigned<decltype(dbf16)> du16;
    const RebindToUnsigned<decltype(df)> du32;
    const Half<decltype(du16)> du16_half;
    const size_t N = Lanes(df);
    const size_t twiceN = N * 2;
    auto from = AllocateAligned<TF>(twiceN);
    auto expected = AllocateAligned<bfloat16_t>(twiceN);
    HWY_ASSERT(from && expected);

    const auto u16_zero_vect = Zero(du16);
    const auto u16_one_vect = Set(du16, 1);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < twiceN; ++i) {
        do {
          const uint64_t bits = rng();
          CopyBytes<sizeof(TF)>(&bits, &from[i]);  // not same size
        } while (!IsFiniteT(from[i]));

        uint32_t u32Bits;
        CopyBytes<sizeof(uint32_t)>(&from[i], &u32Bits);

        const uint16_t expected_bf16_bits =
            static_cast<uint16_t>(u32Bits >> 16);

        CopyBytes<sizeof(bfloat16_t)>(&expected_bf16_bits, &expected[i]);
      }

      const auto in_1 = Load(df, from.get());
      const auto in_2 = Load(df, from.get() + N);
      const auto actual = OrderedDemote2To(dbf16, in_1, in_2);

      // Adjust expected to account for any possible rounding that was
      // carried out by the OrderedDemote2To operation.
      auto expected_vect = BitCast(du16, Load(dbf16, expected.get()));

      const auto low_f32_bits =
          Combine(du16, TruncateTo(du16_half, BitCast(du32, in_2)),
                  TruncateTo(du16_half, BitCast(du32, in_1)));
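      // low_f32_bits holds the 16 bits discarded from each f32 lane, laid
      // out in the same ordered fashion as the OrderedDemote2To result:
      // in_1's lanes in the lower half, in_2's in the upper half.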
      // max_diff_from_expected is equal to (low_f32_bits == 0 ? 0 : 1).
      const auto max_diff_from_expected =
          Add(VecFromMask(du16, Eq(low_f32_bits, u16_zero_vect)), u16_one_vect);

      // expected_adj is equal to (actual_bits - expected_bits == 1 &&
      // max_diff_from_expected != 0) ? 1 : 0, where actual_bits is the bits
      // of actual and expected_bits is the bits of expected.
      auto expected_adj =
          And(max_diff_from_expected,
              VecFromMask(du16, Eq(Sub(BitCast(du16, actual), expected_vect),
                                   u16_one_vect)));

      // Increment expected_vect by expected_adj.
      expected_vect = Add(expected_vect, expected_adj);

      // Store the adjusted expected_vect back into expected.
      Store(BitCast(dbf16, expected_vect), dbf16, expected.get());

      HWY_ASSERT_VEC_EQ(dbf16, expected.get(), actual);
    }
#else
    (void)df;
#endif
  }
};

class TestIntegerOrderedDemote2To {
#if HWY_TARGET != HWY_SCALAR

 private:
  template <class T, class D, class DN>
  static void DoIntegerOrderedDemote2ToTest(DN dn, T /*t*/, D d) {
    using TN = TFromD<DN>;

    const size_t N = Lanes(d);
    const size_t twiceN = N * 2;
    auto from = AllocateAligned<T>(twiceN);
    auto expected = AllocateAligned<TN>(twiceN);
    HWY_ASSERT(from && expected);

    // Narrower range in the wider type, for clamping before we cast
    const T min = static_cast<T>(IsSigned<T>() ? LimitsMin<TN>() : TN{0});
    const T max = LimitsMax<TN>();

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < twiceN; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
        expected[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      const auto in_1 = Load(d, from.get());
      const auto in_2 = Load(d, from.get() + N);
      const auto actual = OrderedDemote2To(dn, in_1, in_2);
      HWY_ASSERT_VEC_EQ(dn, expected.get(), actual);
    }

    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < twiceN; ++i) {
        const uint64_t bits = rng();
        CopyBytes<sizeof(TN)>(&bits, &expected[i]);  // not same size
        if (!IsSigned<T>() && IsSigned<TN>()) {
          expected[i] &= static_cast<TN>(max);
        }

        from[i] = static_cast<T>(expected[i]);
      }

      const auto in_1 = Load(d, from.get());
      const auto in_2 = Load(d, from.get() + N);
      const auto actual = OrderedDemote2To(dn, in_1, in_2);
      HWY_ASSERT_VEC_EQ(dn, expected.get(), actual);
    }
  }
#endif

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*t*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const RepartitionToNarrow<D> dn;
    const RebindToSigned<decltype(dn)> dn_i;
    const RebindToUnsigned<decltype(dn)> dn_u;

    DoIntegerOrderedDemote2ToTest(dn_i, T(), d);
    DoIntegerOrderedDemote2ToTest(dn_u, T(), d);
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllOrderedDemote2To() {
  ForUI163264(ForShrinkableVectors<TestIntegerOrderedDemote2To>());
  ForShrinkableVectors<TestFloatOrderedDemote2To>()(float());
}

struct TestI32F64 {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    using TI = int32_t;
    const Rebind<TI, DF> di;
    const size_t N = Lanes(df);

    // Integer positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));

    // Integer negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));

    // Above positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));

    // Below positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
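
    // The fractional cases above and below check that the conversion
    // truncates toward zero: 2.001 -> 2 and 3.9999 -> 3, as do the negative
    // cases that follow (-(N + 1) + eps -> -N, -(N + 1) - eps -> -(N + 1)).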

    const TF eps = static_cast<TF>(0.0001);
    // Above negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));

    // Below negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));

    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // !HWY_IS_MSAN

#if HWY_ONCE

namespace hwy {
#if !HWY_IS_MSAN
HWY_BEFORE_TEST(HwyDemoteTest);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToBF16);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllOrderedDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
#endif  // !HWY_IS_MSAN
}  // namespace hwy

#endif