// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors for VSX
// External include guard in highway.h - see comment there.

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <string.h>  // memcpy

#include "hwy/ops/shared-inl.h"

// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__.
// This means we can only use POWER10-specific intrinsics in static dispatch
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
// On other compilers, the usual target check is sufficient.
#if HWY_TARGET <= HWY_PPC9 && \
    (!HWY_COMPILER_CLANG || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if HWY_TARGET <= HWY_PPC10 && \
    (!HWY_COMPILER_CLANG || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

template <typename T>
struct Raw128;

// Each Raw128 specialization defines the following typedefs:
// - type:
//   the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
//   the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
//   the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
//   the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
//   the 128-bit GCC/Clang vector type for unaligned loads/stores
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };

HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)

template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};

template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};

#undef HWY_VSX_RAW128
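
// Example (illustrative): the HWY_VSX_RAW128(uint32_t, unsigned int, int)
// invocation above expands to a specialization roughly equivalent to
//   struct Raw128<uint32_t> {
//     using type = __vector unsigned int;      // backs Vec128<uint32_t, N>
//     using RawBoolVec = __vector __bool int;  // backs Mask128<uint32_t, N>
//     using RawT = unsigned int;               // lane type for vec_splats
//     // ...plus the aligned/unaligned GCC vector typedefs for loads/stores.
//   };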

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::RawBoolVec raw;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
  // argument in order to select the correct overload. We instead cast the
  // return vector type; see also the comment in BitCast.
  return Vec128<T, HWY_MAX_LANES_D(D)>{
      reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ BitCast

template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D /*d*/,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
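
// Example (illustrative): BitCast reinterprets lane bits without conversion,
// e.g. viewing float lanes as their IEEE-754 bit patterns:
//   const Full128<float> df;
//   const RebindToUnsigned<decltype(df)> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each u32 lane = 0x3F800000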

// ------------------------------ ResizeBitCast

template <class D, typename FromV>
HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
}

// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
#if HWY_COMPILER_GCC_ACTUAL
  // Suppressing maybe-uninitialized both here and at the caller does not work,
  // so initialize.
  return Zero(d);
#else
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
  typename detail::Raw128<TFromD<D>>::type raw;
  return VFromD<decltype(d)>{raw};
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ GetLane

// Gets the single value stored in a vector/part.

template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  return static_cast<T>(v.raw[0]);
}

// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
}

// ------------------------------ IsConstantRawAltivecVect
namespace detail {

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
  return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
}

}  // namespace detail

// ------------------------------ TernaryLogic
#if HWY_PPC_HAVE_10
namespace detail {

// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
// order of the kTernLogOp bits of AVX3
// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
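// Example (illustrative): AVX3 encodes a three-way XOR as
// _mm_ternarylogic_epi64(a, b, c, 0x96); reversing the bits of 0x96
// (0b10010110 -> 0b01101001) gives the 0x69 constant that Xor3 below passes
// to this helper.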
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;

#if HWY_COMPILER_GCC_ACTUAL
  // Use inline assembly on GCC to work around GCC compiler bug
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw), "n"(kTernLogOp)
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif

  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_10

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
      2) {
    return Xor(x1, Xor(x2, x3));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  }
#else
  return Xor(x1, Xor(x2, x3));
#endif
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
      2) {
    return Or(o1, Or(o2, o3));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  }
#else
  return Or(o1, Or(o2, o3));
#endif
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  if (detail::IsConstantRawAltivecVect(a1.raw) &&
      detail::IsConstantRawAltivecVect(a2.raw)) {
    return Or(o, And(a1, a2));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  }
#else
  return Or(o, And(a1, a2));
#endif
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
                                      BitCast(du, mask).raw)});
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}

// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE Vec128<T, N> Neg(Vec128<T, N> v) {
  return Vec128<T, N>{vec_neg(v.raw)};
}

// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <class T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  return Vec128<T, N>{vec_abs(v.raw)};
}

// ------------------------------ CopySign

template <size_t N>
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
                                  Vec128<float, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
  return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
  return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
                                   Vec128<double, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
  return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  // PPC8 can also handle abs < 0, so no extra action needed.
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return CopySign(abs, sign);
}

// ================================================== MEMORY (1)

// Note: type punning is safe because the types are tagged with may_alias.
// (https://godbolt.org/z/fqrWjfjsP)

// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
  const LoadRaw* HWY_RESTRICT p = reinterpret_cast<const LoadRaw*>(aligned);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
}

// Any <= 64 bit
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
  using BitsT = UnsignedFromSize<d.MaxBytes()>;

  BitsT bits;
  const Repartition<BitsT, decltype(d)> d_bits;
  CopyBytes<d.MaxBytes()>(p, &bits);
  return BitCast(d, Set(d_bits, bits));
}

// ================================================== MASK

// ------------------------------ Mask

// Mask and Vec are both backed by vector types (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  using Raw = typename detail::Raw128<T>::RawBoolVec;
  return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) {
  return Vec128<T, N>{
      reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
}

template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{vec_sel(
                        BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)});
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)});
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  return Mask128<T, N>{vec_nor(m.raw, m.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
}

// ------------------------------ BroadcastSignBit

template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{
      vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))};
}

template <size_t N>
HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{
      vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))};
}

template <size_t N>
HWY_API Vec128<int32_t, N> BroadcastSignBit(Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))};
}

template <size_t N>
HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))};
}

// ------------------------------ ShiftLeftSame

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sl(v.raw, vec_splats(static_cast<TU>(bits)))};
}

// ------------------------------ ShiftRightSame

template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
}

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
}

// ------------------------------ ShiftLeft

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftLeftSame(v, kBits);
}

// ------------------------------ ShiftRight

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftRightSame(v, kBits);
}

// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
  return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
      vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
}

// ------------------------------ TableLookupBytesOr0
// For all vector widths; unlike x86, Altivec/VSX vec_perm does not zero out
// lanes whose index has the MSB set, so that is done explicitly here.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> di;
  Repartition<int8_t, decltype(di)> di8;
  const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
  return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
}

// ------------------------------ Reverse
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
  return Vec128<T>{vec_reve(v.raw)};
}

// ------------------------------ Shuffles (Reverse)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
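// Example (illustrative): with lanes denoted {3,2,1,0} as above,
// Shuffle0321 yields {0,3,2,1}, Shuffle2103 yields {2,1,0,3} and
// Shuffle2301 yields {2,3,0,1}.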

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0, 1, 2,  3,
                                           12, 13, 14, 15, 8, 9, 10, 11};
  return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
}

// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {0, 3, 18, 17};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {2, 1, 16, 19};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

}  // namespace detail

// Swap 64-bit halves
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  const Full128<T> d;
  const Full128<uint64_t> du64;
  return BitCast(d, Reverse(du64, BitCast(du64, v)));
}
template <class T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}

// Rotate right 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#endif
}
// Rotate left 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#endif
}

template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <class DTo, typename TFrom, size_t NFrom>
HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
                                        Vec128<uint64_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
                                       Vec128<int64_t, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Not(a == b);
}

// ------------------------------ Strict inequality

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
}

// ------------------------------ Weak inequality

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return Not(b > a);
}

// ------------------------------ Reversed comparisons

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  return b > a;
}

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  return b >= a;
}

// ================================================== MEMORY (2)

// ------------------------------ Load
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
  using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
  const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
}

// For < 128 bit, LoadU == Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
  return LoadU(d, p);
}

// Returns a vector with lane i=[0, N) set to "first" + i.
namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned char kU8Iota0 = {0, 1, 2,  3,  4,  5,  6,  7,
                                               8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
}

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
}

template <class D, HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
}

template <class D, HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned long long kU64Iota0 = {0, 1};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
}

template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
  return VFromD<D>{kF32Iota0};
}

template <class D, HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector double kF64Iota0 = {0.0, 1.0};
  return VFromD<D>{kF64Iota0};
}

}  // namespace detail

template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
}
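
// Example (illustrative): for a 4-lane i32 vector, Iota(d, 10) yields the
// lanes {10, 11, 12, 13}, with lane 0 holding 10.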

// ------------------------------ FirstN (Iota, Lt)

template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num)));
}
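
// Example (illustrative): FirstN(d, 2) on a 4-lane vector returns a mask that
// is true for lanes 0 and 1 and false for lanes 2 and 3.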

// ------------------------------ MaskedLoad
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ MaskedLoadOr
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

// ------------------------------ Store

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
  using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
  *reinterpret_cast<StoreRaw*>(aligned) = reinterpret_cast<StoreRaw>(v.raw);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
  using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
  *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  using BitsT = UnsignedFromSize<d.MaxBytes()>;

  const Repartition<BitsT, decltype(d)> d_bits;
  const BitsT bits = GetLane(BitCast(d_bits, v));
  CopyBytes<d.MaxBytes()>(&bits, p);
}

// For < 128 bit, StoreU == Store.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ------------------------------ BlendedStore

template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  const RebindToSigned<decltype(d)> di;  // for testing mask if T=bfloat16_t.
  using TI = TFromD<decltype(di)>;
  alignas(16) TI buf[MaxLanes(d)];
  alignas(16) TI mask[MaxLanes(d)];
  Store(BitCast(di, v), di, buf);
  Store(BitCast(di, VecFromMask(d, m)), di, mask);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask[i]) {
      CopySameSize(buf + i, p + i);
    }
  }
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_sub(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
namespace detail {

// Casts nominally uint32_t result to D.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
                                     __vector unsigned int b) {
  const Repartition<uint32_t, D> du32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    const uint64_t sum0 =
        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
        static_cast<uint64_t>(b[0]);
    const uint64_t sum1 =
        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
        static_cast<uint64_t>(b[1]);
    const uint64_t sum2 =
        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
        static_cast<uint64_t>(b[2]);
    const uint64_t sum3 =
        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
        static_cast<uint64_t>(b[3]);
    return BitCast(
        d,
        VFromD<decltype(du32)>{(__vector unsigned int){
            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
                                                          : 0xFFFFFFFFu)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
  }
}

// Casts nominally int32_t result to D.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  const Repartition<uint64_t, D> du64;
  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
      __builtin_constant_p(b[kDestLaneOffset + 2])) {
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[kDestLaneOffset]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
                          (sign0 == (sum0 >> 31))
                              ? static_cast<uint32_t>(sum0)
                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
                          (sign1 == (sum1 >> 31))
                              ? static_cast<uint32_t>(sum1)
                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsum2sws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(Vec128<uint8_t, N> v) {
  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
  const Repartition<int32_t, decltype(du64)> di32;
  const RebindToUnsigned<decltype(di32)> du32;

  return detail::AltivecVsum2sws(
      du64, detail::AltivecVsum4ubs(di32, v.raw, Zero(du32).raw).raw,
      Zero(di32).raw);
}
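
// Example (illustrative): SumsOf8 sums each group of 8 consecutive u8 lanes
// into one u64 lane, so a full vector whose 16 byte lanes are all 1 yields
// the two u64 lanes {8, 8}.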

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}

#if HWY_PPC_HAVE_10

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask =
      MaskFromVec(BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum)));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfThenElse(overflow_mask, overflow_result, sum);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}

#if HWY_PPC_HAVE_10

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask =
      MaskFromVec(BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff)));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfThenElse(overflow_mask, overflow_result, diff);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ AverageRound

// Returns (a + b + 1) / 2

template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_avg(a.raw, b.raw)};
}

// ------------------------------ Multiplication

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{a.raw * b.raw};
}

// Returns the upper 16 bits of a * b in each lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
  const VFromD<decltype(dw)> p2{vec_mulo(a.raw, b.raw)};
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kShuffle = {2,  3,  18, 19, 6,  7,  22, 23,
                                           10, 11, 26, 27, 14, 15, 30, 31};
#else
  const __vector unsigned char kShuffle = {0, 1, 16, 17, 4,  5,  20, 21,
                                           8, 9, 24, 25, 12, 13, 28, 29};
#endif
  return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
}

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
}

// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4), HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
                                                 Vec128<T, N> b) {
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
}

// ------------------------------ RotateRight
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Vec128<T, N>{vec_rl(v.raw, Set(d, kSizeInBits - kBits).raw)};
}
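
// Example (illustrative): for u32 lanes, RotateRight<8>(Set(d, 0x12345678u))
// yields 0x78123456 in every lane (a right rotation by 8 bits, implemented
// above as a left rotation by 32 - 8 = 24 bits).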

// ------------------------------ ZeroIfNegative (BroadcastSignBit)
template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only works for float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(mask, Zero(d), v);
}

// ------------------------------ IfNegativeThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");

  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))),
                    yes, no);
}

// Absolute value of difference.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, Vec128<float, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
}

// Returns add - mul * x
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  // NOTE: the vec_nmsub operation below computes -(mul * x - add),
  // which is equivalent to add - mul * x in the round-to-nearest
  // and round-towards-zero rounding modes
  return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
}

// Returns mul * x - sub
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
}

// Returns -mul * x - sub
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  // NOTE: The vec_nmadd operation below computes -(mul * x + sub),
  // which is equivalent to -mul * x - sub in the round-to-nearest
  // and round-towards-zero rounding modes
  return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
}
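
// Example (illustrative): with per-lane mul=2, x=3 and add/sub=10, the four
// variants above return MulAdd=16, NegMulAdd=4, MulSub=-4 and NegMulSub=-16.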

// ------------------------------ Floating-point div
// Approximate reciprocal
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
  return Vec128<float, N>{vec_re(v.raw)};
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_div(a.raw, b.raw)};
}

// ------------------------------ Floating-point square root

// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  return Vec128<float, N>{vec_rsqrte(v.raw)};
}

// Full precision square root
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_sqrt(v.raw)};
}

// ------------------------------ Min (Gt, IfThenElse)

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_min(a.raw, b.raw)};
}

// ------------------------------ Max (Gt, IfThenElse)

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_max(a.raw, b.raw)};
}

// ------------------------------- Integer AbsDiff for PPC9/PPC10

#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API V AbsDiff(const V a, const V b) {
  return V{vec_absd(a.raw, b.raw)};
}

template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}

#endif  // HWY_PPC_HAVE_9

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal stores

template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  __builtin_prefetch(aligned, 1, 0);
  Store(v, d, aligned);
}

// ------------------------------ Scatter

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base, VI offset) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  alignas(16) T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  alignas(16) TI offset_lanes[MaxLanes(d)];
  Store(offset, Rebind<TI, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base, VI index) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  alignas(16) T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  alignas(16) TI index_lanes[MaxLanes(d)];
  Store(index, Rebind<TI, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

// ------------------------------ Gather (Load/Store)

template <class D, typename T = TFromD<D>, class VI>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  alignas(16) TI offset_lanes[MaxLanes(d)];
  Store(offset, Rebind<TI, decltype(d)>(), offset_lanes);

  alignas(16) T lanes[MaxLanes(d)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>, class VI>
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) {
  using TI = TFromV<VI>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  alignas(16) TI index_lanes[MaxLanes(d)];
  Store(index, Rebind<TI, decltype(d)>(), index_lanes);

  alignas(16) T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

// ================================================== SWIZZLE (2)

// ------------------------------ LowerHalf

// Returns upper/lower half of a vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return VFromD<D>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}

// ------------------------------ ShiftLeftBytes

// NOTE: The ShiftLeftBytes operation moves the elements of v to the right
// by kBytes bytes and zeroes out the first kBytes bytes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both
// little-endian and big-endian targets)
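//
// Example (illustrative): for u8 lanes {1, 2, ..., 16} (lane 0 = 1),
// ShiftLeftBytes<1> yields {0, 1, 2, ..., 15}, i.e. lane 0 becomes zero and
// every other byte moves up by one lane index, regardless of endianness.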
|
|
|
|
template <int kBytes, class D>
|
|
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
if (kBytes == 0) return v;
|
|
const auto zeros = Zero(d);
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
|
|
#else
|
|
return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
|
|
#endif
|
|
}
|
|
|
|
template <int kBytes, typename T, size_t N>
|
|
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
|
|
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
}
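
// Example (illustrative): with d = Full128<uint8_t>() and v = Iota(d, 0) =
// {0, 1, ..., 15}, ShiftLeftBytes<2>(d, v) yields {0, 0, 0, 1, ..., 13}:
// bytes move to higher lane indices and the two lowest bytes become zero,
// independent of the target's endianness.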

// ------------------------------ ShiftLeftLanes

// NOTE: The ShiftLeftLanes operation moves the elements of v to the right
// by kLanes lanes and zeroes out the first kLanes lanes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both
// little-endian and big-endian targets)

template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes

// NOTE: The ShiftRightBytes operation moves the elements of v to the left
// by kBytes bytes and zeroes out the last kBytes bytes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftRightBytes operation on both
// little-endian and big-endian targets)

template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;

  // For partial vectors, clear upper lanes so we shift in zeros.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }

  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#else
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#endif
}

// ------------------------------ ShiftRightLanes

// NOTE: The ShiftRightLanes operation moves the elements of v to the left
// by kLanes lanes and zeroes out the last kLanes lanes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftRightLanes operation on both
// little-endian and big-endian targets)

template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}
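
// Example (illustrative): with d = Full128<uint8_t>() and v = Iota(d, 0),
// ShiftRightBytes<2>(d, v) yields {2, 3, ..., 15, 0, 0}; likewise,
// ShiftRightLanes<1> applied to a u32 vector {a, b, c, d} yields {b, c, d, 0}.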

// ------------------------------ UpperHalf (ShiftRightBytes)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}

// ------------------------------ ExtractLane (UpperHalf)

template <typename T, size_t N>
HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
  return static_cast<T>(v.raw[i]);
}

// ------------------------------ InsertLane (UpperHalf)

template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
  typename detail::Raw128<T>::type raw_result = v.raw;
  raw_result[i] = t;
  return Vec128<T, N>{raw_result};
}

// ------------------------------ CombineShiftRightBytes

// NOTE: The CombineShiftRightBytes operation below moves the elements of lo to
// the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes()
// - kBytes) bytes on both little-endian and big-endian PPC targets.

template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  constexpr size_t kSize = 16;
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
#else
  return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
#endif
}
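
// Example (illustrative): for full u8 vectors lo = {0..15} and hi = {16..31},
// CombineShiftRightBytes<4>(d, hi, lo) extracts bytes 4..19 of the 32-byte
// concatenation, i.e. {4, 5, ..., 15, 16, 17, 18, 19}.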

template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move into most-significant bytes
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
  return VFromD<D>{BitCast(dfull, r).raw};
}

// ------------------------------ Broadcast/splat any lane

template <int kLane, typename T, size_t N>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{vec_splat(v.raw, kLane)};
}

// ------------------------------ TableLookupLanes (Shuffle01)

// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __vector unsigned char raw;
};

namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1};
  return VFromD<decltype(d8)>{kByteOffsets};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
                                                   0, 1, 2, 3, 0, 1, 2, 3};
  return VFromD<decltype(d8)>{kByteOffsets};
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
                                                   0, 1, 2, 3, 4, 5, 6, 7};
  return VFromD<decltype(d8)>{kByteOffsets};
}

}  // namespace detail

template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
}

template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
}

template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
}
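
// Usage sketch (illustrative): for d = Full128<uint32_t>(), v = {10, 11, 12,
// 13} and lane indices idx[] = {3, 2, 1, 0}, TableLookupLanes(v,
// SetTableIndices(d, idx)) returns {13, 12, 11, 10}. The Indices128 built
// above already hold the per-byte vec_perm/TableLookupBytes indices derived
// from the lane indices.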

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> /* idx */) {
  return v;
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  const Repartition<uint8_t, decltype(dt)> dt_u8;
  // TableLookupLanes currently requires table and index vectors to be the same
  // size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(
      d, TableLookupBytes(Combine(dt, b, a),
                          BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
}

template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
  return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <class D>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301)

// Single lane: no change
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}

// 32-bit x2: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
}

// 16-bit x4: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
}

// 16-bit x2: rotate bytes
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}

// ------------------------------- ReverseLaneBytes

#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)

// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V ReverseLaneBytes(V v) {
  return V{vec_revb(v.raw)};
}

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, ReverseLaneBytes(BitCast(du16, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, ReverseLaneBytes(BitCast(du32, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
}

#endif  // HWY_PPC_HAVE_9

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
  return Reverse2(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  return Reverse4(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> Reverse(D d, Vec64<T> v) {
  return Reverse8(d, v);
}

// ------------------------------ Reverse2

// Single lane: no change
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) {
  return v;
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) {
  const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes
}

// ------------------------------ Reverse8

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).

template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};
}

// Additional overload for the optional tag
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// Full
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
  return Vec128<T>{vec_mergel(a.raw, b.raw)};
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
                         VFromD<D>{UpperHalf(d2, b).raw});
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}
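
// Example (illustrative): for full u32 vectors a = {a0, a1, a2, a3} and
// b = {b0, b1, b2, b3}, InterleaveLower(a, b) = {a0, b0, a1, b1} and
// InterleaveUpper(d, a, b) = {a2, b2, a3, b3}. ZipLower/ZipUpper return the
// same bits, merely viewed as double-width (here u64) lanes.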

// ================================================== COMBINE

// ------------------------------ Combine (InterleaveLower)

// N = N/2 + N/2 (upper half undefined)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
  using Raw = typename detail::Raw128<TFromV<VU>>::type;
  const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
  const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
  return BitCast(d, InterleaveLower(lo, hi));
}

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
}

// ------------------------------ Concat full (InterleaveLower)

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}

// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
}
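
// Example (illustrative): for u64 vectors hi = {h0, h1} and lo = {l0, l1}
// (lane 0 is the lower half), ConcatLowerLower(d, hi, lo) = {l0, h0},
// ConcatUpperUpper = {l1, h1}, ConcatLowerUpper = {l1, h0} and
// ConcatUpperLower = {l0, h1}.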

// ------------------------------ Concat partial (Combine, LowerHalf)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
}

// ------------------------------ TruncateTo

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
          HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
#else
  return VFromD<D>{reinterpret_cast<Raw>(
      vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
#endif
}

namespace detail {

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Truncate2To(
    D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
    Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
  return VFromD<D>{vec_pack(lo.raw, hi.raw)};
}

}  // namespace detail

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* d */,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_pack(v.raw, v.raw)};
}

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D d,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
  return TruncateTo(d, TruncateTo(d2, v));
}
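
// Example (illustrative): TruncateTo keeps the low bits of each lane, e.g.
// truncating a u32 lane holding 0x11223344 to u16 yields 0x3344; the
// 4x-narrowing overload above simply applies the 2x truncation twice.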

// ------------------------------ ConcatOdd (TruncateTo)

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#else
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}

// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#else
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
}

// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  (void)d;
  const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15,
                                           20, 21, 22, 23, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#else
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint64_t, decltype(d)> dw;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#endif
}

// Any type x2
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (TruncateTo)

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#else
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}

// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  // Isolate lower 16 bits per u32 so we can pack.
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
}

// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  const Repartition<uint64_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#else
  (void)d;
  constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11,
                                               16, 17, 18, 19, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#endif
}

// Any T x2
template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}
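
// Example (illustrative): for full u32 vectors lo = {l0, l1, l2, l3} and
// hi = {h0, h1, h2, h3}, ConcatOdd(d, hi, lo) = {l1, l3, h1, h3} and
// ConcatEven(d, hi, lo) = {l0, l2, h0, h2}.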

// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd)
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
#if HWY_IS_LITTLE_ENDIAN
  return ConcatEven(d, BitCast(d, b), BitCast(d, a));
#else
  return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
#endif
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  return Vec128<T, N>{vec_mergee(v.raw, v.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  return InterleaveLower(DFromV<decltype(v)>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}

// ------------------------------ OddEven (IfThenElse)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
                                       0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  // Same as ConcatUpperLower for full vectors; do not call that because this
  // is more efficient for 64x1 vectors.
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
}
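
// Example (illustrative): for u32 v = {v0, v1, v2, v3}, DupEven(v) =
// {v0, v0, v2, v2} and DupOdd(v) = {v1, v1, v3, v3}. OddEven(a, b) takes the
// odd lanes from a and the even lanes from b, e.g. {b0, a1, b2, a3}.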

// ------------------------------ OddEvenBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks

template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ Shl

namespace detail {
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
}

// Signed left shift is the same as unsigned.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}

}  // namespace detail

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}

// ------------------------------ Shr

namespace detail {
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shr(hwy::TypeTag<T>(), v, bits);
}

// ------------------------------ MulEven/Odd 64x64 (UpperHalf)

HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using VU64 = __vector unsigned long long;
  const VU64 mul128_result = reinterpret_cast<VU64>(vec_mule(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  alignas(16) uint64_t mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
#endif
}

HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using VU64 = __vector unsigned long long;
  const VU64 mul128_result = reinterpret_cast<VU64>(vec_mulo(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  alignas(16) uint64_t mul[2];
  const Full64<uint64_t> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<uint64_t>(), mul);
#endif
}
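
// Example (illustrative): MulEven returns the full 128-bit product of the
// lane-0 u64 values of a and b as a {low, high} pair of u64 lanes, and MulOdd
// does the same for lane 1; e.g. the even product of {3, x} and {5, y} is
// returned as {15, 0}.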

// ------------------------------ WidenMulPairwiseAdd

template <class D32, HWY_IF_F32_D(D32),
          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
  const RebindToUnsigned<decltype(df32)> du32;
  // Lane order within sum0/1 is undefined, hence we can avoid the
  // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
  // leads to the odd/even order that RearrangeToOddPlusEven prefers.
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  return Mul(BitCast(df32, ae), BitCast(df32, be)) +
         Mul(BitCast(df32, ao), BitCast(df32, bo));
}
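
// Illustrative note: each f32 output lane is the pairwise dot product
// a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]; every bf16 input is widened exactly
// to f32 by placing its bits in the upper 16 bits of a zeroed 32-bit word.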

// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_I32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
  // Sum of products of adjacent i16 pairs, accumulated into a zero vector.
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(D32()).raw)};
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

template <class D32, HWY_IF_F32_D(D32),
          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
                                              VFromD<D32> sum0,
                                              VFromD<D32>& sum1) {
  const RebindToUnsigned<decltype(df32)> du32;
  // Lane order within sum0/1 is undefined, hence we can avoid the
  // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
  // leads to the odd/even order that RearrangeToOddPlusEven prefers.
  using VU32 = VFromD<decltype(du32)>;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
}

// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_I32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b,
                                              VFromD<D32> sum0,
                                              VFromD<D32>& /*sum1*/) {
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
}

// ------------------------------ RearrangeToOddPlusEven
template <size_t N>
HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
                                                  Vec128<int32_t, N> /*sum1*/) {
  return sum0;  // invariant already holds
}

template <class VW>
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);
}

// ================================================== CONVERT

// ------------------------------ Promotions (part w/ narrow lanes -> full)

// Unsigned to signed/unsigned: zero-extend.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  // First pretend the input has twice the lanes - the upper half will be
  // ignored by ZipLower.
  const Rebind<FromT, Twice<D>> d2;
  const VFromD<decltype(d2)> twice{v.raw};
  // Then cast to narrow as expected by ZipLower, in case the sign of FromT
  // differs from that of D.
  const RepartitionToNarrow<D> dn;

#if HWY_IS_LITTLE_ENDIAN
  return ZipLower(BitCast(dn, twice), Zero(dn));
#else
  return ZipLower(Zero(dn), BitCast(dn, twice));
#endif
}

// Signed: replicate sign bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
}

// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE(FromT, 1)>
HWY_API VFromD<D> PromoteTo(D d32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const DFromV<decltype(v)> d8;
  const Rebind<MakeWide<FromT>, decltype(d8)> d16;
  return PromoteTo(d32, PromoteTo(d16, v));
}

// 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then
// convert to 64-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
          HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d64,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeWide<FromT>, decltype(d64)> dw;
  return PromoteTo(d64, PromoteTo(dw, v));
}

// Workaround for origin tracking bug in Clang msan prior to 11.0
// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
#define HWY_INLINE_F16 HWY_NOINLINE
#else
#define HWY_INLINE_F16 HWY_INLINE
#endif
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE_F16 VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
#if HWY_PPC_HAVE_9
  (void)df32;
  return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
#else
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  // Expand to u32 so we can shift.
  const auto bits16 = PromoteTo(du32, VFromD<Rebind<uint16_t, D>>{v.raw});
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
                        Set(df32, 1.0f / 16384 / 1024));

  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
#endif
}
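
// Worked example (illustrative) for the scalar fallback above: the f16 bit
// pattern 0x3C00 has sign 0, biased exponent 15 and mantissa 0; re-biasing
// by 127 - 15 and shifting the mantissa left by 13 bits reconstructs
// 0x3F800000, i.e. 1.0f.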

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  const __vector float raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}

template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}

// ------------------------------ Demotions (full -> part w/ narrow lanes)

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packsu(v.raw, v.raw)};
}

template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
#if HWY_PPC_HAVE_9 && HWY_COMPILER_GCC_ACTUAL
  // Do not use vec_pack_to_short_fp32 on clang as there is a bug in the clang
  // version of vec_pack_to_short_fp32
  (void)df16;
  return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
#else
  const Rebind<uint32_t, decltype(df16)> du;
  const RebindToUnsigned<decltype(df16)> du16;
#if HWY_PPC_HAVE_9 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  // Work around bug in the clang implementation of vec_pack_to_short_fp32
  // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets
  // if the __builtin_vsx_xvcvsphp intrinsic is available
  const VFromD<decltype(du)> bits16{
      reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
#else
  const RebindToSigned<decltype(du)> di;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);

  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);

  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
                                     ShiftRight<13>(mantissa32));  // <1024

  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
  const auto bits16 = IfThenZeroElse(RebindMask(du, is_tiny), normal16);
#endif  // HWY_PPC_HAVE_9 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  return BitCast(df16, TruncateTo(du16, bits16));
#endif  // HWY_PPC_HAVE_9 && HWY_COMPILER_GCC_ACTUAL
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v));
  return BitCast(dbf16, TruncateTo(du16, bits_in_32));
}

template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Repartition<uint32_t, decltype(dbf16)> du32;
#if HWY_IS_LITTLE_ENDIAN
  const auto a_in_odd = a;
  const auto b_in_even = ShiftRight<16>(BitCast(du32, b));
#else
  const auto a_in_odd = ShiftRight<16>(BitCast(du32, a));
  const auto b_in_even = b;
#endif
  return BitCast(dbf16,
                 OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
}

// Specializations for partial vectors because vec_packs sets lanes above 2*N.
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;

  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}

template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
          HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;

  const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packsu(a.raw, b.raw)};
}

template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
          HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;

  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}

template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}

template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
#else
  return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a)));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>{vec_floate(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
#endif

  const RebindToUnsigned<D> du;
  const Rebind<uint64_t, D> du64;
  return Vec64<float>{
      BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<int32_t>{vec_signede(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<int32_t> f64_to_i32{vec_signede(v.raw)};
#else
  const Vec128<int32_t> f64_to_i32{vec_signedo(v.raw)};
#endif

  const Rebind<int64_t, D> di64;
  const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
  return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
}

// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const Rebind<uint16_t, DFromV<decltype(v)>> du16;
  const Rebind<uint8_t, decltype(du16)> du8;
  return TruncateTo(du8, TruncateTo(du16, v));
}
|
|
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
|
|
|
|
// Note: altivec.h vec_ct* currently contain C casts which trigger
// -Wdeprecate-lax-vec-conv-all warnings, so disable them.
|
|
|
|
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_NOT_FLOAT(FromT),
|
|
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
HWY_DIAGNOSTICS(push)
|
|
#if HWY_COMPILER_CLANG
|
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
#endif
|
|
return VFromD<D>{vec_ctf(v.raw, 0)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
|
|
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
return VFromD<D>{vec_double(v.raw)};
|
|
}
|
|
|
|
// Truncates (rounds toward zero).
|
|
template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_FLOAT(FromT),
|
|
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
HWY_DIAGNOSTICS(push)
|
|
#if HWY_COMPILER_CLANG
|
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
#endif
|
|
return VFromD<D>{vec_cts(v.raw, 0)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_FLOAT(FromT),
|
|
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
HWY_DIAGNOSTICS(push)
|
|
#if HWY_COMPILER_CLANG
|
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
#endif
|
|
return VFromD<D>{vec_ctu(v.raw, 0)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
|
|
HWY_DIAGNOSTICS(push)
|
|
#if HWY_COMPILER_CLANG
|
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
#endif
|
|
return Vec128<int32_t, N>{vec_cts(vec_round(v.raw), 0)};
|
|
HWY_DIAGNOSTICS(pop)
|
|
}
|
|
|
|
// ------------------------------ Floating-point rounding (ConvertTo)
|
|
|
|
// Toward nearest integer, ties to even
|
|
template <size_t N>
|
|
HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
|
|
return Vec128<float, N>{vec_round(v.raw)};
|
|
}
|
|
|
|
template <size_t N>
|
|
HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
|
|
return Vec128<double, N>{vec_rint(v.raw)};
|
|
}
|
|
|
|
// Toward zero, aka truncate
|
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
|
|
return Vec128<T, N>{vec_trunc(v.raw)};
|
|
}
|
|
|
|
// Toward +infinity, aka ceiling
|
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
|
|
return Vec128<T, N>{vec_ceil(v.raw)};
|
|
}
|
|
|
|
// Toward -infinity, aka floor
|
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
|
|
return Vec128<T, N>{vec_floor(v.raw)};
|
|
}
|
|
|
|
// ------------------------------ Floating-point classification
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
return v != v;
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
using TU = MakeUnsigned<T>;
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const VFromD<decltype(du)> vu = BitCast(du, v);
|
|
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
|
|
return RebindMask(
|
|
d,
|
|
Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
|
|
}
|
|
|
|
// Returns whether normal/subnormal/zero.
|
|
template <typename T, size_t N>
|
|
HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
|
|
static_assert(IsFloat<T>(), "Only for float");
|
|
using TU = MakeUnsigned<T>;
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
const VFromD<decltype(du)> vu = BitCast(du, v);
|
|
// 'Shift left' to clear the sign bit, check for exponent<max.
|
|
return RebindMask(
|
|
d,
|
|
Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
|
|
}
|
|
|
|
// ================================================== CRYPTO
|
|
|
|
#if !defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
|
|
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
|
|
#ifdef HWY_NATIVE_AES
|
|
#undef HWY_NATIVE_AES
|
|
#else
|
|
#define HWY_NATIVE_AES
|
|
#endif
|
|
|
|
namespace detail {
|
|
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
|
|
using CipherTag = Full128<uint64_t>;
|
|
#else
|
|
using CipherTag = Full128<uint8_t>;
|
|
#endif  // HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
|
|
using CipherVec = VFromD<CipherTag>;
|
|
} // namespace detail
|
|
|
|
HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
|
|
Vec128<uint8_t> round_key) {
|
|
const detail::CipherTag dc;
|
|
const Full128<uint8_t> du8;
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return Reverse(du8,
|
|
BitCast(du8, detail::CipherVec{vec_cipher_be(
|
|
BitCast(dc, Reverse(du8, state)).raw,
|
|
BitCast(dc, Reverse(du8, round_key)).raw)}));
|
|
#else
|
|
return BitCast(du8, detail::CipherVec{vec_cipher_be(
|
|
BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
|
|
Vec128<uint8_t> round_key) {
|
|
const detail::CipherTag dc;
|
|
const Full128<uint8_t> du8;
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return Reverse(du8,
|
|
BitCast(du8, detail::CipherVec{vec_cipherlast_be(
|
|
BitCast(dc, Reverse(du8, state)).raw,
|
|
BitCast(dc, Reverse(du8, round_key)).raw)}));
|
|
#else
|
|
return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
|
|
BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
|
|
Vec128<uint8_t> round_key) {
|
|
const detail::CipherTag dc;
|
|
const Full128<uint8_t> du8;
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be(
|
|
BitCast(dc, Reverse(du8, state)).raw,
|
|
Zero(dc).raw)})),
|
|
round_key);
|
|
#else
|
|
return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be(
|
|
BitCast(dc, state).raw, Zero(dc).raw)}),
|
|
round_key);
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
|
|
Vec128<uint8_t> round_key) {
|
|
const detail::CipherTag dc;
|
|
const Full128<uint8_t> du8;
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return Reverse(du8,
|
|
BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
|
|
BitCast(dc, Reverse(du8, state)).raw,
|
|
BitCast(dc, Reverse(du8, round_key)).raw)}));
|
|
#else
|
|
return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
|
|
BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
|
|
#endif
|
|
}
|
|
|
|
HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
|
|
const Full128<uint8_t> du8;
|
|
const auto zero = Zero(du8);
|
|
|
|
  // PPC8/PPC9/PPC10 do not have a single instruction for the AES
  // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do.

  // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10
  // by doing an AESLastRound operation with a zero round_key followed by an
  // AESRoundInv operation with a zero round_key.
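  // (With zero round keys, the SubBytes/ShiftRows performed by AESLastRound
  // are undone by the InvSubBytes/InvShiftRows within AESRoundInv, leaving
  // only the InvMixColumns step.)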
return AESRoundInv(AESLastRound(state, zero), zero);
|
|
}
|
|
|
|
template <uint8_t kRcon>
|
|
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
|
|
constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
|
|
0, 0, 0, 0, kRcon, 0, 0, 0};
|
|
constexpr __vector unsigned char kRotWordShuffle = {
|
|
4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
|
|
const detail::CipherTag dc;
|
|
const Full128<uint8_t> du8;
|
|
const auto sub_word_result =
|
|
BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
|
|
const auto rot_word_result =
|
|
TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle});
|
|
return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
|
|
}
|
|
|
|
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  // NOTE: Lane 1 of both a and b needs to be zeroed out for the
  // vec_pmsum_be operation below, as vec_pmsum_be does a carryless
  // multiplication of each 64-bit half and then adds the two halves
  // using a bitwise XOR operation.

  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);

  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));

#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Need to swap the two halves of pmsum_result on big-endian targets as
  // the upper 64 bits of the carryless multiplication result are in lane 0 of
  // pmsum_result and the lower 64 bits of the carryless multiplication result
  // are in lane 1 of pmsum_result.
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
|
|
|
|
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  // NOTE: Lane 0 of both a and b needs to be zeroed out for the
  // vec_pmsum_be operation below, as vec_pmsum_be does a carryless
  // multiplication of each 64-bit half and then adds the two halves
  // using a bitwise XOR operation.

  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);

  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));

#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Need to swap the two halves of pmsum_result on big-endian targets as
  // the upper 64 bits of the carryless multiplication result are in lane 0 of
  // pmsum_result and the lower 64 bits of the carryless multiplication result
  // are in lane 1 of pmsum_result.
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
|
|
|
|
#endif // !defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
|
|
// ================================================== MISC
|
|
|
|
// ------------------------------ LoadMaskBits (TestBit)
|
|
|
|
namespace detail {
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
|
|
#if HWY_PPC_HAVE_10
|
|
const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)};
|
|
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return MFromD<D>{MaskFromVec(mask_vec).raw};
|
|
#else
|
|
return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw};
|
|
#endif // HWY_IS_LITTLE_ENDIAN
|
|
|
|
#else // PPC9 or earlier
|
|
const Full128<uint8_t> du8;
|
|
const Full128<uint16_t> du16;
|
|
const Vec128<uint8_t> vbits =
|
|
BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
|
|
|
|
// Replicate bytes 8x such that each byte contains the bit that governs it.
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
|
|
1, 1, 1, 1, 1, 1, 1, 1};
|
|
#else
|
|
const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
|
|
0, 0, 0, 0, 0, 0, 0, 0};
|
|
#endif // HWY_IS_LITTLE_ENDIAN
|
|
|
|
const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)};
|
|
const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
|
|
1, 2, 4, 8, 16, 32, 64, 128};
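  // Example: mask_bits = 0x05 sets lanes 0 and 2; rep8 holds the low byte of
  // mask_bits (0x05) in lanes 0..7, and TestBit then sets exactly the lanes
  // whose governing bit is 1.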
return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw};
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
|
|
#if HWY_PPC_HAVE_10
|
|
const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)};
|
|
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return MFromD<D>{MaskFromVec(mask_vec).raw};
|
|
#else
|
|
return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw};
|
|
#endif // HWY_IS_LITTLE_ENDIAN
|
|
|
|
#else // PPC9 or earlier
|
|
const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
|
|
const auto vmask_bits =
|
|
Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
|
|
return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
|
|
#if HWY_PPC_HAVE_10
|
|
const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
|
|
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return MFromD<D>{MaskFromVec(mask_vec).raw};
|
|
#else
|
|
return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw};
|
|
#endif // HWY_IS_LITTLE_ENDIAN
|
|
|
|
#else // PPC9 or earlier
|
|
const __vector unsigned int kBit = {1, 2, 4, 8};
|
|
const auto vmask_bits =
|
|
Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
|
|
return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
|
|
#if HWY_PPC_HAVE_10
|
|
const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
|
|
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
return MFromD<D>{MaskFromVec(mask_vec).raw};
|
|
#else
|
|
return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw};
|
|
#endif // HWY_IS_LITTLE_ENDIAN
|
|
|
|
#else // PPC9 or earlier
|
|
const __vector unsigned long long kBit = {1, 2};
|
|
const auto vmask_bits =
|
|
Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
|
|
return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// `bits` points to at least 8 readable bytes, not all of which need be valid.
|
|
template <class D, HWY_IF_LANES_LE_D(D, 8)>
|
|
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
|
|
// If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t
|
|
uint64_t mask_bits = bits[0];
|
|
|
|
constexpr size_t kN = MaxLanes(d);
|
|
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
|
|
return detail::LoadMaskBits128(d, mask_bits);
|
|
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  // First, copy the mask bits to a uint16_t as there are at most
  // 16 lanes in a vector.

  // Copying the mask bits to a uint16_t first will also ensure that the
  // mask bits are loaded into the lower 16 bits on big-endian PPC targets.
  uint16_t u16_mask_bits;
  CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);

#if HWY_IS_LITTLE_ENDIAN
  return detail::LoadMaskBits128(d, u16_mask_bits);
#else
  // On big-endian targets, u16_mask_bits needs to be byte-swapped as bits
  // contains the mask bits in little-endian byte order.

  // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a
  // single lhbrx instruction on big-endian PPC targets when optimizations
  // are enabled.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
#else
  return detail::LoadMaskBits128(
      d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
#endif
#endif
}
|
|
|
|
template <typename T>
|
|
struct CompressIsPartition {
|
|
// generic_ops-inl does not guarantee IsPartition for 8-bit.
|
|
enum { value = (sizeof(T) != 1) };
|
|
};
|
|
|
|
// ------------------------------ StoreMaskBits
|
|
|
|
namespace detail {
|
|
|
|
#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
// fallback for missing vec_extractm
|
|
template <size_t N>
|
|
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
|
|
__vector unsigned char bit_shuffle) {
|
|
// clang POWER8 and 9 targets appear to differ in their return type of
|
|
// vec_vbpermq: unsigned or signed, so cast to avoid a warning.
|
|
using VU64 = detail::Raw128<uint64_t>::type;
|
|
const Vec128<uint64_t> extracted{
|
|
reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
|
|
return extracted.raw[HWY_IS_LITTLE_ENDIAN];
|
|
}
|
|
|
|
#endif  // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
|
|
const DFromM<decltype(mask)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
|
|
#else
|
|
const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
|
|
56, 48, 40, 32, 24, 16, 8, 0};
|
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
const DFromM<decltype(mask)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
|
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
#else
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
const __vector unsigned char kBitShuffle = {
|
|
112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
|
|
#else
|
|
const __vector unsigned char kBitShuffle = {
|
|
128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
|
|
#endif
|
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
const DFromM<decltype(mask)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
#else
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
|
|
128, 128, 128, 128, 128, 128,
|
|
128, 128, 128, 128};
|
|
#else
|
|
const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
|
|
128, 128, 128, 128, 128, 128,
|
|
96, 64, 32, 0};
|
|
#endif
|
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
const DFromM<decltype(mask)> d;
|
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
#else
|
|
#if HWY_IS_LITTLE_ENDIAN
|
|
const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
|
|
128, 128, 128, 128, 128, 128,
|
|
128, 128, 128, 128};
|
|
#else
|
|
const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
|
|
128, 128, 128, 128, 128, 128,
|
|
128, 128, 64, 0};
|
|
#endif
|
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
#endif // HWY_PPC_HAVE_10
|
|
}
|
|
|
|
// Returns the lowest N of the mask bits.
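// For partial vectors, bits above N in the BitsFromMask result may come from
// undefined upper lanes and must be cleared; full vectors need no masking.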
template <typename T, size_t N>
|
|
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
|
|
return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
|
|
}
|
|
|
|
template <typename T, size_t N>
|
|
HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
|
|
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// `bits` points to at least 8 writable bytes.
|
|
template <class D, HWY_IF_LANES_LE_D(D, 8)>
HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
  // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
  // to a uint8_t and store the result in bits[0].
  bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask));
  return sizeof(uint8_t);
}
|
|
|
|
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
  const auto mask_bits = detail::BitsFromMask(mask);

  // First convert mask_bits to a uint16_t as we only want to store
  // the lower 16 bits of mask_bits (there are 16 lanes in mask).

  // Converting mask_bits to a uint16_t first will also ensure that
  // the lower 16 bits of mask_bits are stored instead of the upper 16 bits
  // of mask_bits on big-endian PPC targets.
#if HWY_IS_LITTLE_ENDIAN
  const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
#else
  // On big-endian targets, the bytes of mask_bits need to be swapped
  // as StoreMaskBits expects the mask bits to be stored in little-endian
  // byte order.

  // GCC will also optimize the byte swap and CopyBytes operations below
  // to a single sthbrx instruction when optimizations are enabled on
  // big-endian PPC targets.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  const uint16_t u16_mask_bits =
      __builtin_bswap16(static_cast<uint16_t>(mask_bits));
#else
  const uint16_t u16_mask_bits = static_cast<uint16_t>(
      (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
#endif
#endif

  CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
  return sizeof(uint16_t);
}
|
|
|
|
// ------------------------------ Mask testing
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API bool AllFalse(D d, MFromD<D> mask) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
return static_cast<bool>(vec_all_eq(RebindMask(du, mask).raw, Zero(du).raw));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
using TU = TFromD<decltype(du)>;
|
|
return static_cast<bool>(
|
|
vec_all_eq(RebindMask(du, mask).raw, Set(du, hwy::LimitsMax<TU>()).raw));
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API bool AllFalse(D d, MFromD<D> mask) {
|
|
const Full128<TFromD<D>> d_full;
|
|
constexpr size_t kN = MaxLanes(d);
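  // The upper (unused) lanes of a partial mask are undefined, so clear them
  // with FirstN before testing the full 128-bit mask.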
return AllFalse(d_full, MFromD<decltype(d_full)>{
|
|
vec_and(mask.raw, FirstN(d_full, kN).raw)});
|
|
}
|
|
|
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
const Full128<TFromD<D>> d_full;
|
|
constexpr size_t kN = MaxLanes(d);
|
|
return AllTrue(d_full, MFromD<decltype(d_full)>{
|
|
vec_or(mask.raw, Not(FirstN(d_full, kN)).raw)});
|
|
}
|
|
|
|
template <class D>
|
|
HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
|
|
return PopCount(detail::BitsFromMask(mask));
|
|
}
|
|
|
|
template <class D>
|
|
HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
|
|
return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
|
|
}
|
|
|
|
template <class D>
|
|
HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
|
|
}
|
|
|
|
template <class D>
|
|
HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
|
|
return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
|
|
}
|
|
|
|
template <class D>
|
|
HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
|
|
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
|
|
: -1;
|
|
}
|
|
|
|
// ------------------------------ Compress, CompressBits
|
|
|
|
namespace detail {
|
|
|
|
// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 256);
|
|
const Rebind<uint8_t, decltype(d)> d8;
|
|
const Twice<decltype(d8)> d8t;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// To reduce cache footprint, store lane indices and convert to byte indices
|
|
// (2*lane + 0..1), with the doubling baked into the table. It's not clear
|
|
// that the additional cost of unpacking nibbles is worthwhile.
|
|
alignas(16) static constexpr uint8_t table[2048] = {
|
|
// PrintCompress16x8Tables
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
|
|
2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, //
|
|
2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, //
|
|
4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, //
|
|
2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, //
|
|
2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, //
|
|
4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, //
|
|
2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, //
|
|
6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, //
|
|
2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, //
|
|
4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, //
|
|
2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, //
|
|
2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, //
|
|
4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, //
|
|
2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, //
|
|
6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, //
|
|
2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, //
|
|
4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, //
|
|
2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, //
|
|
8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, //
|
|
2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, //
|
|
4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, //
|
|
2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, //
|
|
6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, //
|
|
2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, //
|
|
4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, //
|
|
2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, //
|
|
2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, //
|
|
4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, //
|
|
2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, //
|
|
6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, //
|
|
2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, //
|
|
4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, //
|
|
2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, //
|
|
8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, //
|
|
2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, //
|
|
4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, //
|
|
2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, //
|
|
6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, //
|
|
2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, //
|
|
4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, //
|
|
2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, //
|
|
10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, //
|
|
2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, //
|
|
4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, //
|
|
2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, //
|
|
6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, //
|
|
2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, //
|
|
4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, //
|
|
2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, //
|
|
8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, //
|
|
2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, //
|
|
4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, //
|
|
2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, //
|
|
6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, //
|
|
2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, //
|
|
4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, //
|
|
2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
|
|
14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, //
|
|
2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, //
|
|
4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, //
|
|
2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, //
|
|
6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, //
|
|
2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, //
|
|
4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, //
|
|
2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, //
|
|
8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, //
|
|
2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, //
|
|
4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, //
|
|
2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, //
|
|
6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, //
|
|
2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, //
|
|
4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, //
|
|
2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, //
|
|
10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, //
|
|
2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, //
|
|
4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, //
|
|
2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, //
|
|
6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, //
|
|
2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, //
|
|
4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, //
|
|
2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, //
|
|
8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, //
|
|
2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, //
|
|
4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, //
|
|
2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, //
|
|
6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, //
|
|
2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, //
|
|
4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, //
|
|
2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, //
|
|
12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, //
|
|
2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, //
|
|
4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, //
|
|
2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, //
|
|
6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, //
|
|
2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, //
|
|
4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, //
|
|
2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, //
|
|
8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, //
|
|
2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, //
|
|
4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, //
|
|
2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, //
|
|
6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, //
|
|
2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, //
|
|
4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, //
|
|
2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, //
|
|
10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, //
|
|
2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, //
|
|
4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, //
|
|
2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, //
|
|
6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, //
|
|
2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, //
|
|
4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, //
|
|
2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, //
|
|
8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, //
|
|
2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, //
|
|
4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, //
|
|
2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, //
|
|
6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, //
|
|
2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, //
|
|
4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, //
|
|
2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
|
|
|
|
const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
|
|
constexpr uint16_t kPairIndexIncrement =
|
|
HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
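  // e.g. a lane index of 3 is stored in the table as byte index 6; ZipLower
  // duplicates it and adding the increment yields the byte pair {6, 7}.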
|
|
return BitCast(d, pairs + Set(du, kPairIndexIncrement));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 256);
|
|
const Rebind<uint8_t, decltype(d)> d8;
|
|
const Twice<decltype(d8)> d8t;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// To reduce cache footprint, store lane indices and convert to byte indices
|
|
// (2*lane + 0..1), with the doubling baked into the table. It's not clear
|
|
// that the additional cost of unpacking nibbles is worthwhile.
|
|
alignas(16) static constexpr uint8_t table[2048] = {
|
|
// PrintCompressNot16x8Tables
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
|
|
0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
|
|
0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
|
|
0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
|
|
0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
|
|
0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
|
|
0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
|
|
0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
|
|
0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
|
|
0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
|
|
0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
|
|
0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
|
|
0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
|
|
0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
|
|
0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
|
|
0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
|
|
0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
|
|
0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
|
|
0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
|
|
0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
|
|
0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
|
|
0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
|
|
0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
|
|
0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
|
|
0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
|
|
0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
|
|
0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
|
|
0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
|
|
0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
|
|
0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
|
|
0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
|
|
0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
|
|
0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
|
|
0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
|
|
0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
|
|
0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
|
|
0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
|
|
0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
|
|
0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
|
|
0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
|
|
0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
|
|
0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
|
|
0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
|
|
0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
|
|
0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
|
|
0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
|
|
0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
|
|
0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
|
|
0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
|
|
0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
|
|
0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
|
|
0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
|
|
0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
|
|
0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
|
|
0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
|
|
0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
|
|
0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
|
|
0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
|
|
0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
|
|
0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
|
|
0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
|
|
0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
|
|
0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
|
|
0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
|
|
0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
|
|
0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
|
|
0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
|
|
0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
|
|
0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
|
|
0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
|
|
0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
|
|
0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
|
|
0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
|
|
0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
|
|
0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
|
|
0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
|
|
0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
|
|
0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
|
|
0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
|
|
0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
|
|
0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
|
|
0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
|
|
0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
|
|
0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
|
|
0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
|
|
0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
|
|
0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
|
|
0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
|
|
0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
|
|
0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
|
|
0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
|
|
0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
|
|
0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
|
|
0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
|
|
0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
|
|
0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
|
|
0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
|
|
0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
|
|
0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
|
|
0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
|
|
0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
|
|
0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
|
|
0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
|
|
0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
|
|
0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
|
|
0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
|
|
0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
|
|
0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
|
|
0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
|
|
0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
|
|
0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
|
|
0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
|
|
0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
|
|
0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
|
|
0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
|
|
0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
|
|
0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
|
|
0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
|
|
0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
|
|
0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
|
|
0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
|
|
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
|
|
|
|
const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
|
|
const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
|
|
constexpr uint16_t kPairIndexIncrement =
|
|
HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
|
|
|
|
return BitCast(d, pairs + Set(du, kPairIndexIncrement));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
// There are only 4 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[256] = {
|
|
// PrintCompress32x4Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
|
|
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
|
|
12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
|
|
0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
|
|
4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
|
|
0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
|
|
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 16);
|
|
|
|
// There are only 4 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[256] = {
|
|
// PrintCompressNot32x4Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
|
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
|
|
14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
|
|
2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
|
|
2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
|
|
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|
10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
|
12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 4);
|
|
|
|
// There are only 2 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[64] = {
|
|
// PrintCompress64x2Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
|
|
HWY_DASSERT(mask_bits < 4);
|
|
|
|
// There are only 2 lanes, so we can afford to load the index vector directly.
|
|
alignas(16) static constexpr uint8_t u8_indices[64] = {
|
|
// PrintCompressNot64x2Tables
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
|
|
const Repartition<uint8_t, decltype(d)> d8;
|
|
return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
HWY_DASSERT(mask_bits < (1ull << N));
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
|
|
const DFromV<decltype(v)> d;
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
HWY_DASSERT(mask_bits < (1ull << N));
|
|
const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
|
|
return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
// Single lane: no-op
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
|
|
return v;
|
|
}
|
|
|
|
// Two lanes: conditional swap
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
|
|
// If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
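  // (In the other three mask combinations the kept lanes, if any, are already
  // in the correct position.)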
const Full128<T> d;
|
|
const Vec128<T> m = VecFromMask(d, mask);
|
|
const Vec128<T> maskL = DupEven(m);
|
|
const Vec128<T> maskH = DupOdd(m);
|
|
const Vec128<T> swap = AndNot(maskL, maskH);
|
|
return IfVecThenElse(swap, Shuffle01(v), v);
|
|
}
|
|
|
|
// General case, 2 or 4 bytes
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
return detail::CompressBits(v, detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressNot
|
|
|
|
// Single lane: no-op
|
|
template <typename T>
|
|
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
|
|
return v;
|
|
}
|
|
|
|
// Two lanes: conditional swap
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
|
|
// If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
|
|
const Full128<T> d;
|
|
const Vec128<T> m = VecFromMask(d, mask);
|
|
const Vec128<T> maskL = DupEven(m);
|
|
const Vec128<T> maskH = DupOdd(m);
|
|
const Vec128<T> swap = AndNot(maskH, maskL);
|
|
return IfVecThenElse(swap, Shuffle01(v), v);
|
|
}
|
|
|
|
// General case, 2 or 4 bytes
|
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
// For partial vectors, we cannot pull the Not() into the table because
|
|
// BitsFromMask clears the upper bits.
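  // Otherwise Not() would turn those cleared bits into lanes to keep.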
if (N < 16 / sizeof(T)) {
|
|
return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
|
|
}
|
|
return detail::CompressNotBits(v, detail::BitsFromMask(mask));
|
|
}
|
|
|
|
// ------------------------------ CompressBlocksNot
|
|
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
|
|
Mask128<uint64_t> /* m */) {
|
|
return v;
|
|
}
|
|
|
|
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
|
|
const uint8_t* HWY_RESTRICT bits) {
|
|
  // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
  // convert bits[0] to a uint64_t.
|
|
uint64_t mask_bits = bits[0];
|
|
if (N < 8) {
|
|
mask_bits &= (1ull << N) - 1;
|
|
}
|
|
|
|
return detail::CompressBits(v, mask_bits);
|
|
}
|
|
|
|
// ------------------------------ CompressStore, CompressBitsStore
|
|
|
|
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
const uint64_t mask_bits = detail::BitsFromMask(m);
|
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
StoreU(compressed, d, unaligned);
|
|
return count;
|
|
}
|
|
|
|
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
const uint64_t mask_bits = detail::BitsFromMask(m);
|
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
BlendedStore(compressed, FirstN(d, count), d, unaligned);
|
|
return count;
|
|
}
|
|
|
|
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
|
|
D d, TFromD<D>* HWY_RESTRICT unaligned) {
|
|
const RebindToUnsigned<decltype(d)> du;
|
|
|
|
// As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
|
|
// convert bits[0] to a uint64_t
|
|
uint64_t mask_bits = bits[0];
|
|
constexpr size_t kN = MaxLanes(d);
|
|
if (kN < 8) {
|
|
mask_bits &= (1ull << kN) - 1;
|
|
}
|
|
const size_t count = PopCount(mask_bits);
|
|
|
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
StoreU(compressed, d, unaligned);
|
|
|
|
return count;
|
|
}
|
|
|
|
// ------------------------------ StoreInterleaved2/3/4
|
|
|
|
// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
|
|
// generic_ops-inl.h.
|
|
|
|
// ------------------------------ Reductions
|
|
|
|
namespace detail {
|
|
|
|
// N=1 for any T: no-op
|
|
template <typename T>
|
|
HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
|
|
return v;
|
|
}
|
|
template <typename T>
|
|
HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
|
|
return v;
|
|
}
|
|
template <typename T>
|
|
HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
|
|
return v;
|
|
}
|
|
|
|
// u32/i32/f32:
|
|
|
|
// N=2
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
|
|
  // NOTE: AltivecVsum2sws cannot be used here because it computes the
  // signed saturated sum of the lanes.
|
|
return v10 + Shuffle2301(v10);
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
|
|
return Min(v10, Shuffle2301(v10));
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
|
|
return Max(v10, Shuffle2301(v10));
|
|
}
|
|
|
|
// N=4 (full)
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v3210) {
|
|
  // NOTE: AltivecVsumsws cannot be used here because it computes the
  // signed saturated sum of the lanes.
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = v3210 + v1032;
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return v20_31_20_31 + v31_20_31_20;
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Min(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v3210) {
|
|
const Vec128<T> v1032 = Shuffle1032(v3210);
|
|
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
|
|
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
|
|
return Max(v20_31_20_31, v31_20_31_20);
|
|
}
|
|
|
|
// u64/i64/f64:
|
|
|
|
// N=2 (full)
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return v10 + v01;
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Min(v10, v01);
|
|
}
|
|
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) {
|
|
const Vec128<T> v01 = Shuffle01(v10);
|
|
return Max(v10, v01);
|
|
}
|
|
|
|
// Casts nominally int32_t result to D.
|
|
template <class D>
|
|
HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
|
|
__vector signed int b) {
|
|
const Repartition<int32_t, D> di32;
|
|
#ifdef __OPTIMIZE__
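  // When both operands are compile-time constants, compute the saturating
  // sums here so the compiler can fold the result instead of emitting
  // vsum4shs.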
if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
|
|
const int64_t sum0 = static_cast<int64_t>(a[0]) +
|
|
static_cast<int64_t>(a[1]) +
|
|
static_cast<int64_t>(b[0]);
|
|
const int64_t sum1 = static_cast<int64_t>(a[2]) +
|
|
static_cast<int64_t>(a[3]) +
|
|
static_cast<int64_t>(b[1]);
|
|
const int64_t sum2 = static_cast<int64_t>(a[4]) +
|
|
static_cast<int64_t>(a[5]) +
|
|
static_cast<int64_t>(b[2]);
|
|
const int64_t sum3 = static_cast<int64_t>(a[6]) +
|
|
static_cast<int64_t>(a[7]) +
|
|
static_cast<int64_t>(b[3]);
|
|
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
|
|
const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
|
|
using Raw = typename detail::Raw128<int32_t>::type;
|
|
return BitCast(
|
|
d,
|
|
VFromD<decltype(di32)>{Raw{
|
|
(sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
|
|
: static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
|
|
(sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
|
|
: static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
|
|
(sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
|
|
: static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
|
|
(sign3 == (sum3 >> 31))
|
|
? static_cast<int32_t>(sum3)
|
|
: static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
|
|
} else // NOLINT
|
|
#endif
|
|
{
|
|
return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
|
|
}
|
|
}
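// Illustrative example (not part of the implementation): vsum4shs adds each
// pair of adjacent int16_t lanes of a to the corresponding int32_t lane of b,
// with signed saturation. For a = {1, 2, 3, 4, 5, 6, 7, 8} and
// b = {10, 20, 30, 40}, the result is {13, 27, 41, 55}.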

// Casts nominally int32_t result to D.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    const int64_t sum0 =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[0]);
    const int64_t sum1 =
        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
        static_cast<int64_t>(b[1]);
    const int64_t sum2 =
        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
        static_cast<int64_t>(b[2]);
    const int64_t sum3 =
        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
        static_cast<int64_t>(b[3]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
  }
}
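// Illustrative example (not part of the implementation): vsum4sbs adds each
// group of four adjacent int8_t lanes of a to the corresponding int32_t lane
// of b, with signed saturation. For a = {1, 2, ..., 16} and b = {0, 0, 0, 0},
// the result is {10, 26, 42, 58}.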

// Casts nominally int32_t result to D.
template <class D>
HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
                                    __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
    const int64_t sum =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[kDestLaneOffset]);
    const int32_t sign = static_cast<int32_t>(sum >> 63);
#if HWY_IS_LITTLE_ENDIAN
    return BitCast(
        d, VFromD<decltype(di32)>{(__vector signed int){
               (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
                                     : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
               0, 0, 0}});
#else
    return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
                          0, 0, 0,
                          (sign == (sum >> 31))
                              ? static_cast<int32_t>(sum)
                              : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
#endif
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsumsws to avoid unnecessary shuffling on
    // little-endian PowerPC targets, where the result of the vsumsws
    // instruction is already in the correct lanes.
    __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}
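// Illustrative example (not part of the implementation): vsumsws adds all four
// int32_t lanes of a to one lane of b (lane 0 on little-endian, lane 3 on
// big-endian, as kDestLaneOffset above reflects) and places the saturated sum
// in that same lane, zeroing the others. For a = {1, 2, 3, 4} and b = 0, the
// sum lane holds 10.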

template <size_t N>
HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di16;
  const RepartitionToWide<decltype(di16)> di32;
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
                         Set(di32, 65536).raw);
}
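// Why the Xor/Set above work: Xor with -32768 flips the sign bit, i.e. maps
// each uint16_t lane x to the int16_t value x - 32768. vsum4shs then adds two
// such biased lanes per int32_t lane, so adding back 2 * 32768 = 65536 via the
// second operand yields the exact unsigned sum of each pair of lanes.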

HWY_API Vec32<uint16_t> SumOfLanes(Vec32<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  const DFromV<decltype(v)> du16;
  return Broadcast<kSumLaneIdx>(BitCast(du16, AltivecU16SumsOf2(v)));
}

HWY_API Vec64<uint16_t> SumOfLanes(Vec64<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  const Full64<uint16_t> du16;
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(
      AltivecVsum2sws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
}

HWY_API Vec128<uint16_t> SumOfLanes(Vec128<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full128<uint16_t> du16;
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(
      AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
}

HWY_API Vec32<int16_t> SumOfLanes(Vec32<int16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  const Full32<int16_t> di16;
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(AltivecVsum4shs(di16, v.raw, zero.raw));
}

HWY_API Vec64<int16_t> SumOfLanes(Vec64<int16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  const Full128<int32_t> di32;
  const Full64<int16_t> di16;
  const auto zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
      di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
}

HWY_API Vec128<int16_t> SumOfLanes(Vec128<int16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full128<int16_t> di16;
  const Full128<int32_t> di32;
  const auto zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(AltivecVsumsws(
      di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
}
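// Illustrative example (not part of the implementation): for a full vector of
// int16_t lanes {1, 2, 3, 4, 5, 6, 7, 8}, the vsum4shs/vsumsws chain above
// produces 36 in the sum lane, which Broadcast then copies to all lanes.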

// u8, N=2, N=4, N=8, N=16:
HWY_API Vec16<uint8_t> SumOfLanes(Vec16<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  const Full16<uint8_t> du8;
  const Full16<uint16_t> du16;
  const Twice<decltype(du8)> dt_u8;
  const Twice<decltype(du16)> dt_u16;
  const Full128<uint32_t> du32;
  return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs(
      dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw,
      Zero(du32).raw)));
}

HWY_API Vec32<uint8_t> SumOfLanes(Vec32<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  const Full128<uint32_t> du32;
  const Full32<uint8_t> du8;
  return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw));
}

HWY_API Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full64<uint8_t> du8;
  return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
}

HWY_API Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;

  const Full128<uint32_t> du32;
  const RebindToSigned<decltype(du32)> di32;
  const Full128<uint8_t> du8;
  const Vec128<uint32_t> zero = Zero(du32);
  return Broadcast<kSumLaneIdx>(
      AltivecVsumsws(du8, AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
                     BitCast(di32, zero).raw));
}

HWY_API Vec16<int8_t> SumOfLanes(Vec16<int8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;

  const Full128<uint16_t> du16;
  const Repartition<int32_t, decltype(du16)> di32;
  const Repartition<int8_t, decltype(du16)> di8;
  const Vec128<int8_t> zzvv = BitCast(
      di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16)));
  return Vec16<int8_t>{
      Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw))
          .raw};
}

HWY_API Vec32<int8_t> SumOfLanes(Vec32<int8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  const Full32<int8_t> di8;
  const Vec128<int32_t> zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw));
}

HWY_API Vec64<int8_t> SumOfLanes(Vec64<int8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full128<int32_t> di32;
  const Vec128<int32_t> zero = Zero(di32);
  const Full64<int8_t> di8;
  return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
      di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
}

HWY_API Vec128<int8_t> SumOfLanes(Vec128<int8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
  const Full128<int8_t> di8;
  const Full128<int32_t> di32;
  const Vec128<int32_t> zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(AltivecVsumsws(
      di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
}
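
// The 8-bit Min/MaxOfLanes below reduce via a log2(N)-step tree: Reverse2 on
// successively wider descriptors (8-, 16-, 32- and, for N=16, 64-bit) swaps
// each lane with a progressively more distant partner, and Min/Max folds the
// halves together, so afterwards every lane holds the overall result.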
template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
  vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
  vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
  if (N > 8) {
    const RepartitionToWide<decltype(d32)> d64;
    vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
  }
  return vm;
}

template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
  vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
  vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
  if (N > 8) {
    const RepartitionToWide<decltype(d32)> d64;
    vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
  }
  return vm;
}

template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
HWY_API Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  Vec128<int8_t, N> vm = Max(v, Reverse2(d, v));
  vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
  vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
  if (N > 8) {
    const RepartitionToWide<decltype(d32)> d64;
    vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
  }
  return vm;
}

template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  Vec128<int8_t, N> vm = Min(v, Reverse2(d, v));
  vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
  vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
  if (N > 8) {
    const RepartitionToWide<decltype(d32)> d64;
    vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
  }
  return vm;
}
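
// The 16-bit Min/MaxOfLanes below widen to 32-bit lanes (even/odd extraction),
// reduce with the 32-bit overloads above, then vec_pack narrows the result
// back so that every 16-bit lane receives the reduced value.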
template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
HWY_API Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) {
  const Simd<uint16_t, N, 0> d;
  const RepartitionToWide<decltype(d)> d32;
#if HWY_IS_LITTLE_ENDIAN
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
#else
  const auto even = ShiftRight<16>(BitCast(d32, v));
  const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
#endif
  const auto min = MinOfLanes(Min(even, odd));
  // Also broadcast into odd lanes on little-endian and into even lanes
  // on big-endian
  return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)};
}
template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) {
  const Simd<int16_t, N, 0> d;
  const RepartitionToWide<decltype(d)> d32;
  // Sign-extend
#if HWY_IS_LITTLE_ENDIAN
  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
#else
  const auto even = ShiftRight<16>(BitCast(d32, v));
  const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
#endif
  const auto min = MinOfLanes(Min(even, odd));
  // Also broadcast into odd lanes on little-endian and into even lanes
  // on big-endian
  return Vec128<int16_t, N>{vec_pack(min.raw, min.raw)};
}

template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
HWY_API Vec128<uint16_t, N> MaxOfLanes(Vec128<uint16_t, N> v) {
  const Simd<uint16_t, N, 0> d;
  const RepartitionToWide<decltype(d)> d32;
#if HWY_IS_LITTLE_ENDIAN
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
#else
  const auto even = ShiftRight<16>(BitCast(d32, v));
  const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
#endif
  const auto max = MaxOfLanes(Max(even, odd));
  // Also broadcast into odd lanes on little-endian and into even lanes
  // on big-endian
  return Vec128<uint16_t, N>{vec_pack(max.raw, max.raw)};
}
template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
HWY_API Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) {
  const Simd<int16_t, N, 0> d;
  const RepartitionToWide<decltype(d)> d32;
  // Sign-extend
#if HWY_IS_LITTLE_ENDIAN
  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
#else
  const auto even = ShiftRight<16>(BitCast(d32, v));
  const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
#endif
  const auto max = MaxOfLanes(Max(even, odd));
  // Also broadcast into odd lanes on little-endian and into even lanes
  // on big-endian
  return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)};
}

}  // namespace detail

// Supported for u/i/f 32/64. Returns the same value in each lane.
template <class D>
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  return detail::SumOfLanes(v);
}
template <class D>
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
  return GetLane(detail::SumOfLanes(v));
}
template <class D>
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  return detail::MinOfLanes(v);
}
template <class D>
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  return detail::MaxOfLanes(v);
}
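
// Usage sketch (illustrative, not part of this header): with a full descriptor
// of four uint32_t lanes,
//   const Full128<uint32_t> d;
//   const uint32_t total = ReduceSum(d, Iota(d, 1));  // 1+2+3+4 = 10
// while SumOfLanes(d, Iota(d, 1)) would broadcast 10 to every lane.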

// ------------------------------ Lt128

namespace detail {

// Returns vector-mask for Lt128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
#if HWY_IS_LITTLE_ENDIAN
  const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
  const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
#else
  // NOTE: Need to swap the halves of both a and b on big-endian targets: the
  // upper 64 bits of a and b are in lane 1 and the lower 64 bits in lane 0,
  // whereas the vec_cmplt operation below expects the upper 64 bits in lane 0
  // and the lower 64 bits in lane 1.
  const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
  const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
#endif
  return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
#else  // !HWY_PPC_HAVE_10
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL | out = cH | (=H & cL)
  //  0  0  0  0 |  0
  //  0  0  0  1 |  0
  //  0  0  1  0 |  1
  //  0  0  1  1 |  1
  //  0  1  0  0 |  0
  //  0  1  0  1 |  0
  //  0  1  1  0 |  1
  //  1  0  0  0 |  0
  //  1  0  0  1 |  1
  //  1  1  0  0 |  0
  const auto eqHL = Eq(a, b);
  const V ltHL = VecFromMask(d, Lt(a, b));
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
  return InterleaveUpper(d, vecHx, vecHx);
#endif
}

// Returns vector-mask for Eq128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
#endif
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
#endif
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
  const V ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
  const V eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
  const V neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128(D d, V a, V b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128(D d, V a, V b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, V a, V b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}
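
// Usage note (illustrative): Lt128 treats each pair of u64 lanes as one
// 128-bit value, with the upper 64 bits in the lane of higher index. Thus for
// a = {5, 1} and b = {0, 2} (listing lane 0 first), Lt128 returns true in both
// lanes of the pair because the upper halves compare as 1 < 2.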

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
  return V{vec_cntlz(v.raw)};
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}
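// Illustrative example (not part of the implementation): for a uint32_t lane
// holding 8 (binary 1000), LeadingZeroCount yields 28 and HighestSetBitIndex
// yields 31 - 28 = 3.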

#if HWY_PPC_HAVE_9
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return V{vec_vctz(v.raw)};
#else
  return V{vec_cnttz(v.raw)};
#endif
}
#else
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  const auto vi = BitCast(di, v);
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
                               Set(di, kNumOfBitsInT), bit_idx));
}
#endif
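
// Note on the pre-POWER9 fallback above: And(vi, Neg(vi)) isolates the lowest
// set bit, so its HighestSetBitIndex equals the trailing zero count. For an
// all-zero lane, HighestSetBitIndex is negative (sign bit set), which the
// IfThenElse replaces with the lane width. E.g. 12 (binary 1100) -> 2, and
// 0 -> 32 for uint32_t lanes.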

#undef HWY_PPC_HAVE_9
#undef HWY_PPC_HAVE_10

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();