libstdc++/api/a00767_source.html

// Implementation of <simd> -*- C++ -*-


// Copyright The GNU Toolchain Authors.

//

// This file is part of the GNU ISO C++ Library.  This library is free

// software; you can redistribute it and/or modify it under the

// terms of the GNU General Public License as published by the

// Free Software Foundation; either version 3, or (at your option)

// any later version.


// This library is distributed in the hope that it will be useful,

// but WITHOUT ANY WARRANTY; without even the implied warranty of

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

// GNU General Public License for more details.


// Under Section 7 of GPL version 3, you are granted additional

// permissions described in the GCC Runtime Library Exception, version

// 3.1, as published by the Free Software Foundation.


// You should have received a copy of the GNU General Public License and

// a copy of the GCC Runtime Library Exception along with this program;

// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see

// <http://www.gnu.org/licenses/>.


#ifndef _GLIBCXX_SIMD_DETAILS_H

#define _GLIBCXX_SIMD_DETAILS_H 1


#ifdef _GLIBCXX_SYSHDR

#pragma GCC system_header

#endif


#if __cplusplus >= 202400L


#include <bit>

#include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32

#include <bits/stl_function.h> // plus, minus, multiplies, ...

#include <bits/utility.h> // integer_sequence, etc.

#include <cmath> // for math_errhandling :(

#include <concepts>

#include <cstdint>

#include <limits>

#include <span> // for dynamic_extent


#if __CHAR_BIT__ != 8

// There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.

// Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).

#error "<simd> is not supported for CHAR_BIT != 8"

#endif


// psabi warnings are bogus because the ABI of the internal types never leaks into user code

#pragma GCC diagnostic push

#pragma GCC diagnostic ignored "-Wpsabi"


// _GLIBCXX_X86 is always defined: to 1 if either __x86_64__ or __i386__ is defined, and to 0
// otherwise. It can therefore be tested with a plain '#if _GLIBCXX_X86'.
#if defined __x86_64__ || defined __i386__
#define _GLIBCXX_X86 1
#else
#define _GLIBCXX_X86 0
#endif


#ifndef _GLIBCXX_SIMD_NOEXCEPT
/** @internal
 * For unit-testing preconditions, use this macro to remove noexcept.
 *
 * The definition below is only used if the macro has not been defined before this point; in that
 * case it expands to plain 'noexcept'.
 */
#define _GLIBCXX_SIMD_NOEXCEPT noexcept
#endif


// Two-level stringification:
// - _GLIBCXX_SIMD_TOSTRING_IMPL(x) turns its argument into a string literal exactly as spelled.
// - _GLIBCXX_SIMD_TOSTRING(x) macro-expands x first and stringifies the result.
#define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
#define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)


// This is used for unit-testing precondition checking.
// As defined here, only 'expr' is checked (by forwarding it to __glibcxx_assert); 'msg' and any
// additional arguments are accepted but not used.
#define __glibcxx_simd_precondition(expr, msg, ...)                                                \
  __glibcxx_assert(expr)


namespace std _GLIBCXX_VISIBILITY(default)

{

_GLIBCXX_BEGIN_NAMESPACE_VERSION


namespace simd

{

  // Primary template of the __iota variable template. Its initializer is an immediately invoked
  // lambda whose body is 'static_assert(false, ...)', so using the primary template is diagnosed
  // as an "invalid __iota specialization".
  // NOTE(review): the valid specializations are presumably defined elsewhere; none is visible in
  // this part of the file.
  template <typename _Tp>
    inline constexpr _Tp
    __iota = [] { static_assert(false, "invalid __iota specialization"); }();


  // [simd.general] vectorizable types

  // Satisfied if _Tp is cv-unqualified and is either
  //  - an integral type other than bool that is no larger than unsigned long long, or
  //  - a floating-point type that is no larger than double.
  // If __STDCPP_BFLOAT16_T__ is defined, __gnu_cxx::__bfloat16_t is additionally excluded.
  template <typename _Tp>
    concept __vectorizable_scalar
      = same_as<remove_cv_t<_Tp>, _Tp>
#ifdef __STDCPP_BFLOAT16_T__
          && !same_as<_Tp, __gnu_cxx::__bfloat16_t>
#endif
          && ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>)
                 || (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double)));


  // [simd.general] p2
  // The set of vectorizable types. As defined here it is exactly the set of types satisfying
  // __vectorizable_scalar.
  template <typename _Tp>
    concept __vectorizable = __vectorizable_scalar<_Tp>;


  /** @internal
   * Describes variants of _Abi.
   *
   * The enumerators are bit flags/bit groups. Their combined value (as the underlying unsigned
   * long long) is what _Abi receives as its third template argument.
   */
  enum class _AbiVariant : unsigned long long
  {
    _BitMask      = 0x01, // AVX512 bit-masks
    _MaskVariants = 0x0f, // vector masks if bits [0:3] are 0
  };


  /** @internal
   * Clear every bit of @p __in that is not set in at least one of the @p __to_keep values and
   * return the result.
   *
   * At least one @p __to_keep argument must be given: the unary fold over '|' has no value for an
   * empty pack.
   */
  consteval _AbiVariant
  __filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep)
  {
    using _Bits = underlying_type_t<_AbiVariant>;
    // union of all bits the caller wants to retain
    const _Bits __keep_mask = (static_cast<_Bits>(__to_keep) | ...);
    const _Bits __filtered = static_cast<_Bits>(__in) & __keep_mask;
    return static_cast<_AbiVariant>(__filtered);
  }


  /** @internal
   * Type used whenever no valid integer/value type exists.
   *
   * It is an empty tag type; e.g. __integer_from yields it when no signed integer type of the
   * requested size is found.
   */
  struct _InvalidInteger
  {};


  /** @internal
   * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
   *
   * The candidates are tried in the order signed char, short, int, long long and the first one
   * with a matching size is used ('long' is never produced). If none matches, the alias names
   * _InvalidInteger.
   *
   * The type is obtained as the return type of an immediately invoked consteval lambda, so that
   * the selection can be written as an if-constexpr chain.
   *
   * C++26 [simd.expos.defn]
   */
  template <size_t _Bytes>
    using __integer_from
      = decltype([] consteval {
          if constexpr (sizeof(signed char) == _Bytes)
            return static_cast<signed char>(0);
          else if constexpr (sizeof(signed short) == _Bytes)
            return static_cast<signed short>(0);
          else if constexpr (sizeof(signed int) == _Bytes)
            return static_cast<signed int>(0);
          else if constexpr (sizeof(signed long long) == _Bytes)
            return static_cast<signed long long>(0);
          else
            return _InvalidInteger();
        }());


  /** @internal
   * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
   *
   * Defined as the unsigned counterpart of __integer_from<_Bytes>; consequently @p _Bytes must be
   * a size for which __integer_from yields an integer type (not _InvalidInteger).
   */
  template <size_t _Bytes>
    using _UInt = make_unsigned_t<__integer_from<_Bytes>>;


  /** @internal
   * Divide @p __x by @p __y while rounding up instead of down.
   *
   * The result is computed as quotient plus one if there is a remainder. In contrast to the
   * textbook '(__x + __y - 1) / __y' this cannot overflow (or, for unsigned types, wrap around)
   * in an intermediate step when @p __x is close to the maximum of @p _Tp.
   *
   * @tparam _Tp  An integer type.
   *
   * Preconditions: __x >= 0 && __y > 0.
   */
  template <typename _Tp>
    consteval _Tp
    __div_ceil(_Tp __x, _Tp __y)
    { return __x / __y + (__x % __y != 0 ? 1 : 0); }


  /** @internal
   * Alias for an unsigned integer type that can store at least @p _NBits bits.
   *
   * @p _NBits is first rounded up to a power of two (__bit_ceil) and then converted to a byte
   * count (rounding up), which selects the _UInt specialization. @p _NBits must be positive and
   * must not exceed the number of bits of unsigned long long.
   */
  template <int _NBits>
    requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits)
    using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;


  /** @internal
   * Map a given type @p _Tp to an equivalent type.
   *
   * This helps with reducing the necessary branches and casts in the implementation as well as
   * reducing the number of template instantiations.
   *
   * The primary template maps every type to itself; the explicit and partial specializations
   * select a different canonical type.
   */
  template <typename _Tp>
    struct __canonical_vec_type
    { using type = _Tp; };

  // Shorthand for the nested 'type' of __canonical_vec_type<_Tp>.
  template <typename _Tp>
    using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;


// (unsigned) long: if long has the size of int it is mapped to (unsigned) int; otherwise, if it
// has the size of long long, it is mapped to (unsigned) long long. If neither size matches, long
// is left unchanged by the primary template.
#if __SIZEOF_INT__ == __SIZEOF_LONG__
  template <>
    struct __canonical_vec_type<long>
    { using type = int; };

  template <>
    struct __canonical_vec_type<unsigned long>
    { using type = unsigned int; };
#elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__
  template <>
    struct __canonical_vec_type<long>
    { using type = long long; };

  template <>
    struct __canonical_vec_type<unsigned long>
    { using type = unsigned long long; };
#endif

  // Enumeration types map to the canonical type of their underlying type.
  template <typename _Tp>
    requires std::is_enum_v<_Tp>
    struct __canonical_vec_type<_Tp>
    { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };

  // Plain char maps to unsigned char or signed char, depending on __CHAR_UNSIGNED__.
  template <>
    struct __canonical_vec_type<char>
#if __CHAR_UNSIGNED__
    { using type = unsigned char; };
#else
    { using type = signed char; };
#endif

  // The charN_t types map to unsigned char and the uint_leastN_t types, respectively.
  template <>
    struct __canonical_vec_type<char8_t>
    { using type = unsigned char; };

  template <>
    struct __canonical_vec_type<char16_t>
    { using type = uint_least16_t; };

  template <>
    struct __canonical_vec_type<char32_t>
    { using type = uint_least32_t; };

  // wchar_t maps to the signed or unsigned integer type of the same size, matching the signedness
  // of wchar_t.
  template <>
    struct __canonical_vec_type<wchar_t>
    {
      using type = std::__conditional_t<std::is_signed_v<wchar_t>,
                                        simd::__integer_from<sizeof(wchar_t)>,
                                        simd::_UInt<sizeof(wchar_t)>>;
    };

  // _Float64 maps to double if the type exists (__FLT64_DIG__ is defined) and
  // _GLIBCXX_DOUBLE_IS_IEEE_BINARY64 is defined.
#if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
  template <>
    struct __canonical_vec_type<_Float64>
    { using type = double; };
#endif

  // _Float32 maps to float if the type exists (__FLT32_DIG__ is defined) and
  // _GLIBCXX_FLOAT_IS_IEEE_BINARY32 is defined.
#if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
  template <>
    struct __canonical_vec_type<_Float32>
    { using type = float; };
#endif


  /** @internal
   * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
   * objects that store one bool data member per element.
   *
   * @tparam _Np   The number of elements, which also matches the number of data members in
   *               basic_vec and basic_mask.
   */
  template <int _Np = 1>
    struct _ScalarAbi
    {
      // number of elements
      static constexpr int _S_size = _Np;

      // one "register" (data member) per element
      static constexpr int _S_nreg = _Np;

      // no variant bits are set for the scalar ABI
      static constexpr _AbiVariant _S_variant = {};

      // storage type for one element of value-type _Tp
      template <typename _Tp>
        using _DataType = __canonical_vec_type_t<_Tp>;

      static constexpr bool _S_is_vecmask = false;

      // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask
      static constexpr bool _S_is_bitmask = false;

      // storage type for one mask element, independent of the element size
      template <size_t>
        using _MaskDataType = bool;

      // Return the scalar ABI tag for _N2 elements. The register count is not independent for
      // this ABI: _Nreg2 must equal _N2.
      template <int _N2, int _Nreg2 = _N2>
        static consteval _ScalarAbi<_N2>
        _S_resize()
        {
          static_assert(_N2 == _Nreg2);
          return {};
        }
    };


  /** @internal
   * This ABI tag describes basic_vec objects that store one or more objects declared with the
   * [[gnu::vector_size(N)]] attribute.
   * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
   * or bit-mask objects. Which one is used is determined via @p _Var.
   *
   * @tparam _Np    The number of elements.
   * @tparam _Nreg  The number of registers needed to store @p _Np elements.
   * @tparam _Var   Determines how complex value-types are laid out and whether mask types use
   *                bit-masks or vector-masks.
   */
  template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var>
    struct _Abi
    {
      static constexpr int _S_size = _Np;

      /** @internal
       * The number of registers needed to represent one basic_vec for the element type that was
       * used on ABI deduction.
       *
       * Examples:
       * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
       * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
       * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
       * - '_Abi<10, 1>' for 'int' is 1x 512-bit
       * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
       */
      static constexpr int _S_nreg = _Nreg;

      static_assert(_S_size > 0);
      static_assert(_S_nreg > 0);

      // _Var reinterpreted as the _AbiVariant bit set
      static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);

      // true iff the _BitMask bit is set in _S_variant
      static constexpr bool _S_is_bitmask
        = __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask;

      static constexpr bool _S_is_vecmask = !_S_is_bitmask;

      // Storage type of a basic_vec with value-type _Tp. Only usable with a single register
      // (_S_nreg == 1): the canonical element type itself for one element, otherwise a
      // vector_size vector of the canonical element type with the element count rounded up to a
      // power of two.
      template <typename _Tp>
        using _DataType = decltype([] {
                            static_assert(_S_nreg == 1);
                            if constexpr (_S_size == 1)
                              return __canonical_vec_type_t<_Tp>();
                            else
                              {
                                constexpr int __n = __bit_ceil(unsigned(_S_size));
                                using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
                                  = __canonical_vec_type_t<_Tp>;
                                return _Vp();
                              }
                          }());

      // Storage type of a basic_mask for elements of size _Bytes. Only usable with a single
      // register (_S_nreg == 1):
      // - bool for a single element,
      // - for vector-masks: a vector_size vector of the signed integer type of size _Bytes, with
      //   the element count rounded up to a power of two,
      // - for bit-masks: the unsigned integer type _Bitmask<_S_size>.
      template <size_t _Bytes>
        using _MaskDataType
          = decltype([] {
              static_assert(_S_nreg == 1);
              if constexpr (_S_size == 1)
                return bool();
              else if constexpr (_S_is_vecmask)
                {
                  constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
                  using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
                  return _Vp();
                }
              else if constexpr (_Nreg > 1)
                return _InvalidInteger();
              else
                return _Bitmask<_S_size>();
            }());

      // Return an _Abi tag with the same variant bits for _N2 elements. By default the register
      // count is _N2 divided by the current element count, rounded up. A single element always
      // uses exactly one register.
      template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
        static consteval auto
        _S_resize()
        {
          if constexpr (_N2 == 1)
            return _Abi<1, 1, _Var>();
          else
            return _Abi<_N2, _Nreg2, _Var>();
        }
    };


  /** @internal
   * Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer
   * value.
   *
   * Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an
   * alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the
   * template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int,
   * std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot
   * longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As
   * soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans
   * anyway.
   *
   * @tparam _Np    The number of elements.
   * @tparam _Nreg  The number of registers.
   * @tparam _Vs    Zero or more variants; their underlying values are OR'ed together (the result
   *                is 0 if none is given).
   */
  template <int _Np, int _Nreg, _AbiVariant... _Vs>
    using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>;


  /** @internal
   * This type is used whenever ABI tag deduction can't give a useful answer.
   *
   * Its _S_size is 0, which is how the deduction code in this file recognizes it.
   */
  struct _InvalidAbi
  { static constexpr int _S_size = 0; };


  /** @internal
   * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
   * for an enabled basic_vec/basic_mask specialization.
   *
   * Requirements checked:
   * - _Tp::_S_variant is a 'const _AbiVariant',
   * - 1 <= _Tp::_S_nreg <= _Tp::_S_size,
   * - resizing to the tag's own size and register count yields _Tp again.
   */
  template <typename _Tp>
    concept __abi_tag
      = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
          && (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
          && requires(_Tp __x) {
            { __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
          };


  // Satisfied if _Tp is a specialization of _ScalarAbi (and thus also an ABI tag).
  template <typename _Tp>
    concept __scalar_abi_tag
      = same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;


  // Determine if math functions must *raise* floating-point exceptions.
  // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
  // need to be considered. A conforming C library must define math_errhandling, but in case it
  // isn't defined we simply use the fallback.
  //
  // Two overloads are used for the selection: a call with an int argument prefers the (int)
  // overload, which only participates if 'math_errhandling & MATH_ERREXCEPT' is usable as a
  // template argument (i.e. is a constant expression). Otherwise the (float) overload is chosen.
#ifdef math_errhandling
  template <int = 0>
    requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
    consteval bool
    __handle_fpexcept_impl(int)
    { return 0 != (math_errhandling & MATH_ERREXCEPT); }
#endif

  // Fallback if math_errhandling doesn't work: implement correct exception behavior.
  consteval bool
  __handle_fpexcept_impl(float)
  { return true; }


  /** @internal
   * This type can be used as a template parameter for avoiding ODR violations, where code needs to
   * differ depending on optimization flags (mostly fp-math related).
   *
   * The flags are collected in _M_build_flags, whose default member initializer is assembled from
   * predefined macros. Bit layout:
   * -  0: trapping math (__NO_TRAPPING_MATH__ is not set)
   * -  1: __FAST_MATH__
   * -  2: __FINITE_MATH_ONLY__
   * -  3: __NO_SIGNED_ZEROS__
   * -  4: __RECIPROCAL_MATH__
   * -  5: __NO_MATH_ERRNO__
   * -  6: __ASSOCIATIVE_MATH__
   * -  7-9: reserved for __FLT_EVAL_METHOD__ (0, 1, 2 as is; 3 for any other value)
   * - 10: C Annex G (complex) support is assumed
   * - 11: __SUPPORT_SNAN__
   * - 12: math functions must raise floating-point exceptions (__handle_fpexcept_impl)
   */
  struct _OptTraits
  {
    // true iff bit number __bit is set in _M_build_flags
    consteval bool
    _M_test(int __bit) const
    { return ((_M_build_flags >> __bit) & 1) == 1; }

    // true iff floating-point operations can signal an exception (allow non-default handler)
    consteval bool
    _M_fp_may_signal() const
    { return _M_test(0); }

    // true iff floating-point operations can raise an exception flag
    consteval bool
    _M_fp_may_raise() const
    { return _M_test(12); }

    consteval bool
    _M_fast_math() const
    { return _M_test(1); }

    consteval bool
    _M_finite_math_only() const
    { return _M_test(2); }

    consteval bool
    _M_no_signed_zeros() const
    { return _M_test(3); }

    // negation of _M_no_signed_zeros()
    consteval bool
    _M_signed_zeros() const
    { return !_M_test(3); }

    consteval bool
    _M_reciprocal_math() const
    { return _M_test(4); }

    consteval bool
    _M_no_math_errno() const
    { return _M_test(5); }

    // negation of _M_no_math_errno()
    consteval bool
    _M_math_errno() const
    { return !_M_test(5); }

    consteval bool
    _M_associative_math() const
    { return _M_test(6); }

    // Annex G conformance additionally requires that finite-math-only is not in effect
    consteval bool
    _M_conforming_to_STDC_annex_G() const
    { return _M_test(10) && !_M_finite_math_only(); }

    consteval bool
    _M_support_snan() const
    { return _M_test(11); }

    __UINT64_TYPE__ _M_build_flags
      = 0
#if !__NO_TRAPPING_MATH__
          + (1 << 0)
#endif
          + (__handle_fpexcept_impl(0) << 12)
#if __FAST_MATH__
          + (1 << 1)
#endif
#if __FINITE_MATH_ONLY__
          + (1 << 2)
#endif
#if __NO_SIGNED_ZEROS__
          + (1 << 3)
#endif
#if __RECIPROCAL_MATH__
          + (1 << 4)
#endif
#if __NO_MATH_ERRNO__
          + (1 << 5)
#endif
#if __ASSOCIATIVE_MATH__
          + (1 << 6)
#endif
        // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
#if __FLT_EVAL_METHOD__ == 1
          + (1 << 7)
#elif __FLT_EVAL_METHOD__ == 2
          + (2 << 7)
#elif __FLT_EVAL_METHOD__ != 0
          + (3 << 7)
#endif

        // C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
        // __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
        // will do so as well. However, Clang never defines the macro.
#if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG
          + (1 << 10)
#endif
#if __SUPPORT_SNAN__
          + (1 << 11)
#endif
        ;
  };


  /** @internal
   * Compare the C string @p __s against the one-character string "1".
   *
   * @return true iff @p __s is non-null, its first character is '1', and the string ends right
   *         after it.
   */
  consteval bool
  __streq_to_1(const char* __s)
  {
    if (__s == nullptr)
      return false;
    if (__s[0] != '1')
      return false;
    // exactly one character: the terminator must follow immediately
    return __s[1] == '\0';
  }


  /** @internal
   * If the macro given as @p feat is defined to 1, expands to a bit set at position @p off.
   * Otherwise, expand to zero.
   *
   * @p feat is macro-expanded before it is stringified: a feature macro defined to 1 yields the
   * string "1", whereas an undefined macro yields its own name. The string is then compared
   * against "1" at compile time.
   *
   * @p off is parenthesized in the expansion so that an arbitrary expression can be passed as bit
   * position without being regrouped by operator precedence.
   */
#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat)                                                         \
  (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat)))       \
     << (off))


#if _GLIBCXX_X86


// Braced initializer for _ArchTraits::_M_flags on x86: one bit per ISA extension. A bit is set
// iff the corresponding feature macro is defined to 1 for the current translation unit. The bit
// positions used here must match the _M_have_* accessors of _ArchTraits.
#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT {                      \
  _GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__)                         \
    | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__)                    \
    | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__)                  \
    | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__)                 \
    | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__)                 \
    | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__)                 \
    | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__)                    \
    | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__)                    \
    | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__)                  \
    | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__)                    \
    | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__)                \
    | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__)               \
    | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__)               \
    | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__)               \
    | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__)               \
    | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__)           \
    | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__)             \
    | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__)            \
    | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__)             \
    | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__)             \
    | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__)        \
    | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__)             \
    | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__)             \
    | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__)                \
    | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__)           \
    | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__)                \
    | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__)            \
    | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__)           \
    | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__)                \
    | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__)                \
    | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__)     \
    | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__)                  \
    | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__)                   \
    | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__)                    \
  }
  // Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
  // no ODR issue? The same could be said about several other flags above that are not checked
  // anywhere.


  /** @internal
   * Compile-time description of the x86 ISA extensions that are enabled for the current
   * translation unit.
   *
   * _M_flags holds one bit per extension (initialized via _GLIBCXX_SIMD_ARCH_TRAITS_INIT); every
   * _M_have_* function tests the bit of the extension it is named after.
   */
  struct _ArchTraits
  {
    __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;

    // true iff bit number __bit is set in _M_flags
    consteval bool
    _M_test(int __bit) const
    { return ((_M_flags >> __bit) & 1) == 1; }

    consteval bool
    _M_have_mmx() const
    { return _M_test(0); }

    consteval bool
    _M_have_sse() const
    { return _M_test(1); }

    consteval bool
    _M_have_sse2() const
    { return _M_test(2); }

    consteval bool
    _M_have_sse3() const
    { return _M_test(3); }

    consteval bool
    _M_have_ssse3() const
    { return _M_test(4); }

    consteval bool
    _M_have_sse4_1() const
    { return _M_test(5); }

    consteval bool
    _M_have_sse4_2() const
    { return _M_test(6); }

    consteval bool
    _M_have_popcnt() const
    { return _M_test(7); }

    consteval bool
    _M_have_avx() const
    { return _M_test(8); }

    consteval bool
    _M_have_f16c() const
    { return _M_test(9); }

    consteval bool
    _M_have_bmi() const
    { return _M_test(10); }

    consteval bool
    _M_have_bmi2() const
    { return _M_test(11); }

    consteval bool
    _M_have_lzcnt() const
    { return _M_test(12); }

    consteval bool
    _M_have_avx2() const
    { return _M_test(13); }

    consteval bool
    _M_have_fma() const
    { return _M_test(14); }

    consteval bool
    _M_have_avx512f() const
    { return _M_test(15); }

    consteval bool
    _M_have_avx512cd() const
    { return _M_test(16); }

    consteval bool
    _M_have_avx512dq() const
    { return _M_test(17); }

    consteval bool
    _M_have_avx512bw() const
    { return _M_test(18); }

    consteval bool
    _M_have_avx512vl() const
    { return _M_test(19); }

    consteval bool
    _M_have_avx512bitalg() const
    { return _M_test(20); }

    consteval bool
    _M_have_avx512vbmi() const
    { return _M_test(21); }

    consteval bool
    _M_have_avx512vbmi2() const
    { return _M_test(22); }

    consteval bool
    _M_have_avx512ifma() const
    { return _M_test(23); }

    consteval bool
    _M_have_avx512vnni() const
    { return _M_test(24); }

    consteval bool
    _M_have_avx512vpopcntdq() const
    { return _M_test(25); }

    consteval bool
    _M_have_avx512fp16() const
    { return _M_test(26); }

    consteval bool
    _M_have_avx512bf16() const
    { return _M_test(27); }

    consteval bool
    _M_have_avxifma() const
    { return _M_test(28); }

    consteval bool
    _M_have_avxneconvert() const
    { return _M_test(29); }

    consteval bool
    _M_have_avxvnni() const
    { return _M_test(30); }

    consteval bool
    _M_have_avxvnniint8() const
    { return _M_test(31); }

    consteval bool
    _M_have_avxvnniint16() const
    { return _M_test(32); }

    consteval bool
    _M_have_avx10_1() const
    { return _M_test(33); }

    consteval bool
    _M_have_avx10_2() const
    { return _M_test(34); }

    consteval bool
    _M_have_avx512vp2intersect() const
    { return _M_test(35); }

    consteval bool
    _M_have_sse4a() const
    { return _M_test(36); }

    consteval bool
    _M_have_fma4() const
    { return _M_test(37); }

    consteval bool
    _M_have_xop() const
    { return _M_test(38); }

    // true iff _Tp is _Float16 and AVX512-FP16 is not available
    template <typename _Tp>
      consteval bool
      _M_eval_as_f32() const
      { return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); }
  };


  /** @internal
   * Determine the native ABI tag for value-type @p _Tp on x86, i.e. the tag describing a single
   * register of the widest vector width that the enabled ISA extensions provide for @p _Tp.
   *
   * @tparam _Tp      The value-type. If it is not vectorizable the result is _InvalidAbi.
   * @tparam _Traits  The enabled ISA extensions.
   *
   * Selection order:
   * - AVX512-FP16: 64 bytes, bit-masks
   * - AVX512F: 64 bytes, bit-masks
   * - _Float16 without F16C: scalar
   * - AVX2, or AVX for floating-point types: 32 bytes
   * - SSE2, or SSE for float-sized floating-point types: 16 bytes
   * - otherwise: scalar
   *
   * Except in the AVX512-FP16 case, _Float16 is counted with twice its size, which halves the
   * resulting element count.
   */
  template <typename _Tp, _ArchTraits _Traits = {}>
    consteval auto
    __native_abi()
    {
      if constexpr (!__vectorizable<_Tp>)
        return _InvalidAbi();
      else
        {
          // Computed only after the vectorizable check, so that sizeof is never applied to void
          // or an incomplete type (those must yield _InvalidAbi above instead of an error).
          constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
          if constexpr (_Traits._M_have_avx512fp16())
            return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
          else if constexpr (_Traits._M_have_avx512f())
            return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
          else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c())
            return _ScalarAbi<1>();
          else if constexpr (_Traits._M_have_avx2())
            return _Abi_t<32 / __adj_sizeof, 1>();
          else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>)
            return _Abi_t<32 / __adj_sizeof, 1>();
          else if constexpr (_Traits._M_have_sse2())
            return _Abi_t<16 / __adj_sizeof, 1>();
          else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp>
                               && sizeof(_Tp) == sizeof(float))
            return _Abi_t<16 / __adj_sizeof, 1>();
          // no MMX: we can't emit EMMS where it would be necessary
          else
            return _ScalarAbi<1>();
        }
    }


#else


  // Scalar fallback for targets without architecture-specific support: no feature bit is set by
  // default.
  struct _ArchTraits
  {
    __UINT64_TYPE__ _M_flags = 0;

    // Report whether bit number @p __bit of _M_flags is set.
    constexpr bool
    _M_test(int __bit) const
    {
      const __UINT64_TYPE__ __shifted = _M_flags >> __bit;
      return (__shifted & 1) != 0;
    }
  };


  // Native ABI tag without architecture-specific support: a single-element scalar ABI for every
  // vectorizable type, _InvalidAbi for everything else.
  template <typename _Tp>
    consteval auto
    __native_abi()
    {
      if constexpr (__vectorizable<_Tp>)
        return _ScalarAbi<1>();
      else
        return _InvalidAbi();
    }


#endif


  /** @internal
   * You must use this type as template argument to function templates that are not declared
   * always_inline (to avoid issues when linking code compiled with different compiler flags).
   *
   * It combines the architecture flags (_ArchTraits) and the optimization flags (_OptTraits) in
   * one type by deriving from both.
   */
  struct _TargetTraits
  : _ArchTraits, _OptTraits
  {};


  /** @internal
   * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
   * optimal width.
   *
   * @tparam _Tp  A vectorizable type.
   *
   * C++26 [simd.expos.abi]
   */
  template <typename _Tp>
    using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());


  /** @internal
   * Deduce the ABI tag for @p _Np elements of value-type @p _Tp.
   *
   * Yields _InvalidAbi if @p _Np is not positive or if there is no native ABI for @p _Tp.
   * Otherwise the native ABI tag is used as is if it already holds @p _Np elements, and is
   * resized to @p _Np elements if not.
   */
  template <typename _Tp, int _Np, _TargetTraits _Target = {}>
    consteval auto
    __deduce_abi()
    {
      using _Native = decltype(std::simd::__native_abi<_Tp>());
      constexpr int __native_size = _Native::_S_size;
      if constexpr (_Np <= 0 || __native_size == 0)
        return _InvalidAbi();
      else if constexpr (_Np != __native_size)
        return _Native::template _S_resize<_Np>();
      else
        return _Native();
    }


  /** @internal
   * Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
   *
   * The alias names _InvalidAbi if __deduce_abi cannot provide such a tag (e.g. @p _Np <= 0).
   *
   * C++26 [simd.expos.abi]
   */
  template <typename _Tp, int _Np>
    using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());


  /** @internal
   * \c rebind implementation detail for basic_vec, and basic_mask where we know the destination
   * value-type
   *
   * @tparam _Tp  The destination value-type.
   * @tparam _Np  The destination number of elements.
   * @tparam _A0  The ABI tag to rebind from.
   *
   * Result:
   * - _InvalidAbi if @p _Np is not positive or @p _Tp is not vectorizable,
   * - a scalar ABI tag with @p _Np elements if @p _A0 or the native ABI of @p _Tp is scalar,
   * - otherwise an _Abi tag built from the native register size of @p _Tp and the mask-variant
   *   bits of @p _A0, resized to @p _Np elements.
   */
  template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
    consteval auto
    __abi_rebind()
    {
      if constexpr (_Np <= 0 || !__vectorizable<_Tp>)
        return _InvalidAbi();

      else if constexpr (__scalar_abi_tag<_A0>)
        return _A0::template _S_resize<_Np>();

      else
        {
          using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>;
          static_assert(0 != _Native::_S_size);
          // number of native registers needed for _Np elements
          constexpr int __nreg = __div_ceil(_Np, _Native::_S_size);

          if constexpr (__scalar_abi_tag<_Native>)
            return _Native::template _S_resize<_Np>();
          else
            // keep only the mask-variant bits of _A0
            return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant,
                                                                    _AbiVariant::_MaskVariants)
                         >::template _S_resize<_Np, __nreg>();
        }
    }


  /** @internal
   * @c rebind implementation detail for basic_mask.
   *
   * The important difference here is that we have no information about the actual value-type other
   * than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t.
   * E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x
   * vector(2) long long`.
   * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
   * value-type doesn't change.
   *
   * @tparam _Bytes         The size of the destination value-type.
   * @tparam _Np            The destination number of elements.
   * @tparam _A0            The ABI tag to rebind from.
   * @tparam _IsOnlyResize  True if the value-type does not change.
   * @tparam _Traits        The enabled ISA extensions.
   *
   * Unless one of the special cases below applies, the result is the same as rebinding to the
   * signed integer type of size @p _Bytes.
   */
  template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
    consteval auto
    __abi_rebind()
    {
      if constexpr (_Bytes == 0 || _Np <= 0)
        return _InvalidAbi();

      else if constexpr (__scalar_abi_tag<_A0>)
        return _A0::template _S_resize<_Np>();

#if _GLIBCXX_X86
      // AVX w/o AVX2:
      // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
      // We determine whether _A0 identifies an AVX vector by looking at the size of a native
      // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
      else if constexpr (_IsOnlyResize
                           && _Traits._M_have_avx() && !_Traits._M_have_avx2()
                           && __bit_ceil(__div_ceil<unsigned>(
                                            _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
        {
          // rebind via the floating-point type of the given size
          if constexpr (_Bytes == sizeof(double))
            return __abi_rebind<double, _Np, _A0>();
          else if constexpr (_Bytes == sizeof(float))
            return __abi_rebind<float, _Np, _A0>();
          else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16))
            return __abi_rebind<_Float16, _Np, _A0>();
          else // impossible
            static_assert(false);
        }
#endif

      else
        return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
    }


  /** @internal
   * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
   *
   * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
   * compile on basically every other target, though. This is due to the difference in ABI tag:
   * _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers).
   * I know how to define this function for libstdc++ to avoid interconvertible masks. The question
   * is whether we can specify this in general for C++29.
   *
   * Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as
   *   deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this
   *   makes the conversion behavior dependent on compiler flags. Probably not what we want.
   *
   * @tparam _To    The ABI tag of the destination mask.
   * @tparam _From  The ABI tag of the source mask; must have the same number of elements.
   * @param __b0    NOTE(review): presumably the element size (in bytes) of one of the two masks
   *                - confirm against the callers.
   * @param __b1    NOTE(review): presumably the element size (in bytes) of the other mask.
   *
   * If _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined, the conversion is explicit if
   * @p __b0 and @p __b1 differ; otherwise the decision depends on the two ABI tags (see the
   * comments in the function body).
   */
  template <typename _To, typename _From>
  consteval bool
    __is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1)
    {
      constexpr int __n = _To::_S_size;
      static_assert(__n == _From::_S_size);
#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
      /// C++26 [simd.mask.ctor] uses unconditional explicit
      return true;
#else
      if (__b0 != __b1)
        return true;

      // everything is better than _ScalarAbi, except when converting to a single bool
      if constexpr (__scalar_abi_tag<_To>)
        return __n > 1;
      else if constexpr (__scalar_abi_tag<_From>)
        return true;

      // converting to a bit-mask is better
      else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask)
        return _To::_S_is_vecmask; // to vector-mask is explicit

      // with vec-masks, fewer registers is better
      else if constexpr (_From::_S_nreg != _To::_S_nreg)
        return _From::_S_nreg < _To::_S_nreg;

      else
        __builtin_unreachable();
#endif
    }


  /** @internal
   * An alias for a signed integer type; used in this file as the type of element counts
   * (e.g. the @c _Np parameter of @c vec and @c mask) and of bit indices.
   *
   * libstdc++ unconditionally uses @c int here, since it matches the return type of
   * 'Bit Operation Builtins' in GCC.
   *
   * C++26 [simd.expos.defn]
   */

  using __simd_size_type = int;


  // integral_constant shortcut: __simd_size_c<_Xp> is a constant object of type
  // integral_constant<__simd_size_type, _Xp>, i.e. the value _Xp is encoded in its type.

  template <__simd_size_type _Xp>

    inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c = {};


  // [simd.syn]
  // Forward declarations of the two data-parallel class templates and their size-based aliases.

  // basic_vec: element type _Tp and ABI tag _Ap (defaults to the native ABI tag for _Tp).
  template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>

    class basic_vec;


  // vec<_Tp, _Np>: the basic_vec whose ABI tag is deduced from the element type and the element
  // count _Np; _Np defaults to the size of the native ABI for _Tp.
  template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>

    using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;


  // basic_mask: identified by the element size in bytes rather than by an element type; the ABI
  // tag defaults to the native ABI tag of __integer_from<_Bytes>.
  template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>

    class basic_mask;


  // mask<_Tp, _Np>: the basic_mask for elements of sizeof(_Tp) bytes with the ABI tag deduced
  // from _Tp and _Np.
  template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>

    using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;


  // [simd.ctor] load constructor constraints
  //
  // Satisfied if _Tp is a sized_range whose ranges::size is usable as a template argument (i.e.
  // in a constant expression). If _Np is given (anything but the default -1uz), the size must
  // additionally be equal to _Np.

  template <typename _Tp, size_t _Np = -1uz>

    concept __static_sized_range

      = ranges::sized_range<_Tp> && requires(_Tp&& __r) {

        typename integral_constant<size_t, ranges::size(__r)>;

        requires (_Np == -1uz || ranges::size(__r) == _Np);

      };


  /** @internal
   * Yields the size of @p __r if @c ranges::size(__r) is usable in a constant expression and
   * @c dynamic_extent otherwise.
   */
  template <typename _Rg>
    consteval size_t
    __static_range_size(_Rg& __r)
    {
      // The size counts as static if it can be used as a template argument.
      if constexpr (!requires { typename integral_constant<size_t, ranges::size(__r)>; })
        return dynamic_extent;
      else
        return ranges::size(__r);
    }


  // [simd.general] value-preserving
  //
  // Satisfied if _From and _To are arithmetic types, _From converts implicitly to _To, the
  // conversion is not from a signed to an unsigned type, and _To has at least as many digits as
  // _From and covers the range [lowest(), max()] of _From.

  template <typename _From, typename _To>

    concept __arithmetic_only_value_preserving_convertible_to

      = convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>

          && !(is_signed_v<_From> && is_unsigned_v<_To>)

          && numeric_limits<_From>::digits <= numeric_limits<_To>::digits

          && numeric_limits<_From>::max() <= numeric_limits<_To>::max()

          && numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();


  /** @internal
   * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
   *
   * As written here, this is exactly the arithmetic-only check
   * (@c __arithmetic_only_value_preserving_convertible_to); non-arithmetic types never satisfy it.
   *
   * C++26 [simd.general]
   */

  template <typename _From, typename _To>

    concept __value_preserving_convertible_to

      = __arithmetic_only_value_preserving_convertible_to<_From, _To>;


  // LWG4420
  //
  // Satisfied if static_cast<_To> applied to an expression of type _From (as produced by
  // declval<_From>()) is well-formed.

  template <typename _From, typename _To>

    concept __explicitly_convertible_to = requires {

      static_cast<_To>(declval<_From>());

    };


  /** @internal
   * Satisfied if @p _Tp behaves like a wrapper around the constant @c _Tp::value: it converts
   * implicitly to the type of @c _Tp::value, is equality comparable with that type, and a
   * value-initialized @p _Tp compares equal to @c _Tp::value in a constant expression (both
   * directly and after an explicit conversion to the value's type).
   *
   * C++26 [simd.expos]
   */

  template<typename _Tp>

    concept __constexpr_wrapper_like

      = convertible_to<_Tp, decltype(_Tp::value)>

          && equality_comparable_with<_Tp, decltype(_Tp::value)>

          && bool_constant<_Tp() == _Tp::value>::value

          && bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;


  // [simd.ctor] explicit(...) of broadcast ctor
  //
  // Satisfied if the constant _From has arithmetic type and converting it to _To does not lose
  // information: the value survives the round trip through _To, it is not negative when _To is
  // an unsigned integer type, and it lies within [lowest(), max()] of _To.

  template <auto _From, typename _To>

    concept __non_narrowing_constexpr_conversion

      = is_arithmetic_v<decltype(_From)>

          && static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From

          && !(unsigned_integral<_To> && _From < decltype(_From)())

          && _From <= std::numeric_limits<_To>::max()

          && _From >= std::numeric_limits<_To>::lowest();


  // [simd.ctor] p4

  // This implements LWG4436 (submitted on 2025-10-28)
  //
  // Satisfied if one of the following holds (cv-ref qualifiers of _From are ignored where
  // remove_cvref_t is applied):
  //   4.1 _From is neither arithmetic nor constexpr-wrapper-like and converts implicitly to _To,
  //   4.2 the conversion from _From to _To is value-preserving,
  //   4.3 _From is constexpr-wrapper-like and its wrapped constant converts to _To without
  //       narrowing.

  template <typename _From, typename _To>

    concept __broadcast_constructible

      = ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>>

            && !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1

           || __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2

           || (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3

                 && __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value),

                                                         _To>));


  /** @internal
   * Returns true if @p _From and @p _To are two different floating-point types and @p _From is
   * their common type.
   *
   * Note that identical types yield false: equal types do not count as "higher rank".
   */
  template <typename _From, typename _To>
    consteval bool
    __higher_floating_point_rank_than()
    {
      constexpr bool __both_floating = floating_point<_From> && floating_point<_To>;
      constexpr bool __from_is_common = is_same_v<common_type_t<_From, _To>, _From>;
      constexpr bool __distinct = !is_same_v<_From, _To>;
      return __both_floating && __from_is_common && __distinct;
    }


  /** @internal
   * Returns true if @p _From and @p _To are two different integral types and @p _From is either
   * larger (by @c sizeof) than @p _To or is the common type of the two.
   *
   * Note that identical types yield false: equal types do not count as "higher rank".
   */
  template <typename _From, typename _To>
    consteval bool
    __higher_integer_rank_than()
    {
      constexpr bool __both_integral = integral<_From> && integral<_To>;
      constexpr bool __larger_or_common
        = sizeof(_From) > sizeof(_To) || is_same_v<common_type_t<_From, _To>, _From>;
      constexpr bool __distinct = !is_same_v<_From, _To>;
      return __both_integral && __larger_or_common && __distinct;
    }


  // Satisfied if _From has a higher rank than _To according to either the floating-point or the
  // integer rank helper. Both helpers return false for identical types.
  template <typename _From, typename _To>

    concept __higher_rank_than

      = __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>();


  // Forward declaration only; the type is used below as a tag that is searched for in a pack of
  // flag types.
  struct __convert_flag;


  // Satisfied if a load/store may convert between _From and _To:
  //   - the types are the same, or
  //   - both are vectorizable and the conversion is value-preserving, or
  //   - both are vectorizable, _From is explicitly convertible to _To, and __convert_flag is one
  //     of the types in _Flags.
  template <typename _From, typename _To, typename... _Flags>

    concept __loadstore_convertible_to

      = same_as<_From, _To>

          || (__vectorizable<_From> && __vectorizable<_To>

                && (__value_preserving_convertible_to<_From, _To>

                       || (__explicitly_convertible_to<_From, _To>

                             && (std::is_same_v<_Flags, __convert_flag> || ...))));


  // Satisfied if _From converts implicitly to _To and, in case _From is an arithmetic type, the
  // conversion is also value-preserving. Non-arithmetic _From only needs the implicit conversion.
  template <typename _From, typename _To>

    concept __simd_generator_convertible_to

      = std::convertible_to<_From, _To>

          && (!is_arithmetic_v<_From> || __simd_generator_convertible_to<_From, _To>);


  // Declaration-only helper (never defined): it is viable only if, for every index in _Is, the
  // result of calling _Fp with __simd_size_c<index> satisfies
  // __simd_generator_convertible_to<..., _Tp>.
  template <typename _Fp, typename _Tp, __simd_size_type... _Is>

    requires (__simd_generator_convertible_to<

                decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...)

    constexpr void

    __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);


  // Satisfied if the helper above is callable for the index sequence 0, ..., _Np - 1, i.e. if _Fp
  // can be called with every index constant and each result is acceptable for element type _Tp.
  template <typename _Fp, typename _Tp, __simd_size_type _Np>

    concept __simd_generator_invokable = requires {

      __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());

    };


  // Satisfied if a const _Fp can be called with two int arguments and returns an integral type.
  // (Judging by the name, the second argument presumably is a size - confirm at the call sites.)
  template <typename _Fp>

    concept __index_permutation_function_sized = requires(_Fp const& __f)

      {

        { __f(0, 0) } -> std::integral;

      };


  // Satisfied if a const _Fp can be called with either two int arguments (see above) or one int
  // argument, returning an integral type. Note that _Simd is not used by the constraint itself.
  template <typename _Fp, typename _Simd>

    concept __index_permutation_function

      = __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) {

        { __f(0) } -> std::integral;

      };


  /** @internal
   * The value of the @c _Bytes template argument to a @c basic_mask specialization.
   *
   * The primary template yields 0 for every type that is not a @c basic_mask specialization.
   *
   * C++26 [simd.expos.defn]
   */

  template <typename _Tp>

    constexpr size_t __mask_element_size = 0;


  // Partial specialization: extracts _Bytes from basic_mask<_Bytes, _Ap>.
  template <size_t _Bytes, __abi_tag _Ap>

    constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes;


  // [simd.expos]

  // Satisfied if _Vp is exactly the basic_vec specialization named by its own value_type and
  // abi_type members, and is default constructible.
  template <typename _Vp>

    concept __simd_vec_type

      = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>

          && is_default_constructible_v<_Vp>;


  // Satisfied if _Vp is exactly the basic_mask specialization named by __mask_element_size<_Vp>
  // and its own abi_type member, and is default constructible.
  template <typename _Vp>

    concept __simd_mask_type

      = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>

        && is_default_constructible_v<_Vp>;


  /** @internal
   * Satisfied if @p _Vp is a data-parallel type, i.e. it satisfies @c __simd_vec_type or
   * @c __simd_mask_type.
   */

  template <typename _Vp>

    concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>;


  // Satisfied if _Vp is a vec type (__simd_vec_type) whose value_type is a floating-point type.
  template <typename _Vp>

    concept __simd_floating_point

      = __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>;


  // Satisfied if _Vp is a vec type (__simd_vec_type) whose value_type is an integral type.
  template <typename _Vp>

    concept __simd_integral

      = __simd_vec_type<_Vp> && integral<typename _Vp::value_type>;


  // Satisfied if adding two const _Tp lvalues yields a vec type (__simd_vec_type).
  template <typename _Tp>

    concept __converts_to_vec

      = __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>;


  // The vec type that results from adding two const _Tp lvalues.
  template <__converts_to_vec _Tp>

    using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>());


  // Maps _Tp to a vec type: let _Up be the type of adding two const _Tp lvalues. If _Up already
  // is a vec type the result is _Up, otherwise it is vec<_Up, _Vp::size()>, i.e. a vec of _Up
  // with as many elements as _Vp. The lambda is never called; only its return type is used.
  template <typename _Vp, typename _Tp>

    using __make_compatible_simd_t

      = decltype([] {

          using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());

          if constexpr (__simd_vec_type<_Up>)

            return _Up();

          else

            return vec<_Up, _Vp::size()>();

      }());


  // Satisfied if the vec type deduced from _Tp (__deduced_vec_t) has a floating-point value_type.
  template <typename _Tp>

    concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>;


  // Satisfied if a const _BinaryOperation can be called with two const vec<_Tp, 1> arguments and
  // the result type is exactly vec<_Tp, 1>.
  template <typename _BinaryOperation, typename _Tp>

    concept __reduction_binary_operation

      = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {

        { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;

      };


  /** @internal
   * Returns the highest index @c i where `(__bits >> i) & 1` equals @c 1.
   *
   * The result is the index of the most significant bit of the argument's type minus the number
   * of leading zero bits of @p __bits.
   */
  [[__gnu__::__always_inline__]]
  constexpr __simd_size_type
  __highest_bit(std::unsigned_integral auto __bits)
  {
    using _Bits = decltype(__bits);
    constexpr int __msb_index = __gnu_cxx::__int_traits<_Bits>::__digits - 1;
    return __msb_index - __countl_zero(__bits);
  }


  // The basic_mask for elements of sizeof(_Tp) bytes whose ABI tag is the type returned by
  // __abi_rebind<_Tp, _Np, _Ap>().
  template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>

    using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;


  // The basic_vec of _Tp whose ABI tag is the type returned by __abi_rebind<_Tp, _Np, _Ap>().
  // _Tp is deliberately unconstrained:
  // Allow _Tp to be _InvalidInteger for __integer_from<16>

  template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>

    using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;


  // LWG4470 [simd.expos]
  // The vec of __integer_from<_Bytes> elements with as many elements as the ABI tag _Ap
  // (_Ap::_S_size), with the ABI tag rebound from _Ap.

  template <size_t _Bytes, typename _Ap>

    using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;


// Defines what __glibcxx_on_bad_value_preserving_cast() does when a checked conversion fails:
//  - with _GLIBCXX_SIMD_THROW_ON_BAD_VALUE: it throws an object of the empty class
//    __bad_value_preserving_cast,
//  - otherwise: it calls the function __bad_value_preserving_cast, which is declared but never
//    defined and is not constexpr (presumably so that reaching the call during constant
//    evaluation makes the program ill-formed).
#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // used for unit tests (also see P3844)

  class __bad_value_preserving_cast

  {};


#define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast

#else

  void __bad_value_preserving_cast(); // not defined


#define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast

#endif


  /** @internal
   * Converts the arithmetic value @p __x to @p _To at compile time and reports a failure (via
   * @c __glibcxx_on_bad_value_preserving_cast) if the value is changed by the conversion.
   *
   * No check is performed if the conversion from @p _From to @p _To is value-preserving for all
   * values. Otherwise the following is checked for the given value:
   *  - converting to @p _To and back yields the same value (compared after a cast to the type
   *    named by @c __make_unsigned<_From>),
   *  - signed to unsigned: @p __x is not negative,
   *  - unsigned integer to signed integer: @p __x does not exceed the maximum of @p _To.
   *
   * @tparam _To    The destination type.
   * @tparam _From  The source type; must be an arithmetic type.
   * @param  __x    The value to convert.
   * @return @p __x converted to @p _To.
   */
  template <typename _To, typename _From>

#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844

    [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions

#endif

    consteval _To

    __value_preserving_cast(const _From& __x)

    {

      static_assert(is_arithmetic_v<_From>);

      if constexpr (!__value_preserving_convertible_to<_From, _To>)

        {

          using _Up = typename __make_unsigned<_From>::__type;

          // round trip through _To must reproduce the value
          if (static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x))

            __glibcxx_on_bad_value_preserving_cast();

          // the following sign/range checks only run if the round trip succeeded
          else if constexpr (is_signed_v<_From> && is_unsigned_v<_To>)

            {

              if (__x < _From())

                __glibcxx_on_bad_value_preserving_cast();

            }

          else if constexpr (unsigned_integral<_From> && signed_integral<_To>)

            {

              if (__x > numeric_limits<_To>::max())

                __glibcxx_on_bad_value_preserving_cast();

            }

        }

      return static_cast<_To>(__x);

    }


  // Satisfied if _From (ignoring cv-ref) is an arithmetic type that converts implicitly and
  // explicitly to _To, the conversion is NOT value-preserving, and one of the following holds:
  //   - _To is the common type of _From and _To,
  //   - _From is int and _To is an integral type,
  //   - _From is unsigned and _To is an unsigned integer type.
  // NOTE(review): judging by the name, this selects the consteval broadcast constructor that
  // checks the value at compile time - confirm at the point of use.
  template <typename _From, typename _To>

    concept __simd_vec_bcast_consteval

      = __explicitly_convertible_to<_From, _To>

          && is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To>

          && !__value_preserving_convertible_to<remove_cvref_t<_From>, _To>

          && (is_same_v<common_type_t<_From, _To>, _To>

                || (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>)

                || (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>));


  /** @internal
   * std::pair is not trivially copyable, this one is
   *
   * A plain aggregate of two public members without any user-declared special member functions.
   */

  template <typename _T0, typename _T1>

    struct __trivial_pair

    {

      _T0 _M_first; // first element

      _T1 _M_second; // second element

    };


  // Satisfied if _From converts implicitly to _To, both types have the same size, and they agree
  // on being integral as well as on being floating-point (i.e. the conversion does not switch
  // between these type categories).
  template <typename _From, typename _To>

    concept __converts_trivially = convertible_to<_From, _To>

                                     && sizeof(_From) == sizeof(_To)

                                     && is_integral_v<_From> == is_integral_v<_To>

                                     && is_floating_point_v<_From> == is_floating_point_v<_To>;


  /** @internal
   * Calls @p __fun once for every set bit in @p __bits, passing the index of the bit. The bits
   * are visited from the least significant to the most significant one.
   *
   * @param __bits  The bits to iterate over; its type must be at least as large as @c int.
   * @param __fun   Callable that is invoked with each bit index.
   */
  [[__gnu__::__always_inline__]]
  constexpr void
  __bit_foreach(unsigned_integral auto __bits, auto&& __fun)
  {
    static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int
    // `__bits &= __bits - 1` clears the lowest set bit, i.e. the one that was just reported
    for (; __bits; __bits &= (__bits - 1))
      __fun(__countr_zero(__bits));
  }


  /** @internal
   * Optimized @c memcpy for use in partial loads and stores.
   *
   * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
   * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
   * calls: the first call copies a power-of-2 sized block from the start, the second call copies
   * a block that ends exactly at the last requested byte.
   *
   * Every size class includes its lower bound (`__bytes >= 32`, `>= 16`, ...). Otherwise a byte
   * count that is an exact power of 2 would fall through to the next smaller size class, whose
   * two copies cannot cover it (e.g. a single 8-byte chunk would end up in the 4-byte class,
   * which copies only 4 bytes for @p _Chunk == 8).
   *
   * The `_Max > N` tests are compile-time constants; they remove size classes that cannot occur.
   * NOTE(review): as a consequence the run-time path expects `_Chunk * __n < _Max` (a partial
   * copy). Confirm that no caller passes the full size with a non-constant @p __n.
   *
   * @tparam _Chunk   Copies @p __n times @p _Chunk bytes.
   * @tparam _Max     Copy no more than @p _Max bytes.
   *
   * @param  __dst    The destination pointer.
   * @param  __src    The source pointer.
   * @param  __n      The number of chunks that need to be copied.
   */
  template <size_t _Chunk, size_t _Max>
    inline void
    __memcpy_chunks(byte* __restrict__ __dst, const byte* __restrict__ __src,
                    size_t __n)
    {
      static_assert(_Max <= 64);
      static_assert(__has_single_bit(_Chunk) && _Chunk <= 8);
      size_t __bytes = _Chunk * __n;
      if (__builtin_constant_p(__bytes))
        { // If __n is known via constant propagation use a single memcpy call. Since this is still
          // a fixed-size memcpy to the compiler, this leaves more room for optimization.
          __builtin_memcpy(__dst, __src, __bytes);
        }
      else if (__bytes >= 32 && _Max > 32)
        { // 32 <= __bytes: head of 32 bytes plus the (overlapping) last 32 bytes
          __builtin_memcpy(__dst, __src, 32);
          __bytes -= 32;
          __builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
        }
      else if (__bytes >= 16 && _Max > 16)
        { // 16 <= __bytes < 32
          __builtin_memcpy(__dst, __src, 16);
          if constexpr (_Chunk == 8)
            { // __bytes is 16 or 24: the last 8 bytes complete the copy
              __bytes -= 8;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
            }
          else
            {
              __bytes -= 16;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
            }
        }
      else if (__bytes >= 8 && _Max > 8)
        { // 8 <= __bytes < 16 (for _Chunk == 8 the head already is the complete copy)
          __builtin_memcpy(__dst, __src, 8);
          if constexpr (_Chunk == 4)
            { // __bytes is 8 or 12: the last 4 bytes complete the copy
              __bytes -= 4;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
            }
          else if constexpr (_Chunk < 4)
            {
              __bytes -= 8;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
            }
        }
      else if (__bytes >= 4 && _Max > 4)
        { // 4 <= __bytes < 8 (for _Chunk == 4 the head already is the complete copy)
          __builtin_memcpy(__dst, __src, 4);
          if constexpr (_Chunk == 2)
            { // __bytes is 4 or 6: the last 2 bytes complete the copy
              __bytes -= 2;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
            }
          else if constexpr (_Chunk == 1)
            {
              __bytes -= 4;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
            }
        }
      else if (__bytes >= 2)
        { // 2 <= __bytes < 4
          __builtin_memcpy(__dst, __src, 2);
          if constexpr (_Chunk == 2)
            {
              __bytes -= 2;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
            }
          else if constexpr (_Chunk == 1)
            {
              __bytes -= 1;
              __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
            }
        }
      else if (__bytes == 1)
        __builtin_memcpy(__dst, __src, 1);
    }


  // [simd.reductions] identity_element = *see below*
  /** @internal
   * Returns the identity element of @p _BinaryOperation for the type @p _Tp:
   *  - all bits set (`~_Tp()`) for @c bit_and<>,
   *  - 1 for @c multiplies<>,
   *  - 0 for @c plus<>, @c bit_or<>, and @c bit_xor<>.
   *
   * Only the five transparent function objects listed above are accepted.
   */
  template <typename _Tp, typename _BinaryOperation>
    requires __is_one_of<_BinaryOperation,
                         plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value
    consteval _Tp
    __default_identity_element()
    {
      if constexpr (same_as<_BinaryOperation, bit_and<>>)
        return _Tp(~_Tp());
      else if constexpr (same_as<_BinaryOperation, multiplies<>>)
        return _Tp(1);
      else
        return _Tp(0);
    }

} // namespace simd

_GLIBCXX_END_NAMESPACE_VERSION

} // namespace std


#pragma GCC diagnostic pop

#endif // C++26

#endif // _GLIBCXX_SIMD_DETAILS_H

bit

concepts

limits

span

cmath

cstdint

stl_function.h

utility.h

c++config.h

std::underlying_type_t
typename underlying_type< _Tp >::type underlying_type_t
Alias template for underlying_type.
Definition type_traits:2952

std::make_unsigned_t
typename make_unsigned< _Tp >::type make_unsigned_t
Alias template for make_unsigned.
Definition type_traits:2246

std::declval
auto declval() noexcept -> decltype(__declval< _Tp >(0))
Definition type_traits:2714

std
ISO C++ entities toplevel namespace is std.

std::make_integer_sequence
__make_integer_seq< integer_sequence, _Tp, _Num > make_integer_sequence
Alias template make_integer_sequence.
Definition utility.h:522

__gnu_cxx::__int_traits
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.
Definition ext/numeric_traits.h:134

std::__numeric_limits_base::digits
static constexpr int digits
Definition limits:218

std::numeric_limits::max
static constexpr _Tp max() noexcept
Definition limits:328

std::numeric_limits::lowest
static constexpr _Tp lowest() noexcept
Definition limits:334