libstdc++
simd_details.h
1// Implementation of <simd> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_SIMD_DETAILS_H
26#define _GLIBCXX_SIMD_DETAILS_H 1
27
28#ifdef _GLIBCXX_SYSHDR
29#pragma GCC system_header
30#endif
31
32#if __cplusplus >= 202400L
33
34#include <bit>
35#include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32
36#include <bits/stl_function.h> // plus, minus, multiplies, ...
37#include <bits/utility.h> // integer_sequence, etc.
38#include <cmath> // for math_errhandling :(
39#include <concepts>
40#include <cstdint>
41#include <limits>
42#include <span> // for dynamic_extent
43
// Hard requirement: 8-bit bytes. Too many constants and bit operations in this implementation
// are written for CHAR_BIT == 8, and generalizing makes no sense without a target to test on.
#if !(__CHAR_BIT__ == 8)
#error "<simd> is not supported for CHAR_BIT != 8"
#endif
49
// -Wpsabi diagnostics are noise here: the ABI of the internal types never leaks into user code.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpsabi"

// _GLIBCXX_X86 is always defined: 1 when compiling for x86 (32- or 64-bit), 0 otherwise.
#if defined(__x86_64__) || defined(__i386__)
#  define _GLIBCXX_X86 1
#else
#  define _GLIBCXX_X86 0
#endif
59
#if !defined(_GLIBCXX_SIMD_NOEXCEPT)
/** @internal
 * Expands to `noexcept` unless it was defined before this point. Unit tests of precondition
 * checks can predefine it to remove the noexcept.
 */
#  define _GLIBCXX_SIMD_NOEXCEPT noexcept
#endif
66
// _GLIBCXX_SIMD_TOSTRING_IMPL stringizes its argument as written.
// _GLIBCXX_SIMD_TOSTRING macro-expands its argument first and stringizes the result.
#define _GLIBCXX_SIMD_TOSTRING_IMPL(__tokens) #__tokens
#define _GLIBCXX_SIMD_TOSTRING(__tokens) _GLIBCXX_SIMD_TOSTRING_IMPL(__tokens)
69
// Precondition check hook, used for unit-testing precondition checking. Only @p expr is
// forwarded (to __glibcxx_assert); the message and any further arguments are discarded.
#define __glibcxx_simd_precondition(expr, msg, ...) __glibcxx_assert(expr)
73
74namespace std _GLIBCXX_VISIBILITY(default)
75{
76_GLIBCXX_BEGIN_NAMESPACE_VERSION
77
78namespace simd
79{
80 template <typename _Tp>
81 inline constexpr _Tp
82 __iota = [] { static_assert(false, "invalid __iota specialization"); }();
83
84 // [simd.general] vectorizable types
85
86 template <typename _Tp>
87 concept __vectorizable_scalar
88 = same_as<remove_cv_t<_Tp>, _Tp>
89#ifdef __STDCPP_BFLOAT16_T__
90 && !same_as<_Tp, __gnu_cxx::__bfloat16_t>
91#endif
92 && ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>)
93 || (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double)));
94
95 // [simd.general] p2
96 template <typename _Tp>
97 concept __vectorizable = __vectorizable_scalar<_Tp>;
98
/** @internal
 * Bit flags describing the variants of _Abi.
 */
enum class _AbiVariant : unsigned long long
{
  // AVX512 bit-masks
  _BitMask = 1ull << 0,

  // covers bits [0:3]; vector masks are used if all of these bits are 0
  _MaskVariants = (1ull << 4) - 1,
};
107
108 /** @internal
109 * Return @p __in with only bits set that are set in any of @p __to_keep.
110 */
111 consteval _AbiVariant
112 __filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep)
113 {
115 return static_cast<_AbiVariant>(static_cast<_Up>(__in) & (static_cast<_Up>(__to_keep) | ...));
116 }
117
/** @internal
 * Empty placeholder type, used whenever no valid integer/value type exists.
 */
struct _InvalidInteger {};
123
124 /** @internal
125 * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
126 *
127 * C++26 [simd.expos.defn]
128 */
129 template <size_t _Bytes>
130 using __integer_from
131 = decltype([] consteval {
132 if constexpr (sizeof(signed char) == _Bytes)
133 return static_cast<signed char>(0);
134 else if constexpr (sizeof(signed short) == _Bytes)
135 return static_cast<signed short>(0);
136 else if constexpr (sizeof(signed int) == _Bytes)
137 return static_cast<signed int>(0);
138 else if constexpr (sizeof(signed long long) == _Bytes)
139 return static_cast<signed long long>(0);
140 else
141 return _InvalidInteger();
142 }());
143
144 /** @internal
145 * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
146 */
147 template <size_t _Bytes>
149
150 /** @internal
151 * Divide @p __x by @p __y while rounding up instead of down.
152 *
153 * Preconditions: __x >= 0 && __y > 0.
154 */
155 template <typename _Tp>
156 consteval _Tp
157 __div_ceil(_Tp __x, _Tp __y)
158 { return (__x + __y - 1) / __y; }
159
160 /** @internal
161 * Alias for an unsigned integer type that can store at least @p _NBits bits.
162 */
163 template <int _NBits>
164 requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits)
165 using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;
166
/** @internal
 * Maps a type @p _Tp to an equivalent canonical type.
 *
 * Funnelling equivalent types into one representative reduces the branches and casts needed in
 * the implementation and the number of template instantiations.
 *
 * The primary template is the identity mapping; specializations override it for specific types.
 */
template <typename _Tp>
  struct __canonical_vec_type
  {
    using type = _Tp;
  };

/// Shorthand for `__canonical_vec_type<_Tp>::type`.
template <typename _Tp>
  using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
179
180#if __SIZEOF_INT__ == __SIZEOF_LONG__
181 template <>
182 struct __canonical_vec_type<long>
183 { using type = int; };
184
185 template <>
186 struct __canonical_vec_type<unsigned long>
187 { using type = unsigned int; };
188#elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__
189 template <>
190 struct __canonical_vec_type<long>
191 { using type = long long; };
192
193 template <>
194 struct __canonical_vec_type<unsigned long>
195 { using type = unsigned long long; };
196#endif
197
198 template <typename _Tp>
199 requires std::is_enum_v<_Tp>
200 struct __canonical_vec_type<_Tp>
201 { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };
202
203 template <>
204 struct __canonical_vec_type<char>
205#if __CHAR_UNSIGNED__
206 { using type = unsigned char; };
207#else
208 { using type = signed char; };
209#endif
210
211 template <>
212 struct __canonical_vec_type<char8_t>
213 { using type = unsigned char; };
214
215 template <>
216 struct __canonical_vec_type<char16_t>
217 { using type = uint_least16_t; };
218
219 template <>
220 struct __canonical_vec_type<char32_t>
221 { using type = uint_least32_t; };
222
223 template <>
224 struct __canonical_vec_type<wchar_t>
225 {
226 using type = std::__conditional_t<std::is_signed_v<wchar_t>,
227 simd::__integer_from<sizeof(wchar_t)>,
228 simd::_UInt<sizeof(wchar_t)>>;
229 };
230
// Fixed-width floating-point types canonicalize to float/double. Each specialization is only
// provided if the _FloatN type is available (__FLTN_DIG__ defined) and the corresponding
// standard type is known to use the matching IEEE binary format.
#if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
template <>
  struct __canonical_vec_type<_Float32>
  {
    using type = float;
  };
#endif

#if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
template <>
  struct __canonical_vec_type<_Float64>
  {
    using type = double;
  };
#endif
242
243 /** @internal
244 * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
245 * objects that store one bool data members.
246 *
247 * @tparam _Np The number of elements, which also matches the number of data members in
248 * basic_vec and basic_mask.
249 */
250 template <int _Np = 1>
251 struct _ScalarAbi
252 {
253 static constexpr int _S_size = _Np;
254
255 static constexpr int _S_nreg = _Np;
256
257 static constexpr _AbiVariant _S_variant = {};
258
259 template <typename _Tp>
260 using _DataType = __canonical_vec_type_t<_Tp>;
261
262 static constexpr bool _S_is_vecmask = false;
263
264 // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask
265 static constexpr bool _S_is_bitmask = false;
266
267 template <size_t>
268 using _MaskDataType = bool;
269
270 template <int _N2, int _Nreg2 = _N2>
271 static consteval _ScalarAbi<_N2>
272 _S_resize()
273 {
274 static_assert(_N2 == _Nreg2);
275 return {};
276 }
277 };
278
279 /** @internal
280 * This ABI tag describes basic_vec objects that store one or more objects declared with the
281 * [[gnu::vector_size(N)]] attribute.
282 * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
283 * or bit-mask objects. Which one is used is determined via @p _Var.
284 *
285 * @tparam _Np The number of elements.
286 * @tparam _Nreg The number of registers needed to store @p _Np elements.
287 * @tparam _Var Determines how complex value-types are layed out and whether mask types use
288 * bit-masks or vector-masks.
289 */
290 template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var>
291 struct _Abi
292 {
293 static constexpr int _S_size = _Np;
294
295 /** @internal
296 * The number of registers needed to represent one basic_vec for the element type that was
297 * used on ABI deduction.
298 *
299 * Examples:
300 * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
301 * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
302 * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
303 * - '_Abi<10, 1>' for 'int' is 1x 512-bit
304 * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
305 */
306 static constexpr int _S_nreg = _Nreg;
307
308 static_assert(_S_size > 0);
309 static_assert(_S_nreg > 0);
310
311 static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);
312
313 static constexpr bool _S_is_bitmask
314 = __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask;
315
316 static constexpr bool _S_is_vecmask = !_S_is_bitmask;
317
318 template <typename _Tp>
319 using _DataType = decltype([] {
320 static_assert(_S_nreg == 1);
321 if constexpr (_S_size == 1)
322 return __canonical_vec_type_t<_Tp>();
323 else
324 {
325 constexpr int __n = __bit_ceil(unsigned(_S_size));
326 using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
327 = __canonical_vec_type_t<_Tp>;
328 return _Vp();
329 }
330 }());
331
332 template <size_t _Bytes>
333 using _MaskDataType
334 = decltype([] {
335 static_assert(_S_nreg == 1);
336 if constexpr (_S_size == 1)
337 return bool();
338 else if constexpr (_S_is_vecmask)
339 {
340 constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
341 using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
342 return _Vp();
343 }
344 else if constexpr (_Nreg > 1)
345 return _InvalidInteger();
346 else
347 return _Bitmask<_S_size>();
348 }());
349
350 template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
351 static consteval auto
352 _S_resize()
353 {
354 if constexpr (_N2 == 1)
355 return _Abi<1, 1, _Var>();
356 else
357 return _Abi<_N2, _Nreg2, _Var>();
358 }
359 };
360
361 /** @internal
362 * Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer
363 * value.
364 *
365 * Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an
366 * alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the
367 * template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int,
368 * std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot
369 * longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As
370 * soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans
371 * anyway.
372 */
373 template <int _Np, int _Nreg, _AbiVariant... _Vs>
374 using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>;
375
/** @internal
 * Result of ABI tag deduction whenever no useful answer exists. Its size is 0.
 */
struct _InvalidAbi
{
  static constexpr int _S_size = 0;
};
381
382 /** @internal
383 * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
384 * for an enabled basic_vec/basic_mask specialization.
385 */
386 template <typename _Tp>
387 concept __abi_tag
388 = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
389 && (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
390 && requires(_Tp __x) {
391 { __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
392 };
393
394 template <typename _Tp>
395 concept __scalar_abi_tag
396 = same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;
397
398 // Determine if math functions must *raise* floating-point exceptions.
399 // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
400 // need to be considered. A conforming C library must define math_errhandling, but in case it
401 // isn't defined we simply use the fallback.
402#ifdef math_errhandling
403 template <int = 0>
404 requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
405 consteval bool
406 __handle_fpexcept_impl(int)
407 { return 0 != (math_errhandling & MATH_ERREXCEPT); }
408#endif
409
410 // Fallback if math_errhandling doesn't work: implement correct exception behavior.
411 consteval bool
412 __handle_fpexcept_impl(float)
413 { return true; }
414
415 /** @internal
416 * This type can be used as a template parameter for avoiding ODR violations, where code needs to
417 * differ depending on optimization flags (mostly fp-math related).
418 */
419 struct _OptTraits
420 {
421 consteval bool
422 _M_test(int __bit) const
423 { return ((_M_build_flags >> __bit) & 1) == 1; }
424
425 // true iff floating-point operations can signal an exception (allow non-default handler)
426 consteval bool
427 _M_fp_may_signal() const
428 { return _M_test(0); }
429
430 // true iff floating-point operations can raise an exception flag
431 consteval bool
432 _M_fp_may_raise() const
433 { return _M_test(12); }
434
435 consteval bool
436 _M_fast_math() const
437 { return _M_test(1); }
438
439 consteval bool
440 _M_finite_math_only() const
441 { return _M_test(2); }
442
443 consteval bool
444 _M_no_signed_zeros() const
445 { return _M_test(3); }
446
447 consteval bool
448 _M_signed_zeros() const
449 { return !_M_test(3); }
450
451 consteval bool
452 _M_reciprocal_math() const
453 { return _M_test(4); }
454
455 consteval bool
456 _M_no_math_errno() const
457 { return _M_test(5); }
458
459 consteval bool
460 _M_math_errno() const
461 { return !_M_test(5); }
462
463 consteval bool
464 _M_associative_math() const
465 { return _M_test(6); }
466
467 consteval bool
468 _M_conforming_to_STDC_annex_G() const
469 { return _M_test(10) && !_M_finite_math_only(); }
470
471 consteval bool
472 _M_support_snan() const
473 { return _M_test(11); }
474
475 __UINT64_TYPE__ _M_build_flags
476 = 0
477#if !__NO_TRAPPING_MATH__
478 + (1 << 0)
479#endif
480 + (__handle_fpexcept_impl(0) << 12)
481#if __FAST_MATH__
482 + (1 << 1)
483#endif
484#if __FINITE_MATH_ONLY__
485 + (1 << 2)
486#endif
487#if __NO_SIGNED_ZEROS__
488 + (1 << 3)
489#endif
490#if __RECIPROCAL_MATH__
491 + (1 << 4)
492#endif
493#if __NO_MATH_ERRNO__
494 + (1 << 5)
495#endif
496#if __ASSOCIATIVE_MATH__
497 + (1 << 6)
498#endif
499 // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
500#if __FLT_EVAL_METHOD__ == 1
501 + (1 << 7)
502#elif __FLT_EVAL_METHOD__ == 2
503 + (2 << 7)
504#elif __FLT_EVAL_METHOD__ != 0
505 + (3 << 7)
506#endif
507
508 // C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
509 // __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
510 // will do so as well. However, Clang never defines the macro.
511#if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG
512 + (1 << 10)
513#endif
514#if __SUPPORT_SNAN__
515 + (1 << 11)
516#endif
517 ;
518 };
519
520 /** @internal
521 * Return true iff @p __s equals "1".
522 */
523 consteval bool
524 __streq_to_1(const char* __s)
525 { return __s != nullptr && __s[0] == '1' && __s[1] == '\0'; }
526
/** @internal
 * Expands to a 64-bit value with bit @p off set if the macro named by @p feat is defined to 1,
 * and to zero otherwise.
 *
 * @p feat is macro-expanded before it is stringized, because it is handed on as the argument of
 * another function-like macro. A feature macro defined as 1 therefore yields "1", whereas an
 * undefined one yields its own name.
 */
#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat)                                                        \
  (static_cast<__UINT64_TYPE__>(                                                                  \
     std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) << off)
533
534#if _GLIBCXX_X86
535
// Braced initializer for _ArchTraits::_M_flags: one bit per x86 ISA extension macro.
#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT {                        \
    _GLIBCXX_SIMD_ARCH_FLAG( 0, __MMX__)                        \
  | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__)                        \
  | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__)                      \
  | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__)                     \
  | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__)                     \
  | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__)                     \
  | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__)                        \
  | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__)                        \
  | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__)                      \
  | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__)                        \
  | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__)                    \
  | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__)                   \
  | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__)                   \
  | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__)                   \
  | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__)                   \
  | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__)               \
  | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__)                 \
  | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__)                \
  | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__)                 \
  | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__)                 \
  | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__)            \
  | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__)                 \
  | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__)                 \
  | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__)                    \
  | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__)               \
  | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__)                    \
  | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__)                \
  | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__)               \
  | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__)                    \
  | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__)                    \
  | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__)         \
  | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__)                      \
  | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__)                       \
  | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__)                        \
  }
// Open question: should this include __APX_F__? It does not seem relevant for use in
// constexpr-if branches, so presumably there is no ODR issue. The same could be said about
// several other flags above that are not checked anywhere.
580
581 struct _ArchTraits
582 {
583 __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;
584
585 consteval bool
586 _M_test(int __bit) const
587 { return ((_M_flags >> __bit) & 1) == 1; }
588
589 consteval bool
590 _M_have_mmx() const
591 { return _M_test(0); }
592
593 consteval bool
594 _M_have_sse() const
595 { return _M_test(1); }
596
597 consteval bool
598 _M_have_sse2() const
599 { return _M_test(2); }
600
601 consteval bool
602 _M_have_sse3() const
603 { return _M_test(3); }
604
605 consteval bool
606 _M_have_ssse3() const
607 { return _M_test(4); }
608
609 consteval bool
610 _M_have_sse4_1() const
611 { return _M_test(5); }
612
613 consteval bool
614 _M_have_sse4_2() const
615 { return _M_test(6); }
616
617 consteval bool
618 _M_have_popcnt() const
619 { return _M_test(7); }
620
621 consteval bool
622 _M_have_avx() const
623 { return _M_test(8); }
624
625 consteval bool
626 _M_have_f16c() const
627 { return _M_test(9); }
628
629 consteval bool
630 _M_have_bmi() const
631 { return _M_test(10); }
632
633 consteval bool
634 _M_have_bmi2() const
635 { return _M_test(11); }
636
637 consteval bool
638 _M_have_lzcnt() const
639 { return _M_test(12); }
640
641 consteval bool
642 _M_have_avx2() const
643 { return _M_test(13); }
644
645 consteval bool
646 _M_have_fma() const
647 { return _M_test(14); }
648
649 consteval bool
650 _M_have_avx512f() const
651 { return _M_test(15); }
652
653 consteval bool
654 _M_have_avx512cd() const
655 { return _M_test(16); }
656
657 consteval bool
658 _M_have_avx512dq() const
659 { return _M_test(17); }
660
661 consteval bool
662 _M_have_avx512bw() const
663 { return _M_test(18); }
664
665 consteval bool
666 _M_have_avx512vl() const
667 { return _M_test(19); }
668
669 consteval bool
670 _M_have_avx512bitalg() const
671 { return _M_test(20); }
672
673 consteval bool
674 _M_have_avx512vbmi() const
675 { return _M_test(21); }
676
677 consteval bool
678 _M_have_avx512vbmi2() const
679 { return _M_test(22); }
680
681 consteval bool
682 _M_have_avx512ifma() const
683 { return _M_test(23); }
684
685 consteval bool
686 _M_have_avx512vnni() const
687 { return _M_test(24); }
688
689 consteval bool
690 _M_have_avx512vpopcntdq() const
691 { return _M_test(25); }
692
693 consteval bool
694 _M_have_avx512fp16() const
695 { return _M_test(26); }
696
697 consteval bool
698 _M_have_avx512bf16() const
699 { return _M_test(27); }
700
701 consteval bool
702 _M_have_avxifma() const
703 { return _M_test(28); }
704
705 consteval bool
706 _M_have_avxneconvert() const
707 { return _M_test(29); }
708
709 consteval bool
710 _M_have_avxvnni() const
711 { return _M_test(30); }
712
713 consteval bool
714 _M_have_avxvnniint8() const
715 { return _M_test(31); }
716
717 consteval bool
718 _M_have_avxvnniint16() const
719 { return _M_test(32); }
720
721 consteval bool
722 _M_have_avx10_1() const
723 { return _M_test(33); }
724
725 consteval bool
726 _M_have_avx10_2() const
727 { return _M_test(34); }
728
729 consteval bool
730 _M_have_avx512vp2intersect() const
731 { return _M_test(35); }
732
733 consteval bool
734 _M_have_sse4a() const
735 { return _M_test(36); }
736
737 consteval bool
738 _M_have_fma4() const
739 { return _M_test(37); }
740
741 consteval bool
742 _M_have_xop() const
743 { return _M_test(38); }
744
745 template <typename _Tp>
746 consteval bool
747 _M_eval_as_f32() const
748 { return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); }
749 };
750
751 template <typename _Tp, _ArchTraits _Traits = {}>
752 consteval auto
753 __native_abi()
754 {
755 constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
756 if constexpr (!__vectorizable<_Tp>)
757 return _InvalidAbi();
758 else if constexpr (_Traits._M_have_avx512fp16())
759 return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
760 else if constexpr (_Traits._M_have_avx512f())
761 return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
762 else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c())
763 return _ScalarAbi<1>();
764 else if constexpr (_Traits._M_have_avx2())
765 return _Abi_t<32 / __adj_sizeof, 1>();
766 else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>)
767 return _Abi_t<32 / __adj_sizeof, 1>();
768 else if constexpr (_Traits._M_have_sse2())
769 return _Abi_t<16 / __adj_sizeof, 1>();
770 else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp>
771 && sizeof(_Tp) == sizeof(float))
772 return _Abi_t<16 / __adj_sizeof, 1>();
773 // no MMX: we can't emit EMMS where it would be necessary
774 else
775 return _ScalarAbi<1>();
776 }
777
778#else
779
// Scalar fallback for targets without dedicated support: no architecture flags are set.
struct _ArchTraits
{
  __UINT64_TYPE__ _M_flags = 0;

  /// True iff bit number @p __bit of _M_flags is set.
  constexpr bool
  _M_test(int __bit) const
  { return 0 != ((_M_flags >> __bit) & 1); }
};
789
790 template <typename _Tp>
791 consteval auto
792 __native_abi()
793 {
794 if constexpr (!__vectorizable<_Tp>)
795 return _InvalidAbi();
796 else
797 return _ScalarAbi<1>();
798 }
799
800#endif
801
802 /** @internal
803 * You must use this type as template argument to function templates that are not declared
804 * always_inline (to avoid issues when linking code compiled with different compiler flags).
805 */
806 struct _TargetTraits
807 : _ArchTraits, _OptTraits
808 {};
809
810 /** @internal
811 * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t_<_Tp>> stores one SIMD register of
812 * optimal width.
813 *
814 * @tparam _Tp A vectorizable type.
815 *
816 * C++26 [simd.expos.abi]
817 */
818 template <typename _Tp>
819 using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());
820
821 template <typename _Tp, int _Np, _TargetTraits _Target = {}>
822 consteval auto
823 __deduce_abi()
824 {
825 constexpr auto __native = std::simd::__native_abi<_Tp>();
826 if constexpr (0 == __native._S_size || _Np <= 0)
827 return _InvalidAbi();
828 else if constexpr (_Np == __native._S_size)
829 return __native;
830 else
831 return __native.template _S_resize<_Np>();
832 }
833
834 /** @internal
835 * Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
836 *
837 * C++26 [simd.expos.abi]
838 */
839 template <typename _Tp, int _Np>
840 using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());
841
842 /** @internal
843 * \c rebind implementation detail for basic_vec, and basic_mask where we know the destination
844 * value-type
845 */
846 template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
847 consteval auto
848 __abi_rebind()
849 {
850 if constexpr (_Np <= 0 || !__vectorizable<_Tp>)
851 return _InvalidAbi();
852
853 else if constexpr (__scalar_abi_tag<_A0>)
854 return _A0::template _S_resize<_Np>();
855
856 else
857 {
858 using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>;
859 static_assert(0 != _Native::_S_size);
860 constexpr int __nreg = __div_ceil(_Np, _Native::_S_size);
861
862 if constexpr (__scalar_abi_tag<_Native>)
863 return _Native::template _S_resize<_Np>();
864 else
865 return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant,
866 _AbiVariant::_MaskVariants)
867 >::template _S_resize<_Np, __nreg>();
868 }
869 }
870
871 /** @internal
872 * @c rebind implementation detail for basic_mask.
873 *
874 * The important difference here is that we have no information about the actual value-type other
875 * than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t.
876 * E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x
877 * vector(2) long long`.
878 * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
879 * value-type doesn't change.
880 */
881 template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
882 consteval auto
883 __abi_rebind()
884 {
885 if constexpr (_Bytes == 0 || _Np <= 0)
886 return _InvalidAbi();
887
888 else if constexpr (__scalar_abi_tag<_A0>)
889 return _A0::template _S_resize<_Np>();
890
891#if _GLIBCXX_X86
892 // AVX w/o AVX2:
893 // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
894 // We determine whether _A0 identifies an AVX vector by looking at the size of a native
895 // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
896 else if constexpr (_IsOnlyResize
897 && _Traits._M_have_avx() && !_Traits._M_have_avx2()
898 && __bit_ceil(__div_ceil<unsigned>(
899 _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
900 {
901 if constexpr (_Bytes == sizeof(double))
902 return __abi_rebind<double, _Np, _A0>();
903 else if constexpr (_Bytes == sizeof(float))
904 return __abi_rebind<float, _Np, _A0>();
905 else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16))
906 return __abi_rebind<_Float16, _Np, _A0>();
907 else // impossible
908 static_assert(false);
909 }
910#endif
911
912 else
913 return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
914 }
915
916 /** @internal
917 * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
918 *
919 * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
920 * compile on basically every other target, though. This is due to the difference in ABI tag:
921 * _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers).
922 * I know how to define this funtion for libstdc++ to avoid interconvertible masks. The question
923 * is whether we can specify this in general for C++29.
924 *
925 * Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as
926 * deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this
927 * makes the conversion behavior dependent on compiler flags. Probably not what we want.
928 */
929 template <typename _To, typename _From>
930 consteval bool
931 __is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1)
932 {
933 constexpr int __n = _To::_S_size;
934 static_assert(__n == _From::_S_size);
935#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
936 /// C++26 [simd.mask.ctor] uses unconditional explicit
937 return true;
938#else
939 if (__b0 != __b1)
940 return true;
941
942 // everything is better than _ScalarAbi, except when converting to a single bool
943 if constexpr (__scalar_abi_tag<_To>)
944 return __n > 1;
945 else if constexpr (__scalar_abi_tag<_From>)
946 return true;
947
948 // converting to a bit-mask is better
949 else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask)
950 return _To::_S_is_vecmask; // to vector-mask is explicit
951
952 // with vec-masks, fewer registers is better
953 else if constexpr (_From::_S_nreg != _To::_S_nreg)
954 return _From::_S_nreg < _To::_S_nreg;
955
956 else
957 __builtin_unreachable();
958#endif
959 }
960
/** @internal
 * The signed integer type used for element counts and indices.
 *
 * libstdc++ unconditionally uses @c int, since that matches the return type of the
 * 'Bit Operation Builtins' in GCC.
 *
 * C++26 [simd.expos.defn]
 */
using __simd_size_type = int;

/// Shortcut for an integral_constant of type __simd_size_type with value @p _Xp.
template <__simd_size_type _Xp>
  inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c{};
974
975 // [simd.syn]
976 template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>
977 class basic_vec;
978
979 template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
980 using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
981
982 template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>
983 class basic_mask;
984
985 template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
986 using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
987
988 // [simd.ctor] load constructor constraints
989 template <typename _Tp, size_t _Np = -1uz>
990 concept __static_sized_range
991 = ranges::sized_range<_Tp> && requires(_Tp&& __r) {
992 typename integral_constant<size_t, ranges::size(__r)>;
993 requires (_Np == -1uz || ranges::size(__r) == _Np);
994 };
995
996 template <typename _Rg>
997 consteval size_t
998 __static_range_size(_Rg& __r)
999 {
1000 if constexpr (requires { typename integral_constant<size_t, ranges::size(__r)>; })
1001 return ranges::size(__r);
1002 else
1003 return dynamic_extent;
1004 }
1005
1006 // [simd.general] value-preserving
1007 template <typename _From, typename _To>
1008 concept __arithmetic_only_value_preserving_convertible_to
1009 = convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>
1010 && !(is_signed_v<_From> && is_unsigned_v<_To>)
1014
1015 /** @internal
1016 * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
1017 *
1018 * C++26 [simd.general]
1019 */
1020 template <typename _From, typename _To>
1021 concept __value_preserving_convertible_to
1022 = __arithmetic_only_value_preserving_convertible_to<_From, _To>;
1023
1024 // LWG4420
1025 template <typename _From, typename _To>
1026 concept __explicitly_convertible_to = requires {
1027 static_cast<_To>(declval<_From>());
1028 };
1029
1030 /** @internal
1031 * C++26 [simd.expos]
1032 */
1033 template<typename _Tp>
1034 concept __constexpr_wrapper_like
1035 = convertible_to<_Tp, decltype(_Tp::value)>
1036 && equality_comparable_with<_Tp, decltype(_Tp::value)>
1037 && bool_constant<_Tp() == _Tp::value>::value
1038 && bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;
1039
1040 // [simd.ctor] explicit(...) of broadcast ctor
1041 template <auto _From, typename _To>
1042 concept __non_narrowing_constexpr_conversion
1043 = is_arithmetic_v<decltype(_From)>
1044 && static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From
1045 && !(unsigned_integral<_To> && _From < decltype(_From)())
1046 && _From <= std::numeric_limits<_To>::max()
1048
1049 // [simd.ctor] p4
1050 // This implements LWG4436 (submitted on 2025-10-28)
1051 template <typename _From, typename _To>
1052 concept __broadcast_constructible
1053 = ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>>
1054 && !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
1055 || __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
1056 || (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3
1057 && __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value),
1058 _To>));
1059
1060 // __higher_floating_point_rank_than<_Tp, U> (_Tp has higher or equal floating point rank than U)
1061 template <typename _From, typename _To>
1062 consteval bool
1063 __higher_floating_point_rank_than()
1064 {
1065 return floating_point<_From> && floating_point<_To>
1066 && is_same_v<common_type_t<_From, _To>, _From> && !is_same_v<_From, _To>;
1067 }
1068
1069 // __higher_integer_rank_than<_Tp, U> (_Tp has higher or equal integer rank than U)
1070 template <typename _From, typename _To>
1071 consteval bool
1072 __higher_integer_rank_than()
1073 {
1074 return integral<_From> && integral<_To>
1075 && (sizeof(_From) > sizeof(_To) || is_same_v<common_type_t<_From, _To>, _From>)
1076 && !is_same_v<_From, _To>;
1077 }
1078
1079 template <typename _From, typename _To>
1080 concept __higher_rank_than
1081 = __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>();
1082
1083 struct __convert_flag;
1084
1085 template <typename _From, typename _To, typename... _Flags>
1086 concept __loadstore_convertible_to
1087 = same_as<_From, _To>
1088 || (__vectorizable<_From> && __vectorizable<_To>
1089 && (__value_preserving_convertible_to<_From, _To>
1090 || (__explicitly_convertible_to<_From, _To>
1091 && (std::is_same_v<_Flags, __convert_flag> || ...))));
1092
1093 template <typename _From, typename _To>
1094 concept __simd_generator_convertible_to
1095 = std::convertible_to<_From, _To>
1096 && (!is_arithmetic_v<_From> || __value_preserving_convertible_to<_From, _To>);
1097
1098 template <typename _Fp, typename _Tp, __simd_size_type... _Is>
1099 requires (__simd_generator_convertible_to<
1100 decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...)
1101 constexpr void
1102 __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
1103
1104 template <typename _Fp, typename _Tp, __simd_size_type _Np>
1105 concept __simd_generator_invokable = requires {
1106 __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
1107 };
1108
1109 template <typename _Fp>
1110 concept __index_permutation_function_sized = requires(_Fp const& __f)
1111 {
1112 { __f(0, 0) } -> std::integral;
1113 };
1114
1115 template <typename _Fp, typename _Simd>
1116 concept __index_permutation_function
1117 = __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) {
1118 { __f(0) } -> std::integral;
1119 };
1120
1121 /** @internal
1122 * The value of the @c _Bytes template argument to a @c basic_mask specialization.
1123 *
1124 * C++26 [simd.expos.defn]
1125 */
1126 template <typename _Tp>
1127 constexpr size_t __mask_element_size = 0;
1128
1129 template <size_t _Bytes, __abi_tag _Ap>
1130 constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes;
1131
1132 // [simd.expos]
1133 template <typename _Vp>
1134 concept __simd_vec_type
1135 = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
1136 && is_default_constructible_v<_Vp>;
1137
1138 template <typename _Vp>
1139 concept __simd_mask_type
1140 = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
1141 && is_default_constructible_v<_Vp>;
1142
1143 /** @internal
1144 * Satisfied if @p _Tp is a data-parallel type.
1145 */
1146 template <typename _Vp>
1147 concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>;
1148
1149 template <typename _Vp>
1150 concept __simd_floating_point
1151 = __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>;
1152
1153 template <typename _Vp>
1154 concept __simd_integral
1155 = __simd_vec_type<_Vp> && integral<typename _Vp::value_type>;
1156
1157 template <typename _Tp>
1158 concept __converts_to_vec
1159 = __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>;
1160
1161 template <__converts_to_vec _Tp>
1162 using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>());
1163
1164 template <typename _Vp, typename _Tp>
1165 using __make_compatible_simd_t
1166 = decltype([] {
1167 using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
1168 if constexpr (__simd_vec_type<_Up>)
1169 return _Up();
1170 else
1171 return vec<_Up, _Vp::size()>();
1172 }());
1173
1174 template <typename _Tp>
1175 concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>;
1176
1177 template <typename _BinaryOperation, typename _Tp>
1178 concept __reduction_binary_operation
1179 = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
1180 { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
1181 };
1182
1183 /** @internal
1184 * Returns the highest index @c i where `(__bits >> i) & 1` equals @c 1.
1185 */
1186 [[__gnu__::__always_inline__]]
1187 constexpr __simd_size_type
1188 __highest_bit(std::unsigned_integral auto __bits)
1189 {
1191 constexpr auto _Nd = __int_traits<decltype(__bits)>::__digits;
1192 return _Nd - 1 - __countl_zero(__bits);
1193 }
1194
1195 template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
1196 using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
1197
1198 // Allow _Tp to be _InvalidInteger for __integer_from<16>
1199 template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
1200 using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
1201
1202 // LWG4470 [simd.expos]
1203 template <size_t _Bytes, typename _Ap>
1204 using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;
1205
// Failure hook of __value_preserving_cast. The macro is always followed by "()" at the point
// of use.
#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // used for unit tests (also see P3844)
// With the macro set, a failed cast throws an object of this (empty) type.
class __bad_value_preserving_cast { };

#define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast
#else
// Otherwise a failed cast calls this function, which is declared but not defined.
void __bad_value_preserving_cast(); // not defined

#define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast
#endif
1216
1217 template <typename _To, typename _From>
1218#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844
1219 [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions
1220#endif
1221 consteval _To
1222 __value_preserving_cast(const _From& __x)
1223 {
1224 static_assert(is_arithmetic_v<_From>);
1225 if constexpr (!__value_preserving_convertible_to<_From, _To>)
1226 {
1227 using _Up = typename __make_unsigned<_From>::__type;
1228 if (static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x))
1229 __glibcxx_on_bad_value_preserving_cast();
1230 else if constexpr (is_signed_v<_From> && is_unsigned_v<_To>)
1231 {
1232 if (__x < _From())
1233 __glibcxx_on_bad_value_preserving_cast();
1234 }
1235 else if constexpr (unsigned_integral<_From> && signed_integral<_To>)
1236 {
1237 if (__x > numeric_limits<_To>::max())
1238 __glibcxx_on_bad_value_preserving_cast();
1239 }
1240 }
1241 return static_cast<_To>(__x);
1242 }
1243
1244 template <typename _From, typename _To>
1245 concept __simd_vec_bcast_consteval
1246 = __explicitly_convertible_to<_From, _To>
1247 && is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To>
1248 && !__value_preserving_convertible_to<remove_cvref_t<_From>, _To>
1249 && (is_same_v<common_type_t<_From, _To>, _To>
1250 || (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>)
1251 || (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>));
1252
/** @internal
 * A plain aggregate of two values.
 *
 * std::pair is not trivially copyable, this one is: it declares no special member functions
 * of its own.
 */
template <typename _T0, typename _T1>
  struct __trivial_pair
  {
    _T0 _M_first;  // first value
    _T1 _M_second; // second value
  };
1262
1263 template <typename _From, typename _To>
1264 concept __converts_trivially = convertible_to<_From, _To>
1265 && sizeof(_From) == sizeof(_To)
1266 && is_integral_v<_From> == is_integral_v<_To>
1267 && is_floating_point_v<_From> == is_floating_point_v<_To>;
1268
1269 [[__gnu__::__always_inline__]]
1270 constexpr void
1271 __bit_foreach(unsigned_integral auto __bits, auto&& __fun)
1272 {
1273 static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int
1274 while (__bits)
1275 {
1276 __fun(__countr_zero(__bits));
1277 __bits &= (__bits - 1);
1278 }
1279 }
1280
/** @internal
 * Optimized @c memcpy for use in partial loads and stores.
 *
 * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
 * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
 * calls.
 *
 * With `__bytes = _Chunk * __n`, every run-time branch handles one range `[X, 2 * X)` (the
 * 32-byte branch additionally handles exactly 64 bytes). It copies a "head" of @c X bytes
 * from the start and a "tail" that ends exactly at @c __bytes. The tail may be shorter than
 * @c X where @p _Chunk guarantees that this still covers the remainder. Neither copy touches
 * memory outside of `[0, __bytes)`.
 *
 * @tparam _Chunk Copies @p __n times @p _Chunk bytes. Must be a power of two and at most 8.
 * @tparam _Max   Copy no more than @p _Max bytes. Must be at most 64. Branches for sizes
 *                larger than @p _Max are known to be dead at compile time.
 *
 * @param __dst The destination pointer.
 * @param __src The source pointer.
 * @param __n   The number of chunks that need to be copied. The caller is expected to ensure
 *              `_Chunk * __n <= _Max`; this is not checked.
 */
template <size_t _Chunk, size_t _Max>
  inline void
  __memcpy_chunks(std::byte* __restrict__ __dst, const std::byte* __restrict__ __src,
                  size_t __n)
  {
    static_assert(_Max <= 64);
    static_assert(std::__has_single_bit(_Chunk) && _Chunk <= 8);
    size_t __bytes = _Chunk * __n;
    if (__builtin_constant_p(__bytes))
      { // If __n is known via constant propagation use a single memcpy call. Since this is still
        // a fixed-size memcpy to the compiler, this leaves more room for optimization.
        __builtin_memcpy(__dst, __src, __bytes);
      }
    else if (__bytes >= 32 && _Max >= 32)
      { // 32 <= __bytes <= 64
        __builtin_memcpy(__dst, __src, 32);
        __bytes -= 32;
        __builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
      }
    else if (__bytes >= 16 && _Max >= 16)
      { // 16 <= __bytes < 32
        __builtin_memcpy(__dst, __src, 16);
        if constexpr (_Chunk == 8)
          { // __bytes is 16 or 24: the last chunk completes the copy
            __bytes -= 8;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
          }
        else
          {
            __bytes -= 16;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
          }
      }
    else if (__bytes >= 8 && _Max >= 8)
      { // 8 <= __bytes < 16
        __builtin_memcpy(__dst, __src, 8);
        if constexpr (_Chunk == 4)
          { // __bytes is 8 or 12
            __bytes -= 4;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
          }
        else if constexpr (_Chunk < 4)
          {
            __bytes -= 8;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
          }
        // _Chunk == 8: __bytes is exactly 8 and the head already copied everything
      }
    else if (__bytes >= 4 && _Max >= 4)
      { // 4 <= __bytes < 8
        __builtin_memcpy(__dst, __src, 4);
        if constexpr (_Chunk == 2)
          { // __bytes is 4 or 6
            __bytes -= 2;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
          }
        else if constexpr (_Chunk == 1)
          {
            __bytes -= 4;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
          }
        // _Chunk == 4: __bytes is exactly 4 and the head already copied everything
      }
    else if (__bytes >= 2)
      { // 2 <= __bytes < 4
        __builtin_memcpy(__dst, __src, 2);
        if constexpr (_Chunk == 2)
          { // __bytes is exactly 2; the second copy is redundant but harmless
            __bytes -= 2;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
          }
        else if constexpr (_Chunk == 1)
          { // __bytes is 2 or 3
            __bytes -= 1;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
          }
      }
    else if (__bytes == 1)
      __builtin_memcpy(__dst, __src, 1);
  }
1373
1374 // [simd.reductions] identity_element = *see below*
1375 template <typename _Tp, typename _BinaryOperation>
1376 requires __is_one_of<_BinaryOperation,
1377 plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value
1378 consteval _Tp
1379 __default_identity_element()
1380 {
1381 if constexpr (same_as<_BinaryOperation, multiplies<>>)
1382 return _Tp(1);
1383 else if constexpr (same_as<_BinaryOperation, bit_and<>>)
1384 return _Tp(~_Tp());
1385 else
1386 return _Tp(0);
1387 }
1388} // namespace simd
1389_GLIBCXX_END_NAMESPACE_VERSION
1390} // namespace std
1391
1392#pragma GCC diagnostic pop
1393#endif // C++26
1394#endif // _GLIBCXX_SIMD_DETAILS_H
typename underlying_type< _Tp >::type underlying_type_t
Alias template for underlying_type.
Definition type_traits:2952
typename make_unsigned< _Tp >::type make_unsigned_t
Alias template for make_unsigned.
Definition type_traits:2246
auto declval() noexcept -> decltype(__declval< _Tp >(0))
Definition type_traits:2714
ISO C++ entities toplevel namespace is std.
__make_integer_seq< integer_sequence, _Tp, _Num > make_integer_sequence
Alias template make_integer_sequence.
Definition utility.h:522
__numeric_traits_integer< _Tp > __int_traits
Convenience alias for __numeric_traits<integer-type>.
static constexpr int digits
Definition limits:218
static constexpr _Tp max() noexcept
Definition limits:328
static constexpr _Tp lowest() noexcept
Definition limits:334