libstdc++
simd_vec.h
1// Implementation of <simd> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_SIMD_VEC_H
26#define _GLIBCXX_SIMD_VEC_H 1
27
28#ifdef _GLIBCXX_SYSHDR
29#pragma GCC system_header
30#endif
31
32#if __cplusplus >= 202400L
33
34#include "simd_mask.h"
35#include "simd_flags.h"
36
37#include <bits/utility.h>
38#include <bits/stl_function.h>
39#include <cmath>
40
41// psabi warnings are bogus because the ABI of the internal types never leaks into user code
42#pragma GCC diagnostic push
43#pragma GCC diagnostic ignored "-Wpsabi"
44
45namespace std _GLIBCXX_VISIBILITY(default)
46{
47_GLIBCXX_BEGIN_NAMESPACE_VERSION
48namespace simd
49{
  // disabled basic_vec
  // Primary template: chosen whenever the template arguments do not form a
  // valid (vectorizable value type, ABI tag) combination.  All special member
  // functions are deleted with an explanatory message (C++26
  // "= delete(reason)") so misuse yields a readable diagnostic instead of an
  // overload-resolution error cascade.
  template <typename _Tp, typename _Ap>
    class basic_vec
    {
    public:
      using value_type = _Tp;

      using abi_type = _Ap;

      // Matching disabled mask type.
      using mask_type = basic_mask<0, void>; // disabled

#define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \
                             "of template arguments to basic_vec."

      // No object of this specialization can be created, destroyed, copied,
      // or assigned.
      basic_vec() = delete(_GLIBCXX_DELETE_SIMD);

      ~basic_vec() = delete(_GLIBCXX_DELETE_SIMD);

      basic_vec(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);

      basic_vec& operator=(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);

#undef _GLIBCXX_DELETE_SIMD
    };
74
75 template <typename _Tp, typename _Ap>
76 class _VecBase
77 {
78 using _Vp = basic_vec<_Tp, _Ap>;
79
80 public:
81 using value_type = _Tp;
82
83 using abi_type = _Ap;
84
85 using mask_type = basic_mask<sizeof(_Tp), abi_type>;
86
87 using iterator = __iterator<_Vp>;
88
89 using const_iterator = __iterator<const _Vp>;
90
91 constexpr iterator
92 begin() noexcept
93 { return {static_cast<_Vp&>(*this), 0}; }
94
95 constexpr const_iterator
96 begin() const noexcept
97 { return cbegin(); }
98
99 constexpr const_iterator
100 cbegin() const noexcept
101 { return {static_cast<const _Vp&>(*this), 0}; }
102
103 constexpr default_sentinel_t
104 end() const noexcept
105 { return {}; }
106
107 constexpr default_sentinel_t
108 cend() const noexcept
109 { return {}; }
110
111 static constexpr auto size = __simd_size_c<_Ap::_S_size>;
112
113 _VecBase() = default;
114
115 // LWG issue from 2026-03-04 / P4042R0
116 template <typename _Up, typename _UAbi>
117 requires (_Ap::_S_size != _UAbi::_S_size)
118 _VecBase(const basic_vec<_Up, _UAbi>&) = delete("size mismatch");
119
120 template <typename _Up, typename _UAbi>
121 requires (_Ap::_S_size == _UAbi::_S_size) && (!__explicitly_convertible_to<_Up, _Tp>)
122 explicit
123 _VecBase(const basic_vec<_Up, _UAbi>&)
124 = delete("the value types are not convertible");
125
126 [[__gnu__::__always_inline__]]
127 friend constexpr _Vp
128 operator+(const _Vp& __x, const _Vp& __y) noexcept
129 {
130 _Vp __r = __x;
131 __r += __y;
132 return __r;
133 }
134
135 [[__gnu__::__always_inline__]]
136 friend constexpr _Vp
137 operator-(const _Vp& __x, const _Vp& __y) noexcept
138 {
139 _Vp __r = __x;
140 __r -= __y;
141 return __r;
142 }
143
144 [[__gnu__::__always_inline__]]
145 friend constexpr _Vp
146 operator*(const _Vp& __x, const _Vp& __y) noexcept
147 {
148 _Vp __r = __x;
149 __r *= __y;
150 return __r;
151 }
152
153 [[__gnu__::__always_inline__]]
154 friend constexpr _Vp
155 operator/(const _Vp& __x, const _Vp& __y) noexcept
156 {
157 _Vp __r = __x;
158 __r /= __y;
159 return __r;
160 }
161
162 [[__gnu__::__always_inline__]]
163 friend constexpr _Vp
164 operator%(const _Vp& __x, const _Vp& __y) noexcept
165 requires requires (_Tp __a) { __a % __a; }
166 {
167 _Vp __r = __x;
168 __r %= __y;
169 return __r;
170 }
171
172 [[__gnu__::__always_inline__]]
173 friend constexpr _Vp
174 operator&(const _Vp& __x, const _Vp& __y) noexcept
175 requires requires (_Tp __a) { __a & __a; }
176 {
177 _Vp __r = __x;
178 __r &= __y;
179 return __r;
180 }
181
182 [[__gnu__::__always_inline__]]
183 friend constexpr _Vp
184 operator|(const _Vp& __x, const _Vp& __y) noexcept
185 requires requires (_Tp __a) { __a | __a; }
186 {
187 _Vp __r = __x;
188 __r |= __y;
189 return __r;
190 }
191
192 [[__gnu__::__always_inline__]]
193 friend constexpr _Vp
194 operator^(const _Vp& __x, const _Vp& __y) noexcept
195 requires requires (_Tp __a) { __a ^ __a; }
196 {
197 _Vp __r = __x;
198 __r ^= __y;
199 return __r;
200 }
201
202 [[__gnu__::__always_inline__]]
203 friend constexpr _Vp
204 operator<<(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
205 requires requires (_Tp __a) { __a << __a; }
206 {
207 _Vp __r = __x;
208 __r <<= __y;
209 return __r;
210 }
211
212 [[__gnu__::__always_inline__]]
213 friend constexpr _Vp
214 operator<<(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
215 requires requires (_Tp __a, __simd_size_type __b) { __a << __b; }
216 {
217 _Vp __r = __x;
218 __r <<= __y;
219 return __r;
220 }
221
222 [[__gnu__::__always_inline__]]
223 friend constexpr _Vp
224 operator>>(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
225 requires requires (_Tp __a) { __a >> __a; }
226 {
227 _Vp __r = __x;
228 __r >>= __y;
229 return __r;
230 }
231
232 [[__gnu__::__always_inline__]]
233 friend constexpr _Vp
234 operator>>(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
235 requires requires (_Tp __a, __simd_size_type __b) { __a >> __b; }
236 {
237 _Vp __r = __x;
238 __r >>= __y;
239 return __r;
240 }
241 };
242
  // Tag type selecting the basic_vec constructor that loads from memory.
  struct _LoadCtorTag
  {};
245
246 template <integral _Tp>
247 inline constexpr _Tp __max_shift
248 = (sizeof(_Tp) < sizeof(int) ? sizeof(int) : sizeof(_Tp)) * __CHAR_BIT__;
249
  // Enabled basic_vec specialization for a vectorizable value type and an ABI
  // tag that maps the whole vector onto a single register (_S_nreg == 1).
  template <__vectorizable _Tp, __abi_tag _Ap>
    requires (_Ap::_S_nreg == 1)
    class basic_vec<_Tp, _Ap>
    : public _VecBase<_Tp, _Ap>
    {
      template <typename, typename>
        friend class basic_vec;

      template <size_t, typename>
        friend class basic_mask;

      // Number of elements in this vector.
      static constexpr int _S_size = _Ap::_S_size;

      // _S_size rounded up to the next power of two (number of element slots
      // in _M_data when the register is only partially used).
      static constexpr int _S_full_size = __bit_ceil(unsigned(_S_size));

      static constexpr bool _S_is_scalar = _S_size == 1;

      // Whether masks for this vector are bitmasks (per the ABI tag) rather
      // than vector masks; never the case for size-1 vectors.
      static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask && !_S_is_scalar;

      using _DataType = typename _Ap::template _DataType<_Tp>;

      /** @internal
       * @brief Underlying vector data storage.
       *
       * This member holds the vector object using a GNU vector type or a platform-specific vector
       * type determined by the ABI tag. For size 1 vectors, this is a single value (_Tp).
       */
      _DataType _M_data;

      // True if _M_data contains padding slots beyond the _S_size elements.
      static constexpr bool _S_is_partial = sizeof(_M_data) > sizeof(_Tp) * _S_size;

      // Canonical arithmetic type used for bit-level manipulation of _Tp.
      using __canon_value_type = __canonical_vec_type_t<_Tp>;

    public:
      using value_type = _Tp;

      using mask_type = _VecBase<_Tp, _Ap>::mask_type;
287
288 // internal but public API ----------------------------------------------
289 [[__gnu__::__always_inline__]]
290 static constexpr basic_vec
291 _S_init(_DataType __x)
292 {
293 basic_vec __r;
294 __r._M_data = __x;
295 return __r;
296 }
297
      /// @internal Read-only access to the underlying vector builtin.
      [[__gnu__::__always_inline__]]
      constexpr const _DataType&
      _M_get() const
      { return _M_data; }
302
      /// @internal True iff the compiler can prove the value of @p __x to be
      /// a compile-time constant here (enables constant-folding shortcuts).
      [[__gnu__::__always_inline__]]
      friend constexpr bool
      __is_const_known(const basic_vec& __x)
      { return __builtin_constant_p(__x._M_data); }
307
308 [[__gnu__::__always_inline__]]
309 constexpr auto
310 _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
311 {
312 if constexpr (_S_is_scalar)
313 return __vec_builtin_type<__canon_value_type, 1>{_M_data};
314 else
315 return _M_data;
316 }
317
318 template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
319 [[__gnu__::__always_inline__]]
320 static constexpr basic_vec
321 _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
322 {
323 using _Xp = basic_vec<value_type, _A0>;
324 basic_vec __r;
325 if constexpr (_S_is_scalar)
326 {
327 constexpr __simd_size_type __j = [&] consteval {
328 if constexpr (__index_permutation_function_sized<_Fp>)
329 return __idxmap(_Offset, _Size);
330 else
331 return __idxmap(_Offset);
332 }();
333 if constexpr (__j == simd::zero_element || __j == simd::uninit_element)
334 return basic_vec();
335 else
336 static_assert(__j >= 0 && __j < _Xp::_S_size);
337 __r._M_data = __x[__j];
338 }
339 else
340 {
341 auto __idxmap2 = [=](auto __i) consteval {
342 if constexpr (int(__i + _Offset) >= _Size) // _S_full_size > _Size
343 return __simd_size_c<simd::uninit_element>;
344 else if constexpr (__index_permutation_function_sized<_Fp>)
345 return __simd_size_c<__idxmap(__i + _Offset, _Size)>;
346 else
347 return __simd_size_c<__idxmap(__i + _Offset)>;
348 };
349 constexpr auto __adj_idx = [](auto __i) {
350 constexpr int __j = __i;
351 if constexpr (__j == simd::zero_element)
352 return __simd_size_c<__bit_ceil(unsigned(_Xp::_S_size))>;
353 else if constexpr (__j == simd::uninit_element)
354 return __simd_size_c<-1>;
355 else
356 {
357 static_assert(__j >= 0 && __j < _Xp::_S_size);
358 return __simd_size_c<__j>;
359 }
360 };
361 constexpr auto [...__is0] = _IotaArray<_S_size>;
362 constexpr bool __needs_zero_element
363 = ((__idxmap2(__simd_size_c<__is0>).value == simd::zero_element) || ...);
364 constexpr auto [...__is_full] = _IotaArray<_S_full_size>;
365 if constexpr (_A0::_S_nreg == 2 && !__needs_zero_element)
366 {
367 __r._M_data = __builtin_shufflevector(
368 __x._M_data0._M_data, __x._M_data1._M_data,
369 __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
370 }
371 else
372 {
373 __r._M_data = __builtin_shufflevector(
374 __x._M_concat_data(), decltype(__x._M_concat_data())(),
375 __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
376 }
377 }
378 return __r;
379 }
380
      /** @internal
       * @brief Split *this into pieces of type _Vp.
       *
       * @return An array<_Vp, N> when _Vp::_S_size divides _S_size evenly;
       * otherwise a tuple of the N full _Vp pieces followed by one smaller
       * remainder vector holding the trailing _S_size % _Vp::_S_size elements.
       */
      template <typename _Vp>
        [[__gnu__::__always_inline__]]
        constexpr auto
        _M_chunk() const noexcept
        {
          constexpr int __n = _S_size / _Vp::_S_size;
          constexpr int __rem = _S_size % _Vp::_S_size;
          constexpr auto [...__is] = _IotaArray<__n>;
          if constexpr (__rem == 0)
            return array<_Vp, __n> {__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...};
          else
            {
              using _Rest = resize_t<__rem, _Vp>;
              return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...,
                           __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, *this));
            }
        }
398
      /// @internal Concatenation of a single vector is the identity.
      [[__gnu__::__always_inline__]]
      static constexpr basic_vec
      _S_concat(const basic_vec& __x0) noexcept
      { return __x0; }
403
      /** @internal
       * @brief Concatenate two or more vectors of the same value_type into
       * one basic_vec; the part sizes must add up to exactly _S_size.
       */
      template <typename... _As>
        requires (sizeof...(_As) > 1)
        [[__gnu__::__always_inline__]]
        static constexpr basic_vec
        _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
        {
          static_assert(_S_size == (_As::_S_size + ...));
          return __extract_simd_at<basic_vec>(cw<0>, __xs...);
        }
413
      /** @internal
       * Shifts elements to the front by @p _Shift positions (or to the back for negative @p
       * _Shift).
       *
       * This function moves elements towards lower indices (front of the vector).
       * Elements that would shift beyond the vector bounds are replaced with zero. Negative shift
       * values shift in the opposite direction.
       *
       * @warning The naming can be confusing due to little-endian byte order:
       * - Despite the name "shifted_to_front", the underlying hardware instruction
       *   shifts bits to the right (psrl...)
       * - The function name refers to element indices, not bit positions
       *
       * @tparam _Shift Number of positions to shift elements towards the front.
       *                Must be -size() < _Shift < size().
       *
       * @return A new vector with elements shifted to front or back.
       *
       * Example:
       * @code
       * __iota<vec<int, 4>>._M_elements_shifted_to_front<2>(); // {2, 3, 0, 0}
       * __iota<vec<int, 4>>._M_elements_shifted_to_front<-2>(); // {0, 0, 0, 1}
       * @endcode
       */
      template <int _Shift, _ArchTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr basic_vec
        _M_elements_shifted_to_front() const
        {
          static_assert(_Shift < _S_size && -_Shift < _S_size);
          if constexpr (_Shift == 0)
            return *this;
#ifdef __SSE2__
          // Runtime path: use the SSE2 whole-register byte shifts.  For
          // const-known values fall through to the generic permute, which
          // constant-folds.
          else if (!__is_const_known(*this))
            {
              // NOTE: the shift argument of these builtins is given in bits,
              // hence the '* 8' on the element byte count.
              if constexpr (sizeof(_M_data) == 16 && _Shift > 0)
                return reinterpret_cast<_DataType>(
                         __builtin_ia32_psrldqi128(__vec_bit_cast<long long>(_M_data),
                                                   _Shift * sizeof(value_type) * 8));
              else if constexpr (sizeof(_M_data) == 16 && _Shift < 0)
                return reinterpret_cast<_DataType>(
                         __builtin_ia32_pslldqi128(__vec_bit_cast<long long>(_M_data),
                                                   -_Shift * sizeof(value_type) * 8));
              else if constexpr (sizeof(_M_data) < 16)
                { // zero-pad smaller vectors to 16 bytes, shift, then extract
                  auto __x = reinterpret_cast<__vec_builtin_type_bytes<long long, 16>>(
                               __vec_zero_pad_to_16(_M_data));
                  if constexpr (_Shift > 0)
                    __x = __builtin_ia32_psrldqi128(__x, _Shift * sizeof(value_type) * 8);
                  else
                    __x = __builtin_ia32_pslldqi128(__x, -_Shift * sizeof(value_type) * 8);
                  return _VecOps<_DataType>::_S_extract(__vec_bit_cast<__canon_value_type>(__x));
                }
            }
#endif
          // Generic fallback: a static permute that maps out-of-range source
          // indices to zero_element.
          return _S_static_permute(*this, [](int __i) consteval {
                   int __off = __i + _Shift;
                   return __off >= _S_size || __off < 0 ? zero_element : __off;
                 });
        }
474
      /** @internal
       * @brief Set padding elements to @p __id; add more padding elements if necessary.
       *
       * @tparam _Vp  Destination vector type; must not be partial and must be
       *              at least as large as *this.
       * @tparam __id Value written to all padding elements (typically the
       *              identity element of a subsequent reduction).
       *
       * @note This function can rearrange the element order since the result is only used for
       * reductions.
       */
      template <typename _Vp, __canon_value_type __id>
        [[__gnu__::__always_inline__]]
        constexpr _Vp
        _M_pad_to_T_with_value() const noexcept
        {
          static_assert(!_Vp::_S_is_partial);
          static_assert(_Ap::_S_nreg == 1);
          if constexpr (sizeof(_Vp) == 32)
            { // when we need to reduce from a 512-bit register
              static_assert(sizeof(_M_data) == 32);
              constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
              return __select_impl(__k, _Vp::_S_init(_M_data), __id);
            }
          else
            {
              static_assert(sizeof(_Vp) <= 16); // => max. 7 Bytes need to be zeroed
              static_assert(sizeof(_M_data) <= sizeof(_Vp));
              _Vp __v1 = __vec_zero_pad_to<sizeof(_Vp)>(_M_data);
              if constexpr (__id == 0 && _S_is_partial)
                // cheapest solution: shift values to the back while shifting in zeros
                // This is valid because we shift out padding elements and use all elements in a
                // subsequent reduction.
                __v1 = __v1.template _M_elements_shifted_to_front<-(_Vp::_S_size - _S_size)>();
              else if constexpr (_Vp::_S_size - _S_size == 1)
                // if a single element needs to be changed, use an insert instruction
                __vec_set(__v1._M_data, _Vp::_S_size - 1, __id);
              else if constexpr (__has_single_bit(unsigned(_Vp::_S_size - _S_size)))
                { // if 2^n elements need to be changed, use a single insert instruction
                  // (reinterpret the vector as wider integers covering the
                  // padding and insert one such integer filled with __id)
                  constexpr int __n = _Vp::_S_size - _S_size;
                  using _Ip = __integer_from<__n * sizeof(__canon_value_type)>;
                  constexpr auto [...__is] = _IotaArray<__n>;
                  constexpr __canon_value_type __idn[__n] = {((void)__is, __id)...};
                  auto __vn = __vec_bit_cast<_Ip>(__v1._M_data);
                  __vec_set(__vn, _Vp::_S_size / __n - 1, __builtin_bit_cast(_Ip, __idn));
                  __v1._M_data = reinterpret_cast<typename _Vp::_DataType>(__vn);
                }
              else if constexpr (__id != 0 && !_S_is_partial)
                { // if __vec_zero_pad_to added zeros in all the places where we need __id, a
                  // bitwise or is sufficient (needs a vector constant for the __id vector, which
                  // isn't optimal)
                  constexpr _Vp __idn([](int __i) {
                    return __i >= _S_size ? __id : __canon_value_type();
                  });
                  __v1._M_data = __vec_or(__v1._M_data, __idn._M_data);
                }
              else if constexpr (__id != 0 || _S_is_partial)
                { // fallback
                  constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
                  __v1 = __select_impl(__k, __v1, __id);
                }
              return __v1;
            }
        }
534
535 [[__gnu__::__always_inline__]]
536 constexpr auto
537 _M_reduce_to_half(auto __binary_op) const
538 {
539 static_assert(__has_single_bit(unsigned(_S_size)));
540 auto [__a, __b] = chunk<_S_size / 2>(*this);
541 return __binary_op(__a, __b);
542 }
543
      /** @internal
       * @brief Reduce the concatenation of *this and @p __rest over
       * @p __binary_op, returning the scalar result.
       *
       * Handles non-power-of-two sizes: *this is typically a power-of-two
       * sized chunk and @p __rest holds the remaining elements, which may be
       * larger, equal, or smaller in size.
       */
      template <typename _Rest, typename _BinaryOp>
        [[__gnu__::__always_inline__]]
        constexpr value_type
        _M_reduce_tail(const _Rest& __rest, _BinaryOp __binary_op) const
        {
          if constexpr (_S_is_scalar)
            return __binary_op(*this, __rest)._M_data;
          else if constexpr (_Rest::_S_size == _S_size)
            // equal sizes: one vertical op, then a full reduction
            return __binary_op(*this, __rest)._M_reduce(__binary_op);
          else if constexpr (_Rest::_S_size > _S_size)
            { // larger tail: chop it into *this-sized pieces and recurse
              auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
              return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
            }
          else if constexpr (_Rest::_S_size == 1)
            // single trailing element: reduce *this first, then combine
            return __binary_op(_Rest(_M_reduce(__binary_op)), __rest)[0];
          else if constexpr (sizeof(_M_data) <= 16
                               && requires { __default_identity_element<__canon_value_type, _BinaryOp>(); })
            { // extend __rest with identity element for more parallelism
              constexpr __canon_value_type __id
                = __default_identity_element<__canon_value_type, _BinaryOp>();
              return __binary_op(_M_data, __rest.template _M_pad_to_T_with_value<basic_vec, __id>())
                       ._M_reduce(__binary_op);
            }
          else
            return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
        }
571
      /** @internal
       * @brief Reduction over @p __binary_op of all (non-padding) elements.
       *
       * @note The implementation assumes it is most efficient to first reduce to one 128-bit SIMD
       * register and then shuffle elements while sticking to 128-bit registers.
       */
      template <typename _BinaryOp, _ArchTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr value_type
        _M_reduce(_BinaryOp __binary_op) const
        {
          // whether _BinaryOp has a known identity element (e.g. 0 for plus<>)
          constexpr bool __have_id_elem
            = requires { __default_identity_element<__canon_value_type, _BinaryOp>(); };
          if constexpr (_S_size == 1)
            return operator[](0);
          else if constexpr (_Traits.template _M_eval_as_f32<value_type>()
                               && (is_same_v<_BinaryOp, plus<>>
                                     || is_same_v<_BinaryOp, multiplies<>>))
            // targets that prefer evaluating this value_type as float:
            // reduce via a float vector and convert the result back
            return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
#ifdef __SSE2__
          else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
                               && is_same_v<decltype(__binary_op), multiplies<>>)
            {
              // convert to unsigned short because of missing 8-bit mul instruction
              // we don't need to preserve the order of elements
              //
              // The left columns under Latency and Throughput show bit-cast to ushort with shift by
              // 8. The right column uses the alternative in the else branch.
              // Benchmark on Intel Ultra 7 165U (AVX2)
              // TYPE       Latency        Throughput
              //            [cycles/call]  [cycles/call]
              //schar,  2    9.11  7.73     3.17  3.21
              //schar,  4   31.6  34.9      5.11  6.97
              //schar,  8   35.7  41.5      7.77  7.17
              //schar, 16   36.7  44.1      6.66  8.96
              //schar, 32   42.2  61.1      8.82 10.1
              if constexpr (!_S_is_partial)
                { // If all elements participate in the reduction we can take this shortcut
                  using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
                  auto __a = __builtin_bit_cast(_V16, *this);
                  return __binary_op(__a, __a >> 8)._M_reduce(__binary_op);
                }
              else
                {
                  using _V16 = rebind_t<unsigned short, basic_vec>;
                  return _V16(*this)._M_reduce(__binary_op);
                }
            }
#endif
          else if constexpr (__has_single_bit(unsigned(_S_size)))
            { // power-of-two sizes: log2(N) shuffle+op steps
              if constexpr (sizeof(_M_data) > 16)
                return _M_reduce_to_half(__binary_op)._M_reduce(__binary_op);
              else if constexpr (_S_size == 2)
                return _M_reduce_to_half(__binary_op)[0];
              else
                {
                  static_assert(_S_size <= 16);
                  auto __x = *this;
#ifdef __SSE2__
                  if constexpr (sizeof(_M_data) <= 16 && is_integral_v<value_type>)
                    { // integer path: combine with element shifts (cheap byte shifts)
                      if constexpr (_S_size > 8)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<8>());
                      if constexpr (_S_size > 4)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<4>());
                      if constexpr (_S_size > 2)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<2>());
                      // We could also call __binary_op with vec<T, 1> arguments. However,
                      // micro-benchmarking on Intel Ultra 7 165U showed this to be more efficient:
                      return __binary_op(__x, __x.template _M_elements_shifted_to_front<1>())[0];
                    }
#endif
                  // generic path: combine with neighbor-swapping permutes
                  if constexpr (_S_size > 8)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<8>()));
                  if constexpr (_S_size > 4)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<4>()));
#ifdef __SSE2__
                  // avoid pshufb by "promoting" to int
                  if constexpr (is_integral_v<value_type> && sizeof(value_type) <= 1)
                    return value_type(resize_t<4, rebind_t<int, basic_vec>>(chunk<4>(__x)[0])
                                        ._M_reduce(__binary_op));
#endif
                  if constexpr (_S_size > 2)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<2>()));
                  if constexpr (is_integral_v<value_type> && sizeof(value_type) == 2)
                    return __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<1>()))[0];
                  else
                    return __binary_op(vec<value_type, 1>(__x[0]), vec<value_type, 1>(__x[1]))[0];
                }
            }
          else if constexpr (sizeof(_M_data) == 32)
            { // non-power-of-two count in a 256-bit register:
              // power-of-two head, remainder handled by _M_reduce_tail
              const auto [__lo, __hi] = chunk<__bit_floor(unsigned(_S_size))>(*this);
              return __lo._M_reduce_tail(__hi, __binary_op);
            }
          else if constexpr (sizeof(_M_data) == 64)
            {
              // e.g. _S_size = 16 + 16 + 15 (vec<char, 47>)
              // -> 8 + 8 + 7 -> 4 + 4 + 3 -> 2 + 2 + 1 -> 1
              auto __chunked = chunk<__bit_floor(unsigned(_S_size)) / 2>(*this);
              using _Cp = decltype(__chunked);
              if constexpr (tuple_size_v<_Cp> == 4)
                {
                  const auto& [__a, __b, __c, __rest] = __chunked;
                  constexpr bool __amd_cpu = _Traits._M_have_sse4a();
                  if constexpr (__have_id_elem && __rest._S_size > 1 && __amd_cpu)
                    { // do one 256-bit op -> one 128-bit op
                      //  4 cycles on Zen4/5          until _M_reduce (short, 26, plus<>)
                      //  9 cycles on Skylake-AVX512  until _M_reduce
                      //  9 cycles on Zen4/5          until _M_reduce (short, 27, multiplies<>)
                      // 17 cycles on Skylake-AVX512  until _M_reduce (short, 27, multiplies<>)
                      // re-chunk into one 256-bit head (shadows __a, __rest)
                      const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
                      using _Vp = remove_cvref_t<decltype(__a)>;
                      constexpr __canon_value_type __id
                        = __default_identity_element<__canon_value_type, _BinaryOp>();
                      const _Vp __b = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
                      return __binary_op(__a, __b)._M_reduce(__binary_op);
                    }
                  else if constexpr (__have_id_elem && __rest._S_size > 1)
                    { // do two 128-bit ops -> one 128-bit op
                      //  5 cycles on Zen4/5          until _M_reduce (short, 26, plus<>)
                      //  7 cycles on Skylake-AVX512  until _M_reduce (short, 26, plus<>)
                      //  9 cycles on Zen4/5          until _M_reduce (short, 27, multiplies<>)
                      // 16 cycles on Skylake-AVX512  until _M_reduce (short, 27, multiplies<>)
                      using _Vp = remove_cvref_t<decltype(__a)>;
                      constexpr __canon_value_type __id
                        = __default_identity_element<__canon_value_type, _BinaryOp>();
                      const _Vp __d = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
                      return __binary_op(__binary_op(__a, __b), __binary_op(__c, __d))
                               ._M_reduce(__binary_op);
                    }
                  else
                    return __binary_op(__binary_op(__a, __b), __c)
                             ._M_reduce_tail(__rest, __binary_op);
                }
              else if constexpr (tuple_size_v<_Cp> == 3)
                {
                  const auto& [__a, __b, __rest] = __chunked;
                  return __binary_op(__a, __b)._M_reduce_tail(__rest, __binary_op);
                }
              else
                static_assert(false);
            }
          else if constexpr (__have_id_elem)
            { // pad up to the next power of two with the identity element
              constexpr __canon_value_type __id
                = __default_identity_element<__canon_value_type, _BinaryOp>();
              using _Vp = resize_t<__bit_ceil(unsigned(_S_size)), basic_vec>;
              return _M_pad_to_T_with_value<_Vp, __id>()._M_reduce(__binary_op);
            }
          else
            { // no identity element: power-of-two head plus tail reduction
              const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
              return __a._M_reduce_tail(__rest, __binary_op);
            }
        }
729
      // [simd.math] ----------------------------------------------------------
      //
      // ISO/IEC 60559 on the classification operations (5.7.2 General Operations):
      // "They are never exceptional, even for signaling NaNs."
      //
      /// @internal Element-wise isnan.
      template <_OptTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isnan() const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: NaN assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isnan(_M_data));
          else if constexpr (_S_use_bitmask)
            // x unordered with itself <=> x is NaN
            return _M_isunordered(*this);
          else if constexpr (!_Traits._M_support_snan())
            // NaN is the only value comparing unequal to itself; a vector ==
            // would raise on signaling NaN, so only used without sNaN support
            return !(*this == *this);
          else if (__is_const_known(_M_data))
            return mask_type([&](int __i) { return std::isnan(_M_data[__i]); });
          else
            {
              // 60559: NaN is represented as Inf + non-zero mantissa bits
              using _Ip = __integer_from<sizeof(value_type)>;
              return __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity())
                       < __builtin_bit_cast(rebind_t<_Ip, basic_vec>, _M_fabs());
            }
        }
758
      /// @internal Element-wise isinf (never raises FP exceptions, see the
      /// [simd.math] note above).
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isinf() const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: Inf assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isinf(_M_data));
          else if (__is_const_known(_M_data))
            return mask_type([&](int __i) { return std::isinf(_M_data[__i]); });
#ifdef _GLIBCXX_X86
          else if constexpr (_S_use_bitmask)
            return mask_type::_S_init(__x86_bitmask_isinf(_M_data));
          else if constexpr (_Traits._M_have_avx512dq())
            return __x86_bit_to_vecmask<typename mask_type::_DataType>(
                     __x86_bitmask_isinf(_M_data));
#endif
          else
            {
              // |x| == Inf, compared on the bit representation
              using _Ip = __integer_from<sizeof(value_type)>;
              return __vec_bit_cast<_Ip>(_M_fabs()._M_data)
                       == __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity());
            }
        }
784
      /// @internal Element-wise absolute value for signed integral vectors,
      /// using the GNU vector extension of ?: for an element-wise select.
      /// NOTE(review): the most negative value overflows, as with std::abs.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      _M_abs() const requires signed_integral<value_type>
      { return _M_data < 0 ? -_M_data : _M_data; }
789
790 [[__gnu__::__always_inline__]]
791 constexpr basic_vec
792 _M_fabs() const requires floating_point<value_type>
793 {
794 if constexpr (_S_is_scalar)
795 return std::fabs(_M_data);
796 else
797 return __vec_and(__vec_not(_S_signmask<_DataType>), _M_data);
798 }
799
      /** @internal
       * @brief Element-wise isunordered(*this, __y): true where at least one
       * of the two corresponding elements is NaN.
       */
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: NaN assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isunordered(_M_data, __y._M_data));
#ifdef _GLIBCXX_X86
          else if constexpr (_S_use_bitmask)
            return _M_bitmask_cmp<_X86Cmp::_Unord>(__y._M_data);
#endif
          else
            return mask_type([&](int __i) {
                     return std::isunordered(_M_data[__i], __y._M_data[__i]);
                   });
        }
818
819 /** @internal
820 * Implementation of @ref partial_load.
821 *
822 * @param __mem A pointer to an array of @p __n values. Can be complex or real.
823 * @param __n Read no more than @p __n values from memory. However, depending on @p __mem
824 * alignment, out of bounds reads are benign.
825 */
826 template <typename _Up, _ArchTraits _Traits = {}>
827 static inline basic_vec
828 _S_partial_load(const _Up* __mem, size_t __n)
829 {
830 if constexpr (_S_is_scalar)
831 return __n == 0 ? basic_vec() : basic_vec(static_cast<value_type>(*__mem));
832 else if (__is_const_known_equal_to(__n >= size_t(_S_size), true))
833 return basic_vec(_LoadCtorTag(), __mem);
834 else if constexpr (!__converts_trivially<_Up, value_type>)
835 return static_cast<basic_vec>(rebind_t<_Up, basic_vec>::_S_partial_load(__mem, __n));
836 else
837 {
838#if _GLIBCXX_X86
839 if constexpr (_Traits._M_have_avx512f()
840 || (_Traits._M_have_avx() && sizeof(_Up) >= 4))
841 {
842 const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
843 : mask_type(true);
844 return _S_masked_load(__mem, mask_type::_S_partial_mask_of_n(int(__n)));
845 }
846#endif
847 if (__n >= size_t(_S_size)) [[unlikely]]
848 return basic_vec(_LoadCtorTag(), __mem);
849#if _GLIBCXX_X86 // TODO: where else is this "safe"?
850 // allow out-of-bounds read when it cannot lead to a #GP
851 else if (__is_const_known_equal_to(
852 is_sufficiently_aligned<sizeof(_Up) * _S_full_size>(__mem), true))
853 return __select_impl(mask_type::_S_partial_mask_of_n(int(__n)),
854 basic_vec(_LoadCtorTag(), __mem), basic_vec());
855#endif
856 else if constexpr (_S_size > 4)
857 {
858 alignas(_DataType) byte __dst[sizeof(_DataType)] = {};
859 const byte* __src = reinterpret_cast<const byte*>(__mem);
860 __memcpy_chunks<sizeof(_Up), sizeof(_DataType)>(__dst, __src, __n);
861 return __builtin_bit_cast(_DataType, __dst);
862 }
863 else if (__n == 0) [[unlikely]]
864 return basic_vec();
865 else if constexpr (_S_size == 2)
866 return _DataType {static_cast<value_type>(__mem[0]), 0};
867 else
868 {
869 constexpr auto [...__is] = _IotaArray<_S_size - 2>;
870 return _DataType{
871 static_cast<value_type>(__mem[0]),
872 static_cast<value_type>(__is + 1 < __n ? __mem[__is + 1] : 0)...
873 };
874 }
875 }
876 }
877
      /** @internal
       * Loads elements from @p __mem according to mask @p __k.
       *
       * @param __mem Pointer (in)to array.
       * @param __k Mask controlling which elements to load. For each bit i in the mask:
       *            - If bit i is 1: copy __mem[i] into result[i]
       *            - If bit i is 0: result[i] is default initialized
       *
       * @note This function assumes it's called after determining that no other method
       * (like full load) is more appropriate. Calling with all mask bits set to 1
       * is suboptimal for performance but still correct.
       */
      template <typename _Up, _ArchTraits _Traits = {}>
        static inline basic_vec
        _S_masked_load(const _Up* __mem, mask_type __k)
        {
          if constexpr (_S_size == 1)
            return __k[0] ? static_cast<value_type>(__mem[0]) : value_type();
#if _GLIBCXX_X86
          else if constexpr (_Traits._M_have_avx512f())
            return __x86_masked_load<_DataType>(__mem, __k._M_data);
          else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
            {
              if constexpr (__converts_trivially<_Up, value_type>)
                return __x86_masked_load<_DataType>(__mem, __k._M_data);
              else
                { // masked load as _Up first, convert afterwards
                  using _UV = rebind_t<_Up, basic_vec>;
                  return basic_vec(_UV::_S_masked_load(__mem, typename _UV::mask_type(__k)));
                }
            }
#endif
          else if (__k._M_none_of()) [[unlikely]]
            return basic_vec();
          else if constexpr (_S_is_scalar)
            return basic_vec(static_cast<value_type>(*__mem));
          else
            {
              // Use at least 4-byte __bits in __bit_foreach for better code-gen
              _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
              [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
              if constexpr (__converts_trivially<_Up, value_type>)
                { // scatter the selected elements into a zeroed vector
                  _DataType __r = {};
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __r[__i] = __mem[__i];
                  });
                  return __r;
                }
              else
                { // gather into a zeroed _Up buffer, then convert in one go
                  using _UV = rebind_t<_Up, basic_vec>;
                  alignas(_UV) _Up __tmp[sizeof(_UV) / sizeof(_Up)] = {};
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __tmp[__i] = __mem[__i];
                  });
                  return basic_vec(__builtin_bit_cast(_UV, __tmp));
                }
            }
        }
938
939 template <typename _Up>
940 [[__gnu__::__always_inline__]]
941 inline void
942 _M_store(_Up* __mem) const
943 {
944 if constexpr (__converts_trivially<value_type, _Up>)
945 __builtin_memcpy(__mem, &_M_data, sizeof(_Up) * _S_size);
946 else
947 rebind_t<_Up, basic_vec>(*this)._M_store(__mem);
948 }
949
    /** @internal
     * Implementation of @ref partial_store: stores the first @p __n elements of
     * @p __v to @p __mem (all elements if @p __n >= size(); nothing if @p __n == 0).
     *
     * @note This is a static function to allow passing @p __v via register in case the function
     * is not inlined.
     *
     * @note The function is not marked @c __always_inline__ since code-gen can become fairly
     * long.
     */
    template <typename _Up, _ArchTraits _Traits = {}>
      static inline void
      _S_partial_store(const basic_vec __v, _Up* __mem, size_t __n)
      {
        // If the optimizer can prove __n covers the whole vec, emit a full store.
        if (__is_const_known_equal_to(__n >= _S_size, true))
          __v._M_store(__mem);
#if _GLIBCXX_X86
        else if constexpr (_Traits._M_have_avx512f() && !_S_is_scalar)
          {
            // AVX-512: translate the element count into a mask and do a masked store.
            const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
                                           : mask_type(true);
            return _S_masked_store(__v, __mem, __k);
          }
#endif
        else if (__n >= _S_size) [[unlikely]]
          __v._M_store(__mem);
        else if (__n == 0) [[unlikely]]
          return;
        else if constexpr (__converts_trivially<value_type, _Up>)
          {
            // Bytewise copy of the first __n elements via __memcpy_chunks.
            byte* __dst = reinterpret_cast<byte*>(__mem);
            const byte* __src = reinterpret_cast<const byte*>(&__v._M_data);
            __memcpy_chunks<sizeof(_Up), sizeof(_M_data)>(__dst, __src, __n);
          }
        else
          {
            // Convert to a vec of _Up and let that type do the partial store.
            using _UV = rebind_t<_Up, basic_vec>;
            _UV::_S_partial_store(_UV(__v), __mem, __n);
          }
      }
989
    /** @internal
     * Stores elements of @p __v to @p __mem according to mask @p __k.
     *
     * @param __v Values to store to @p __mem.
     * @param __mem Pointer (in)to array.
     * @param __k Mask controlling which elements to store. For each bit i in the mask:
     *   - If bit i is 1: store __v[i] to __mem[i]
     *   - If bit i is 0: __mem[i] is left unchanged
     *
     * @note This function assumes it's called after determining that no other method
     * (like full store) is more appropriate. Calling with all mask bits set to 1
     * is suboptimal for performance but still correct.
     */
    template <typename _Up, _ArchTraits _Traits = {}>
      //[[__gnu__::__always_inline__]] // deliberately not always-inline: code-gen can be long
      static inline void
      _S_masked_store(const basic_vec __v, _Up* __mem, const mask_type __k)
      {
#if _GLIBCXX_X86
        // AVX-512 provides masked stores for all element widths.
        if constexpr (_Traits._M_have_avx512f())
          {
            __x86_masked_store(__v._M_data, __mem, __k._M_data);
            return;
          }
        // AVX masked stores are available only for 4- and 8-byte elements.
        else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
          {
            if constexpr (__converts_trivially<value_type, _Up>)
              __x86_masked_store(__v._M_data, __mem, __k._M_data);
            else
              {
                // Convert to a vec of _Up first and store through that type.
                using _UV = rebind_t<_Up, basic_vec>;
                _UV::_S_masked_store(_UV(__v), __mem, typename _UV::mask_type(__k));
              }
            return;
          }
#endif
        if (__k._M_none_of()) [[unlikely]]
          return;
        else if constexpr (_S_is_scalar)
          // at least one mask bit is set (checked above), so store the one element
          __mem[0] = __v._M_data;
        else
          {
            // Use at least 4-byte __bits in __bit_foreach for better code-gen
            _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
            [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
            if constexpr (__converts_trivially<value_type, _Up>)
              {
                __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                  __mem[__i] = __v[__i];
                });
              }
            else
              {
                // Convert once up front, then scatter only the selected elements.
                const rebind_t<_Up, basic_vec> __cvted(__v);
                __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                  __mem[__i] = __cvted[__i];
                });
              }
          }
      }
1050
    // [simd.overview] default constructor ----------------------------------
    /// Trivial default constructor; element values are left indeterminate.
    basic_vec() = default;
1053
    // [simd.overview] p2 impl-def conversions ------------------------------
    // The GCC vector builtin type that corresponds to this vec; for the scalar
    // case a 1-element vector builtin is used instead of the plain scalar.
    using _NativeVecType = decltype([] {
      if constexpr (_S_is_scalar)
        return __vec_builtin_type<__canon_value_type, 1>();
      else
        return _DataType();
    }());
    /**
     * @brief Converting constructor from GCC vector builtins.
     *
     * This constructor enables direct construction from GCC vector builtins
     * (`[[gnu::vector_size(N)]]`).
     *
     * @param __x GCC vector builtin to convert from.
     *
     * @note This constructor is not available when size() equals 1.
     * NOTE(review): the _S_is_scalar branch below does accept a 1-element
     * vector builtin — confirm whether this @note is stale.
     *
     * @see operator _NativeVecType() for the reverse conversion.
     */
    constexpr
    basic_vec(_NativeVecType __x)
    : _M_data([&] [[__gnu__::__always_inline__]] {
        if constexpr (_S_is_scalar)
          return __x[0]; // unwrap the single element of the 1-element builtin
        else
          return __x;
      }())
    {}
1082
1083 /**
1084 * @brief Conversion operator to GCC vector builtins.
1085 *
1086 * This operator enables implicit conversion from basic_vec to GCC vector builtins.
1087 *
1088 * @note This operator is not available when size() equals 1.
1089 *
1090 * @see basic_vec(_NativeVecType) for the reverse conversion.
1091 */
1092 constexpr
1093 operator _NativeVecType() const
1094 {
1095 if constexpr (_S_is_scalar)
1096 return _NativeVecType{_M_data};
1097 else
1098 return _M_data;
1099 }
1100
#if _GLIBCXX_X86
    /**
     * @brief Converting constructor from Intel Intrinsics (__m128, __m128i, ...).
     *
     * Constrained to intrinsic vector types of the same size as _DataType (and
     * at least 16 bytes); the value is reinterpreted bitwise.
     */
    template <__vec_builtin _IV>
      requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
        && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
              && !is_same_v<_IV, _DataType>)
      constexpr
      basic_vec(_IV __x)
      // reinterpret_cast between equally sized vector builtins is a bitwise
      // conversion (GCC vector extension)
      : _M_data(reinterpret_cast<_DataType>(__x))
      {}

    /**
     * @brief Conversion operator to Intel Intrinsics (__m128, __m128i, ...).
     */
    template <__vec_builtin _IV>
      requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
        && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
              && !is_same_v<_IV, _DataType>)
      constexpr
      operator _IV() const
      { return reinterpret_cast<_IV>(_M_data); }
#endif
1125
    // [simd.ctor] broadcast constructor ------------------------------------
    /**
     * @brief Broadcast constructor from scalar value.
     *
     * Constructs a vector where all elements are initialized to the same scalar value.
     * The scalar value is converted to the vector's element type.
     *
     * @param __x Scalar value to broadcast to all vector elements.
     * @tparam _Up Type of scalar value (must be explicitly convertible to value_type).
     *
     * @note The constructor is implicit if the conversion (if any) is value-preserving.
     */
    template <__broadcast_constructible<value_type> _Up>
      [[__gnu__::__always_inline__]]
      constexpr
      basic_vec(_Up&& __x) noexcept
      // A GCC vector extension ?: with a vector condition selects element-wise;
      // the all-true condition '_DataType() == _DataType()' therefore broadcasts
      // the scalar operand to every element. For scalar _DataType this reduces
      // to the plain conversion.
      : _M_data(_DataType() == _DataType() ? static_cast<value_type>(__x) : value_type())
      {}
1144
    // [simd.ctor] conversion constructor -----------------------------------
    // Element-wise conversion from a vec of equal size but different element
    // type. Implicit only if the conversion is value-preserving and not to a
    // type of higher rank (see the explicit(...) condition).
    template <typename _Up, typename _UAbi, _TargetTraits _Traits = {}>
      requires (_S_size == _UAbi::_S_size)
        && __explicitly_convertible_to<_Up, value_type>
      [[__gnu__::__always_inline__]]
      constexpr
      explicit(!__value_preserving_convertible_to<_Up, value_type>
                 || __higher_rank_than<_Up, value_type>)
      basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
      : _M_data([&] [[__gnu__::__always_inline__]] {
          if constexpr (_S_is_scalar)
            return static_cast<value_type>(__x[0]);
          else if constexpr (_UAbi::_S_nreg >= 2)
            // __builtin_convertvector (__vec_cast) is inefficient for over-sized inputs.
            // Also e.g. vec<float, 12> -> vec<char, 12> (with SSE2) would otherwise emit 4
            // vcvttps2dq instructions, where only 3 are needed
            return _S_concat(resize_t<__x._N0, basic_vec>(__x._M_data0),
                             resize_t<__x._N1, basic_vec>(__x._M_data1))._M_data;
          else
            return __vec_cast<_DataType>(__x._M_concat_data());
        }())
      {}
1167
    // Inherit the remaining constructors from the base class.
    using _VecBase<_Tp, _Ap>::_VecBase;

    // [simd.ctor] generator constructor ------------------------------------
    // Initializes element i with __gen(i), where the index is passed as a
    // compile-time constant (__simd_size_c) and expanded over an index pack.
    template <__simd_generator_invokable<value_type, _S_size> _Fp>
      [[__gnu__::__always_inline__]]
      constexpr explicit
      basic_vec(_Fp&& __gen)
      : _M_data([&] [[__gnu__::__always_inline__]] {
          constexpr auto [...__is] = _IotaArray<_S_size>;
          return _DataType{static_cast<value_type>(__gen(__simd_size_c<__is>))...};
        }())
      {}
1180
    // [simd.ctor] load constructor -----------------------------------------
    // Tag-dispatched load of _S_size elements of type _Up from __ptr,
    // converting each to value_type.
    template <typename _Up>
      [[__gnu__::__always_inline__]]
      constexpr
      basic_vec(_LoadCtorTag, const _Up* __ptr)
      : _M_data()
      {
        if constexpr (_S_is_scalar)
          _M_data = static_cast<value_type>(__ptr[0]);
        else if consteval
          {
            // memcpy is not usable during constant evaluation; expand element-wise
            constexpr auto [...__is] = _IotaArray<_S_size>;
            _M_data = _DataType{static_cast<value_type>(__ptr[__is])...};
          }
        else
          {
            if constexpr (__converts_trivially<_Up, value_type>)
              // This assumes std::floatN_t to be bitwise equal to float/double
              __builtin_memcpy(&_M_data, __ptr, sizeof(value_type) * _S_size);
            else
              {
                // memcpy into a vector of _Up, then convert the whole vector at once
                __vec_builtin_type<_Up, _S_full_size> __tmp = {};
                __builtin_memcpy(&__tmp, __ptr, sizeof(_Up) * _S_size);
                _M_data = __vec_cast<_DataType>(__tmp);
              }
          }
      }
1208
    // Load constructor from a statically sized contiguous range. The flags may
    // adjust the pointer (e.g. for alignment assumptions) and must allow the
    // element-type conversion (checked by the static_assert in the body).
    template <ranges::contiguous_range _Rg, typename... _Flags>
      requires __static_sized_range<_Rg, _S_size>
        && __vectorizable<ranges::range_value_t<_Rg>>
        && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
      [[__gnu__::__always_inline__]]
      constexpr
      basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
      : basic_vec(_LoadCtorTag(), __flags.template _S_adjust_pointer<basic_vec>(
                                    ranges::data(__range)))
      {
        static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
                                                 _Flags...>);
      }
1222
    // [simd.subscr] --------------------------------------------------------
    /**
     * @brief Return the value of the element at index @p __i.
     *
     * @pre __i >= 0 && __i < size().
     */
    [[__gnu__::__always_inline__]]
    constexpr value_type
    operator[](__simd_size_type __i) const
    {
      __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
      if constexpr (_S_is_scalar)
        return _M_data; // only one element; __i must be 0 per the precondition
      else
        return _M_data[__i];
    }
1239
    // [simd.unary] unary operators -----------------------------------------
    // increment and decrement are implemented in terms of operator+=/-= which avoids UB on
    // padding elements while not breaking UBsan
    /// Element-wise pre-increment.
    [[__gnu__::__always_inline__]]
    constexpr basic_vec&
    operator++() noexcept requires requires(value_type __a) { ++__a; }
    { return *this += value_type(1); }
1247
1248 [[__gnu__::__always_inline__]]
1249 constexpr basic_vec
1250 operator++(int) noexcept requires requires(value_type __a) { __a++; }
1251 {
1252 basic_vec __r = *this;
1253 *this += value_type(1);
1254 return __r;
1255 }
1256
    /// Element-wise pre-decrement.
    [[__gnu__::__always_inline__]]
    constexpr basic_vec&
    operator--() noexcept requires requires(value_type __a) { --__a; }
    { return *this -= value_type(1); }
1261
1262 [[__gnu__::__always_inline__]]
1263 constexpr basic_vec
1264 operator--(int) noexcept requires requires(value_type __a) { __a--; }
1265 {
1266 basic_vec __r = *this;
1267 *this -= value_type(1);
1268 return __r;
1269 }
1270
    /// Element-wise logical NOT: true where the element compares equal to zero.
    [[__gnu__::__always_inline__]]
    constexpr mask_type
    operator!() const noexcept requires requires(value_type __a) { !__a; }
    { return *this == value_type(); }
1275
1276 /**
1277 * @brief Unary plus operator (no-op).
1278 *
1279 * Returns an unchanged copy of the object.
1280 */
1281 [[__gnu__::__always_inline__]]
1282 constexpr basic_vec
1283 operator+() const noexcept requires requires(value_type __a) { +__a; }
1284 { return *this; }
1285
1286 /**
1287 * @brief Unary negation operator.
1288 *
1289 * Returns a new SIMD vector after element-wise negation.
1290 */
1291 [[__gnu__::__always_inline__]]
1292 constexpr basic_vec
1293 operator-() const noexcept requires requires(value_type __a) { -__a; }
1294 { return _S_init(-_M_data); }
1295
1296 /**
1297 * @brief Bitwise NOT / complement operator.
1298 *
1299 * Returns a new SIMD vector after element-wise complement.
1300 */
1301 [[__gnu__::__always_inline__]]
1302 constexpr basic_vec
1303 operator~() const noexcept requires requires(value_type __a) { ~__a; }
1304 { return _S_init(~_M_data); }
1305
1306 // [simd.cassign] binary operators
1307 /**
1308 * @brief Bitwise AND operator.
1309 *
1310 * Returns a new SIMD vector after element-wise AND.
1311 */
1312 [[__gnu__::__always_inline__]]
1313 friend constexpr basic_vec&
1314 operator&=(basic_vec& __x, const basic_vec& __y) noexcept
1315 requires requires(value_type __a) { __a & __a; }
1316 {
1317 __x._M_data &= __y._M_data;
1318 return __x;
1319 }
1320
1321 /**
1322 * @brief Bitwise OR operator.
1323 *
1324 * Returns a new SIMD vector after element-wise OR.
1325 */
1326 [[__gnu__::__always_inline__]]
1327 friend constexpr basic_vec&
1328 operator|=(basic_vec& __x, const basic_vec& __y) noexcept
1329 requires requires(value_type __a) { __a | __a; }
1330 {
1331 __x._M_data |= __y._M_data;
1332 return __x;
1333 }
1334
1335 /**
1336 * @brief Bitwise XOR operator.
1337 *
1338 * Returns a new SIMD vector after element-wise XOR.
1339 */
1340 [[__gnu__::__always_inline__]]
1341 friend constexpr basic_vec&
1342 operator^=(basic_vec& __x, const basic_vec& __y) noexcept
1343 requires requires(value_type __a) { __a ^ __a; }
1344 {
1345 __x._M_data ^= __y._M_data;
1346 return __x;
1347 }
1348
1349 /**
1350 * @brief Applies the compound assignment operator element-wise.
1351 *
1352 * @pre If @c value_type is a signed integral type, the result is representable by @c
1353 * value_type. (This does not apply to padding elements the implementation might add for
1354 * non-power-of-2 widths.) UBsan will only see a call to @c unreachable() on overflow.
1355 *
1356 * @note The overflow detection code is discarded unless UBsan is active.
1357 */
1358 [[__gnu__::__always_inline__]]
1359 friend constexpr basic_vec&
1360 operator+=(basic_vec& __x, const basic_vec& __y) noexcept
1361 requires requires(value_type __a) { __a + __a; }
1362 {
1363 if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
1364 { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
1365 // remove UB of the active elements (so that UBsan can still do its job).
1366 //
1367 // This check is essentially free (at runtime) because DCE removes everything except
1368 // the final change to _M_data. The overflow check is only emitted if UBsan is active.
1369 //
1370 // The alternative would be to always zero padding elements after operations that can
1371 // produce non-zero values. However, right now:
1372 // - auto f(simd::mask<int, 3> k) { return +k; } is a single VPABSD and would have to
1373 // sanitize
1374 // - bit_cast to basic_vec with non-zero padding elements is fine
1375 // - conversion from intrinsics can create non-zero padding elements
1376 // - shuffles are allowed to put whatever they want into padding elements for
1377 // optimization purposes (e.g. for better instruction selection)
1378 using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
1379 const _DataType __result
1380 = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
1381 + reinterpret_cast<_UV>(__y._M_data));
1382 const auto __positive = __y > value_type();
1383 const auto __overflow = __positive != (__result > __x);
1384 if (__overflow._M_any_of())
1385 __builtin_unreachable(); // trigger UBsan
1386 __x._M_data = __result;
1387 }
1388 else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
1389 __x = basic_vec(rebind_t<float, basic_vec>(__x) + __y);
1390 else
1391 __x._M_data += __y._M_data;
1392 return __x;
1393 }
1394
    /** @copydoc operator+=
     */
    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator-=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a - __a; }
    {
      if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
        { // see comment on operator+=
          // Subtract in the unsigned type (wrap-around, no UB) ...
          using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
          const _DataType __result
            = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
                                            - reinterpret_cast<_UV>(__y._M_data));
          // ... and detect signed overflow manually: for subtraction, overflow
          // occurred iff (__y > 0) differs from (__result < __x).
          const auto __positive = __y > value_type();
          const auto __overflow = __positive != (__result < __x);
          if (__overflow._M_any_of())
            __builtin_unreachable(); // trigger UBsan
          __x._M_data = __result;
        }
      else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
        // the target evaluates this value_type via float arithmetic
        __x = basic_vec(rebind_t<float, basic_vec>(__x) - __y);
      else
        __x._M_data -= __y._M_data;
      return __x;
    }
1420
    /** @copydoc operator+=
     */
    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator*=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a * __a; }
    {
      if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
        { // see comment on operator+=
          // report overflow of the active elements to UBsan via unreachable()
          for (int __i = 0; __i < _S_size; ++__i)
            {
              if (__builtin_mul_overflow_p(__x._M_data[__i], __y._M_data[__i], value_type()))
                __builtin_unreachable();
            }
          // multiply in the unsigned type to avoid UB on the padding elements
          using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
          __x._M_data = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
                                                      * reinterpret_cast<_UV>(__y._M_data));
        }

      // 'uint16 * uint16' promotes to int and can therefore lead to UB. The standard does not
      // require to avoid the undefined behavior. It's unnecessary and easy to avoid. It's also
      // unexpected because there's no UB on the vector types (which don't promote).
      else if constexpr (_S_is_scalar && is_unsigned_v<value_type>
                           && is_signed_v<decltype(value_type() * value_type())>)
        __x._M_data = unsigned(__x._M_data) * unsigned(__y._M_data);

      else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
        // the target evaluates this value_type via float arithmetic
        __x = basic_vec(rebind_t<float, basic_vec>(__x) * __y);

      else
        __x._M_data *= __y._M_data;
      return __x;
    }
1454
    /** @copydoc operator+=
     * Division additionally avoids UB on padding elements by replacing the
     * padding divisors with 1 (floating-point) or looping over only the active
     * elements (integral).
     */
    template <_TargetTraits _Traits = {}>
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator/=(basic_vec& __x, const basic_vec& __y) noexcept
        requires requires(value_type __a) { __a / __a; }
      {
        const basic_vec __result([&](int __i) -> value_type { return __x[__i] / __y[__i]; });
        if (__is_const_known(__result))
          // the optimizer already knows the values of the result
          return __x = __result;

#ifdef __SSE2__
        // x86 doesn't have integral SIMD division instructions
        // While division is faster, the required conversions are still a problem:
        // see PR121274, PR121284, and PR121296 for missed optimizations wrt. conversions
        //
        // With only 1 or 2 divisions, the conversion to and from fp is too expensive.
        if constexpr (is_integral_v<value_type> && _S_size > 2
                        && __value_preserving_convertible_to<value_type, double>)
          {
            // If the denominator (y) is known to the optimizer, don't convert to fp because the
            // integral division can be translated into shifts/multiplications.
            if (!__is_const_known(__y))
              {
                // With AVX512FP16 use vdivph for 8-bit integers
                if constexpr (_Traits._M_have_avx512fp16()
                                && __value_preserving_convertible_to<value_type, _Float16>)
                  return __x = basic_vec(rebind_t<_Float16, basic_vec>(__x) / __y);
                else if constexpr (__value_preserving_convertible_to<value_type, float>)
                  return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
                else
                  return __x = basic_vec(rebind_t<double, basic_vec>(__x) / __y);
              }
          }
#endif
        if constexpr (_Traits._M_eval_as_f32<value_type>())
          return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);

        basic_vec __y1 = __y;
        if constexpr (_S_is_partial)
          {
            if constexpr (is_integral_v<value_type>)
              {
                // Assume integral division doesn't have SIMD instructions and must be done per
                // element anyway. Partial vectors should skip their padding elements.
                for (int __i = 0; __i < _S_size; ++__i)
                  __x._M_data[__i] /= __y._M_data[__i];
                return __x;
              }
            else
              // replace the padding divisors with 1 to avoid UB (e.g. div by 0)
              __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
                                   __y, basic_vec(value_type(1)));
          }
        __x._M_data /= __y1._M_data;
        return __x;
      }
1511
    /** @copydoc operator+=
     * Remainder is integral-only; padding divisors are replaced with 1 (or the
     * operation is done per active element) to avoid UB on padding elements.
     */
    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator%=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a % __a; }
    {
      static_assert(is_integral_v<value_type>);
      if constexpr (_S_is_partial)
        {
          // replace the padding divisors with 1 to avoid UB on those elements
          const basic_vec __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
                                               __y, basic_vec(value_type(1)));
          if (__is_const_known(__y1))
            __x._M_data %= __y1._M_data;
          else
            {
              // Assume integral division doesn't have SIMD instructions and must be done per
              // element anyway. Partial vectors should skip their padding elements.
              for (int __i = 0; __i < _S_size; ++__i)
                __x._M_data[__i] %= __y._M_data[__i];
            }
        }
      else
        __x._M_data %= __y._M_data;
      return __x;
    }
1536
    // Element-wise shifts. Preconditions reject negative and too-large shift
    // amounts, both of which are undefined behavior for the builtin operators.
    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator<<=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a) { __a << __a; }
    {
      __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
                                  "negative shift is undefined behavior");
      __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
                                  "too large shift invokes undefined behavior");
      __x._M_data <<= __y._M_data;
      return __x;
    }

    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator>>=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a) { __a >> __a; }
    {
      __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
                                  "negative shift is undefined behavior");
      __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
                                  "too large shift invokes undefined behavior");
      __x._M_data >>= __y._M_data;
      return __x;
    }

    // Overloads shifting every element by the same scalar amount.
    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
    {
      __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
      __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
                                  "too large shift invokes undefined behavior");
      __x._M_data <<= __y;
      return __x;
    }

    [[__gnu__::__always_inline__]]
    friend constexpr basic_vec&
    operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
    {
      __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
      __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
                                  "too large shift invokes undefined behavior");
      __x._M_data >>= __y;
      return __x;
    }
1586
    // [simd.comparison] ----------------------------------------------------
#if _GLIBCXX_X86
    /** @internal
     * Compares *this against @p __y with comparison @p _Cmp, producing a
     * bitmask-based mask_type. If both operands are known to the optimizer the
     * comparison is evaluated at compile time instead of emitting the x86
     * bitmask-compare instruction.
     */
    template <_X86Cmp _Cmp>
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      _M_bitmask_cmp(_DataType __y) const
      {
        static_assert(_S_use_bitmask);
        if (__is_const_known(_M_data, __y))
          {
            constexpr auto [...__is] = _IotaArray<_S_size>;
            // scalar fallback implementing each _X86Cmp comparison
            constexpr auto __cmp_op = [] [[__gnu__::__always_inline__]]
                                        (value_type __a, value_type __b) {
              if constexpr (_Cmp == _X86Cmp::_Eq)
                return __a == __b;
              else if constexpr (_Cmp == _X86Cmp::_Lt)
                return __a < __b;
              else if constexpr (_Cmp == _X86Cmp::_Le)
                return __a <= __b;
              else if constexpr (_Cmp == _X86Cmp::_Unord)
                return std::isunordered(__a, __b);
              else if constexpr (_Cmp == _X86Cmp::_Neq)
                return __a != __b;
              else if constexpr (_Cmp == _X86Cmp::_Nlt)
                return !(__a < __b);
              else if constexpr (_Cmp == _X86Cmp::_Nle)
                return !(__a <= __b);
              else
                static_assert(false);
            };
            // fold the per-element results into a bitmask
            const _Bitmask<_S_size> __bits
              = ((__cmp_op(__vec_get(_M_data, __is), __vec_get(__y, __is))
                    ? (1ULL << __is) : 0) | ...);
            return mask_type::_S_init(__bits);
          }
        else
          return mask_type::_S_init(__x86_bitmask_cmp<_Cmp>(_M_data, __y));
      }
#endif
1626
    // Element-wise comparisons returning mask_type. With a bitmask ABI on x86
    // they funnel through _M_bitmask_cmp; otherwise the builtin vector
    // comparison is used directly.
    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator==(const basic_vec& __x, const basic_vec& __y) noexcept
    {
#if _GLIBCXX_X86
      if constexpr (_S_use_bitmask)
        return __x._M_bitmask_cmp<_X86Cmp::_Eq>(__y._M_data);
      else
#endif
      return mask_type::_S_init(__x._M_data == __y._M_data);
    }

    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
    {
#if _GLIBCXX_X86
      if constexpr (_S_use_bitmask)
        return __x._M_bitmask_cmp<_X86Cmp::_Neq>(__y._M_data);
      else
#endif
      return mask_type::_S_init(__x._M_data != __y._M_data);
    }

    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator<(const basic_vec& __x, const basic_vec& __y) noexcept
    {
#if _GLIBCXX_X86
      if constexpr (_S_use_bitmask)
        return __x._M_bitmask_cmp<_X86Cmp::_Lt>(__y._M_data);
      else
#endif
      return mask_type::_S_init(__x._M_data < __y._M_data);
    }

    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
    {
#if _GLIBCXX_X86
      if constexpr (_S_use_bitmask)
        return __x._M_bitmask_cmp<_X86Cmp::_Le>(__y._M_data);
      else
#endif
      return mask_type::_S_init(__x._M_data <= __y._M_data);
    }

    // > and >= are expressed via the reversed < and <= operators
    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator>(const basic_vec& __x, const basic_vec& __y) noexcept
    { return __y < __x; }

    [[__gnu__::__always_inline__]]
    friend constexpr mask_type
    operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
    { return __y <= __x; }
1684
1685 // [simd.cond] ---------------------------------------------------------
1686 template <_TargetTraits _Traits = {}>
1687 [[__gnu__::__always_inline__]]
1688 friend constexpr basic_vec
1689 __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
1690 {
1691 if constexpr (_S_size == 1)
1692 return __k[0] ? __t : __f;
1693 else if constexpr (_S_use_bitmask)
1694 {
1695#if _GLIBCXX_X86
1696 if (__is_const_known(__k, __t, __f))
1697 return basic_vec([&](int __i) { return __k[__i] ? __t[__i] : __f[__i]; });
1698 else
1699 return __x86_bitmask_blend(__k._M_data, __t._M_data, __f._M_data);
1700#else
1701 static_assert(false, "TODO");
1702#endif
1703 }
1704 else if consteval
1705 {
1706 return __k._M_data ? __t._M_data : __f._M_data;
1707 }
1708 else
1709 {
1710 constexpr bool __uses_simd_register = sizeof(_M_data) >= 8;
1711 using _VO = _VecOps<_DataType>;
1712 if (_VO::_S_is_const_known_equal_to(__f._M_data, 0))
1713 {
1714 if (is_integral_v<value_type> && __uses_simd_register
1715 && _VO::_S_is_const_known_equal_to(__t._M_data, 1))
1716 // This is equivalent to converting the mask into a vec of 0s and 1s. So +__k.
1717 // However, basic_mask::operator+ arrives here; returning +__k would be
1718 // recursive. Instead we use -__k (which is a no-op for vector-masks) and then
1719 // flip all -1 elements to +1 by taking the absolute value.
1720 return basic_vec((-__k)._M_abs());
1721 else
1722 return __vec_and(reinterpret_cast<_DataType>(__k._M_data), __t._M_data);
1723 }
1724 else if (_VecOps<_DataType>::_S_is_const_known_equal_to(__t._M_data, 0))
1725 {
1726 if (is_integral_v<value_type> && __uses_simd_register
1727 && _VO::_S_is_const_known_equal_to(__f._M_data, 1))
1728 return value_type(1) + basic_vec(-__k);
1729 else
1730 return __vec_and(reinterpret_cast<_DataType>(__vec_not(__k._M_data)), __f._M_data);
1731 }
1732 else
1733 {
1734#if _GLIBCXX_X86
1735 // this works around bad code-gen when the compiler can't see that __k is a vector-mask.
1736 // This pattern, is recognized to match the x86 blend instructions, which only consider
1737 // the sign bit of the mask register. Also, without SSE4, if the compiler knows that __k
1738 // is a vector-mask, then the '< 0' is elided.
1739 return __k._M_data < 0 ? __t._M_data : __f._M_data;
1740#endif
1741 return __k._M_data ? __t._M_data : __f._M_data;
1742 }
1743 }
1744 }
1745 };
1746
1747 template <__vectorizable _Tp, __abi_tag _Ap>
1748 requires (_Ap::_S_nreg > 1)
1749 class basic_vec<_Tp, _Ap>
1750 : public _VecBase<_Tp, _Ap>
1751 {
    template <typename, typename>
      friend class basic_vec;

    template <size_t, typename>
      friend class basic_mask;

    // total number of elements
    static constexpr int _S_size = _Ap::_S_size;

    // number of elements in the low part: half of the next power of two >= _S_size
    static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;

    // number of elements in the high part (<= _N0)
    static constexpr int _N1 = _S_size - _N0;

    using _DataType0 = __similar_vec<_Tp, _N0, _Ap>;

    // the implementation (and users) depend on elements being contiguous in memory
    static_assert(_N0 * sizeof(_Tp) == sizeof(_DataType0));

    using _DataType1 = __similar_vec<_Tp, _N1, _Ap>;

    // the two parts together must cover exactly the registers of this ABI
    static_assert(_DataType0::abi_type::_S_nreg + _DataType1::abi_type::_S_nreg == _Ap::_S_nreg);

    static constexpr bool _S_is_scalar = _DataType0::_S_is_scalar;

    // low part (elements [0, _N0)) and high part (elements [_N0, _S_size))
    _DataType0 _M_data0;

    _DataType1 _M_data1;

    // whether the mask ABI uses a bitmask representation
    static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask;

    // whether the high part has padding elements
    static constexpr bool _S_is_partial = _DataType1::_S_is_partial;

    public:
    using value_type = _Tp;

    using mask_type = _VecBase<_Tp, _Ap>::mask_type;
1787
    /// @internal Builds a basic_vec from its low and high parts.
    [[__gnu__::__always_inline__]]
    static constexpr basic_vec
    _S_init(const _DataType0& __x, const _DataType1& __y)
    {
      basic_vec __r;
      __r._M_data0 = __x;
      __r._M_data1 = __y;
      return __r;
    }
1797
    /// @internal Read-only access to the low part.
    [[__gnu__::__always_inline__]]
    constexpr const _DataType0&
    _M_get_low() const
    { return _M_data0; }

    /// @internal Read-only access to the high part.
    [[__gnu__::__always_inline__]]
    constexpr const _DataType1&
    _M_get_high() const
    { return _M_data1; }

    /// @internal True if the optimizer knows the values of both parts.
    [[__gnu__::__always_inline__]]
    friend constexpr bool
    __is_const_known(const basic_vec& __x)
    { return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); }
1812
    /// @internal Concatenates both parts into one vector builtin; the high part
    /// is zero-padded to the size of the low part so the operands of
    /// __vec_concat match. Only the high part's sanitize request is forwarded.
    [[__gnu__::__always_inline__]]
    constexpr auto
    _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
    {
      return __vec_concat(_M_data0._M_concat_data(false),
                          __vec_zero_pad_to<sizeof(_M_data0)>(
                            _M_data1._M_concat_data(__do_sanitize)));
    }
1821
    /// @internal Recursive static permute: the low part maps indices starting
    /// at _Offset, the high part starting at _Offset + _N0.
    template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
      [[__gnu__::__always_inline__]]
      static constexpr basic_vec
      _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
      {
        return _S_init(
                 _DataType0::template _S_static_permute<_Size, _Offset>(__x, __idxmap),
                 _DataType1::template _S_static_permute<_Size, _Offset + _N0>(__x, __idxmap));
      }
1831
      // Split *this into chunks of type _Vp.  If _Vp::_S_size divides _S_size
      // evenly, returns array<_Vp, n>; otherwise returns a tuple of n _Vp
      // chunks plus one smaller remainder chunk (resize_t<rem, _Vp>).
      template <typename _Vp>
        [[__gnu__::__always_inline__]]
        constexpr auto
        _M_chunk() const noexcept
        {
          constexpr int __n = _S_size / _Vp::_S_size;
          constexpr int __rem = _S_size % _Vp::_S_size;
          // C++26 structured-binding pack: __is... = 0, 1, ..., __n - 1
          constexpr auto [...__is] = _IotaArray<__n>;
          if constexpr (__rem == 0)
            return array<_Vp, __n>{__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>,
                                                          _M_data0, _M_data1)...};
          else
            {
              using _Rest = resize_t<__rem, _Vp>;
              return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, _M_data0, _M_data1)...,
                           __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, _M_data0, _M_data1));
            }
        }
1850
      // Concatenation of a single vector is the identity.
      [[__gnu__::__always_inline__]]
      static constexpr const basic_vec&
      _S_concat(const basic_vec& __x0) noexcept
      { return __x0; }

      // Concatenate two or more vectors (total size must equal _S_size) by
      // extracting the element ranges [0, _N0) and [_N0, _S_size) from the
      // pack into the two parts.
      template <typename... _As>
        requires (sizeof...(_As) >= 2)
        [[__gnu__::__always_inline__]]
        static constexpr basic_vec
        _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
        {
          static_assert(_S_size == (_As::_S_size + ...));
          return _S_init(__extract_simd_at<_DataType0>(cw<0>, __xs...),
                         __extract_simd_at<_DataType1>(cw<_N0>, __xs...));
        }
1866
      // Combine the two equally-sized halves element-wise with __binary_op,
      // halving the vector width (first step of a tree reduction).
      [[__gnu__::__always_inline__]]
      constexpr auto
      _M_reduce_to_half(auto __binary_op) const requires (_N0 == _N1)
      { return __binary_op(_M_data0, _M_data1); }

      // Continue a reduction where *this and __rest may have different sizes:
      // recursively split the larger operand (or halve *this) until both sides
      // have equal width, then finish with a plain _M_reduce.
      [[__gnu__::__always_inline__]]
      constexpr value_type
      _M_reduce_tail(const auto& __rest, auto __binary_op) const
      {
        if constexpr (__rest.size() > _S_size)
          {
            // __rest is wider: chunk it into pieces of our size and fold them in
            auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
            return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
          }
        else if constexpr (__rest.size() == _S_size)
          // equal widths: combine once, then reduce the result to a scalar
          return __binary_op(*this, __rest)._M_reduce(__binary_op);
        else
          // __rest is narrower: halve *this first, then retry
          return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
      }
1886
1887 template <typename _BinaryOp, _TargetTraits _Traits = {}>
1888 [[__gnu__::__always_inline__]]
1889 constexpr value_type
1890 _M_reduce(_BinaryOp __binary_op) const
1891 {
1892 if constexpr (_Traits.template _M_eval_as_f32<value_type>()
1893 && (is_same_v<_BinaryOp, plus<>>
1894 || is_same_v<_BinaryOp, multiplies<>>))
1895 return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
1896#ifdef __SSE2__
1897 else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
1898 && is_same_v<decltype(__binary_op), multiplies<>>)
1899 {
1900 // convert to unsigned short because of missing 8-bit mul instruction
1901 // we don't need to preserve the order of elements
1902 //
1903 // The left columns under Latency and Throughput show bit-cast to ushort with shift by
1904 // 8. The right column uses the alternative in the else branch.
1905 // Benchmark on Intel Ultra 7 165U (AVX2)
1906 // TYPE Latency Throughput
1907 // [cycles/call] [cycles/call]
1908 //schar, 64 59.9 70.7 10.5 13.3
1909 //schar, 128 81.4 97.2 12.2 21
1910 //schar, 256 92.4 129 17.2 35.2
1911 if constexpr (_DataType1::_S_is_scalar)
1912 return __binary_op(_DataType1(_M_data0._M_reduce(__binary_op)), _M_data1)[0];
1913 // TODO: optimize trailing scalar (e.g. (8+8)+(8+1))
1914 else if constexpr (_S_size % 2 == 0)
1915 { // If all elements participate in the reduction we can take this shortcut
1916 using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
1917 auto __a = __builtin_bit_cast(_V16, *this);
1918 return __binary_op(__a, __a >> __CHAR_BIT__)._M_reduce(__binary_op);
1919 }
1920 else
1921 {
1922 using _V16 = rebind_t<unsigned short, basic_vec>;
1923 return _V16(*this)._M_reduce(__binary_op);
1924 }
1925 }
1926#endif
1927 else
1928 return _M_data0._M_reduce_tail(_M_data1, __binary_op);
1929 }
1930
      // Per-element isnan, applied independently to both parts.
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      _M_isnan() const requires is_floating_point_v<value_type>
      { return mask_type::_S_init(_M_data0._M_isnan(), _M_data1._M_isnan()); }

      // Per-element isinf, applied independently to both parts.
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      _M_isinf() const requires is_floating_point_v<value_type>
      { return mask_type::_S_init(_M_data0._M_isinf(), _M_data1._M_isinf()); }

      // Per-element isunordered(*this, __y), applied part-wise.
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
      {
        return mask_type::_S_init(_M_data0._M_isunordered(__y._M_data0),
                                  _M_data1._M_isunordered(__y._M_data1));
      }

      // Per-element integer abs, applied part-wise.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      _M_abs() const requires signed_integral<value_type>
      { return _S_init(_M_data0._M_abs(), _M_data1._M_abs()); }

      // Per-element floating-point fabs, applied part-wise.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      _M_fabs() const requires floating_point<value_type>
      { return _S_init(_M_data0._M_fabs(), _M_data1._M_fabs()); }
1958
      // Load only the first __n elements from __mem; the remaining elements
      // are value-initialized.  If __n covers the whole first part, that part
      // is loaded completely and the partial load recurses into the tail.
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        static inline basic_vec
        _S_partial_load(const _Up* __mem, size_t __n)
        {
          if (__n >= _N0)
            return _S_init(_DataType0(_LoadCtorTag(), __mem),
                           _DataType1::_S_partial_load(__mem + _N0, __n - _N0));
          else
            return _S_init(_DataType0::_S_partial_load(__mem, __n),
                           _DataType1());
        }
1971
      // Load elements from __mem only where the mask __k is set, splitting
      // both the mask and the address range at the _N0 boundary.
      template <typename _Up, _ArchTraits _Traits = {}>
        static inline basic_vec
        _S_masked_load(const _Up* __mem, mask_type __k)
        {
          return _S_init(_DataType0::_S_masked_load(__mem, __k._M_data0),
                         _DataType1::_S_masked_load(__mem + _N0, __k._M_data1));
        }
1979
      // Store all _S_size elements contiguously to __mem (part 0 at offset 0,
      // part 1 at offset _N0).
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        inline void
        _M_store(_Up* __mem) const
        {
          _M_data0._M_store(__mem);
          _M_data1._M_store(__mem + _N0);
        }
1988
      // Store only the first __n elements of __v to __mem.  Mirrors
      // _S_partial_load: a full first-part store plus recursion into the tail
      // when __n >= _N0, otherwise a partial store of the first part only.
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        static inline void
        _S_partial_store(const basic_vec& __v, _Up* __mem, size_t __n)
        {
          if (__n >= _N0)
            {
              __v._M_data0._M_store(__mem);
              _DataType1::_S_partial_store(__v._M_data1, __mem + _N0, __n - _N0);
            }
          else
            {
              _DataType0::_S_partial_store(__v._M_data0, __mem, __n);
            }
        }
2004
      // Store elements of __v to __mem only where the mask __k is set,
      // splitting mask and address range at the _N0 boundary.
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        static inline void
        _S_masked_store(const basic_vec& __v, _Up* __mem, const mask_type& __k)
        {
          _DataType0::_S_masked_store(__v._M_data0, __mem, __k._M_data0);
          _DataType1::_S_masked_store(__v._M_data1, __mem + _N0, __k._M_data1);
        }
2013
      basic_vec() = default;

      // [simd.overview] p2 impl-def conversions ------------------------------
      // Implementation-defined conversions to/from a single vector builtin of
      // the next power-of-two width.
      using _NativeVecType = __vec_builtin_type<value_type, __bit_ceil(unsigned(_S_size))>;

      // Construct from a native vector builtin by extracting the two parts
      // (the second part at element offset _N0, padded to a power-of-two width).
      [[__gnu__::__always_inline__]]
      constexpr
      basic_vec(const _NativeVecType& __x)
      : _M_data0(_VecOps<__vec_builtin_type<value_type, _N0>>::_S_extract(__x)),
        _M_data1(_VecOps<__vec_builtin_type<value_type, __bit_ceil(unsigned(_N1))>>
                   ::_S_extract(__x, integral_constant<int, _N0>()))
      {}

      // Convert to the native vector builtin by concatenating both parts.
      [[__gnu__::__always_inline__]]
      constexpr
      operator _NativeVecType() const
      { return _M_concat_data(); }
2031
      // [simd.ctor] broadcast constructor ------------------------------------
      // Replicate a single value into every element of both parts.
      template <__broadcast_constructible<value_type> _Up>
        [[__gnu__::__always_inline__]]
        constexpr
        basic_vec(_Up&& __x) noexcept
        : _M_data0(static_cast<value_type>(__x)), _M_data1(static_cast<value_type>(__x))
        {}
2039
      // [simd.ctor] conversion constructor -----------------------------------
      // Element-wise conversion from an equal-sized basic_vec of another
      // element type.  The conversion is explicit unless it is value-preserving
      // and does not convert to a lower-rank type.  The source is chunked at
      // the _N0 boundary so each part converts its own slice.
      template <typename _Up, typename _UAbi>
        requires (_S_size == _UAbi::_S_size)
                   && __explicitly_convertible_to<_Up, value_type>
        [[__gnu__::__always_inline__]]
        constexpr
        explicit(!__value_preserving_convertible_to<_Up, value_type>
                   || __higher_rank_than<_Up, value_type>)
        basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
        : _M_data0(get<0>(chunk<_N0>(__x))),
          _M_data1(get<1>(chunk<_N0>(__x)))
        {}

      // inherit the remaining constructors from the common base
      using _VecBase<_Tp, _Ap>::_VecBase;
2054
      // [simd.ctor] generator constructor ------------------------------------
      // Initialize element i with __gen(simd_size_c<i>).  The second part
      // wraps __gen so its local indices are shifted by _N0 into the range of
      // the whole vector.
      template <__simd_generator_invokable<value_type, _S_size> _Fp>
        [[__gnu__::__always_inline__]]
        constexpr explicit
        basic_vec(_Fp&& __gen)
        : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
            return __gen(__simd_size_c<__i + _N0>);
          })
        {}
2064
      // [simd.ctor] load constructor -----------------------------------------
      // Internal tagged load: read _S_size contiguous elements starting at
      // __ptr (part 1 at offset _N0).
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        constexpr
        basic_vec(_LoadCtorTag, const _Up* __ptr)
        : _M_data0(_LoadCtorTag(), __ptr),
          _M_data1(_LoadCtorTag(), __ptr + _N0)
        {}

      // Public load constructor from a statically sized contiguous range.
      // __flags may adjust the pointer (e.g. to assert alignment); the
      // static_assert checks that the element conversion is allowed under the
      // given flags.
      template <ranges::contiguous_range _Rg, typename... _Flags>
        requires __static_sized_range<_Rg, _S_size>
                   && __vectorizable<ranges::range_value_t<_Rg>>
                   && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
        constexpr
        basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
        : basic_vec(_LoadCtorTag(),
                    __flags.template _S_adjust_pointer<basic_vec>(ranges::data(__range)))
        {
          static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
                                                   _Flags...>);
        }
2086
      // [simd.subscr] --------------------------------------------------------
      // Read element __i.  With a compile-time-known index the matching part
      // is selected directly; otherwise the object is read through a
      // may_alias pointer, which relies on the elements being contiguous in
      // memory (see the static_assert at the top of the class).
      [[__gnu__::__always_inline__]]
      constexpr value_type
      operator[](__simd_size_type __i) const
      {
        __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
        if (__is_const_known(__i))
          return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
        else
          {
            using _AliasingT [[__gnu__::__may_alias__]] = value_type;
            return reinterpret_cast<const _AliasingT*>(this)[__i];
          }
      }
2101
      // [simd.unary] unary operators -----------------------------------------
      // Pre-increment every element; available iff value_type supports ++.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec&
      operator++() noexcept requires requires(value_type __a) { ++__a; }
      {
        ++_M_data0;
        ++_M_data1;
        return *this;
      }
2111
2112 [[__gnu__::__always_inline__]]
2113 constexpr basic_vec
2114 operator++(int) noexcept requires requires(value_type __a) { __a++; }
2115 {
2116 basic_vec __r = *this;
2117 ++_M_data0;
2118 ++_M_data1;
2119 return __r;
2120 }
2121
      // Pre-decrement every element; available iff value_type supports --.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec&
      operator--() noexcept requires requires(value_type __a) { --__a; }
      {
        --_M_data0;
        --_M_data1;
        return *this;
      }
2130
2131 [[__gnu__::__always_inline__]]
2132 constexpr basic_vec
2133 operator--(int) noexcept requires requires(value_type __a) { __a--; }
2134 {
2135 basic_vec __r = *this;
2136 --_M_data0;
2137 --_M_data1;
2138 return __r;
2139 }
2140
      // Element-wise logical negation, yielding a mask.
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      operator!() const noexcept requires requires(value_type __a) { !__a; }
      { return mask_type::_S_init(!_M_data0, !_M_data1); }

      // Unary plus is the identity.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      operator+() const noexcept requires requires(value_type __a) { +__a; }
      { return *this; }

      // Element-wise negation, applied part-wise.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      operator-() const noexcept requires requires(value_type __a) { -__a; }
      { return _S_init(-_M_data0, -_M_data1); }

      // Element-wise bitwise complement, applied part-wise.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      operator~() const noexcept requires requires(value_type __a) { ~__a; }
      { return _S_init(~_M_data0, ~_M_data1); }
2160
      // [simd.cassign] -------------------------------------------------------
      // Define all element-wise compound-assignment operators (op= applied to
      // each part).  Note: no comments inside the macro body — a // comment
      // would swallow the line-continuation backslash.
#define _GLIBCXX_SIMD_DEFINE_OP(sym) \
      [[__gnu__::__always_inline__]] \
      friend constexpr basic_vec& \
      operator sym##=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT \
      { \
        __x._M_data0 sym##= __y._M_data0; \
        __x._M_data1 sym##= __y._M_data1; \
        return __x; \
      }

      _GLIBCXX_SIMD_DEFINE_OP(+)
      _GLIBCXX_SIMD_DEFINE_OP(-)
      _GLIBCXX_SIMD_DEFINE_OP(*)
      _GLIBCXX_SIMD_DEFINE_OP(/)
      _GLIBCXX_SIMD_DEFINE_OP(%)
      _GLIBCXX_SIMD_DEFINE_OP(&)
      _GLIBCXX_SIMD_DEFINE_OP(|)
      _GLIBCXX_SIMD_DEFINE_OP(^)
      _GLIBCXX_SIMD_DEFINE_OP(<<)
      _GLIBCXX_SIMD_DEFINE_OP(>>)

#undef _GLIBCXX_SIMD_DEFINE_OP
2184
      // Shift-assign every element left by the same scalar amount.
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
        requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
      {
        __x._M_data0 <<= __y;
        __x._M_data1 <<= __y;
        return __x;
      }

      // Shift-assign every element right by the same scalar amount.
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
        requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
      {
        __x._M_data0 >>= __y;
        __x._M_data1 >>= __y;
        return __x;
      }
2204
      // [simd.comparison] ----------------------------------------------------
      // Element-wise comparisons: each operator compares the two parts
      // independently and combines the partial masks with mask_type::_S_init.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator==(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 == __y._M_data0, __x._M_data1 == __y._M_data1); }

      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 != __y._M_data0, __x._M_data1 != __y._M_data1); }

      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator<(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 < __y._M_data0, __x._M_data1 < __y._M_data1); }

      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 <= __y._M_data0, __x._M_data1 <= __y._M_data1); }

      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator>(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 > __y._M_data0, __x._M_data1 > __y._M_data1); }

      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
      { return mask_type::_S_init(__x._M_data0 >= __y._M_data0, __x._M_data1 >= __y._M_data1); }
2235
      // [simd.cond] ---------------------------------------------------------
      // Blend: element i of the result is __t[i] where __k[i] is set,
      // otherwise __f[i]; mask and operands are split at the _N0 boundary.
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec
      __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
      {
        return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
                       __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
      }
2244 };
2245
  // [simd.overview] deduction guide ------------------------------------------
  // Deduce element type and size from a statically sized contiguous range.
  // The size is taken from std::span's static extent instead of
  // ranges::size(__r) as a workaround (PR117849) — ranges::size is not usable
  // in this constant expression.
  template <ranges::contiguous_range _Rg, typename... _Ts>
    requires __static_sized_range<_Rg>
    basic_vec(_Rg&& __r, _Ts...)
      -> basic_vec<ranges::range_value_t<_Rg>,
                   __deduce_abi_t<ranges::range_value_t<_Rg>,
#if 0 // PR117849
                                  static_cast<__simd_size_type>(ranges::size(__r))>>;
#else
                                  static_cast<__simd_size_type>(decltype(std::span(__r))::extent)>>;
#endif
2257
  // Deduce a vector type from a mask: the element type is the signed integer
  // with the mask's element width, with the ABI rebound accordingly.
  template <size_t _Bytes, typename _Ap>
    basic_vec(basic_mask<_Bytes, _Ap>)
      -> basic_vec<__integer_from<_Bytes>,
                   decltype(__abi_rebind<__integer_from<_Bytes>, basic_mask<_Bytes, _Ap>::size.value,
                                         _Ap>())>;
2263
  // [P3319R5] ----------------------------------------------------------------
  // iota object for scalar arithmetic types: a single element starting at 0.
  template <__vectorizable _Tp>
    requires is_arithmetic_v<_Tp>
    inline constexpr _Tp
    __iota<_Tp> = _Tp();

  // iota object for vectors: element i holds the value i.  The static_assert
  // rejects element types too narrow to represent the largest index.
  template <typename _Tp, typename _Ap>
    inline constexpr basic_vec<_Tp, _Ap>
    __iota<basic_vec<_Tp, _Ap>> = basic_vec<_Tp, _Ap>([](_Tp __i) -> _Tp {
                                    static_assert(_Ap::_S_size - 1 <= numeric_limits<_Tp>::max(),
                                                  "iota object would overflow");
                                    return __i;
                                  });
2277} // namespace simd
2278_GLIBCXX_END_NAMESPACE_VERSION
2279} // namespace std
2280
2281#pragma GCC diagnostic pop
2282#endif // C++26
2283#endif // _GLIBCXX_SIMD_VEC_H
constexpr bool operator<=(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:859
constexpr bool operator>=(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:873
constexpr bool operator<(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:826
constexpr bool operator>(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:866
constexpr complex< _Tp > operator-(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x minus y.
Definition complex:404
constexpr complex< _Tp > operator+(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x plus y.
Definition complex:374
bool is_sufficiently_aligned(_Tp *__ptr)
Is __ptr aligned to an _Align byte boundary?
Definition align.h:118
ISO C++ entities toplevel namespace is std.
_Tp fabs(const std::complex< _Tp > &__z)
fabs(__z) TR1 8.1.8 [tr.c99.cmplx.fabs]
Definition complex:2525
constexpr auto data(_Container &__cont) noexcept(noexcept(__cont.data())) -> decltype(__cont.data())
Return the data pointer of a container.
static constexpr _Tp max() noexcept
Definition limits:328
static constexpr _Tp infinity() noexcept
Definition limits:348