// libstdc++ -- simd_vec.h (implementation detail of <simd>)
1// Implementation of <simd> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_SIMD_VEC_H
26#define _GLIBCXX_SIMD_VEC_H 1
27
28#ifdef _GLIBCXX_SYSHDR
29#pragma GCC system_header
30#endif
31
32#if __cplusplus >= 202400L
33
34#include "simd_mask.h"
35#include "simd_flags.h"
36
37#include <bits/utility.h>
38#include <bits/stl_function.h>
39#include <cmath>
40
41// psabi warnings are bogus because the ABI of the internal types never leaks into user code
42#pragma GCC diagnostic push
43#pragma GCC diagnostic ignored "-Wpsabi"
44
45namespace std _GLIBCXX_VISIBILITY(default)
46{
47_GLIBCXX_BEGIN_NAMESPACE_VERSION
48namespace simd
49{
  // Disabled primary template of basic_vec.
  // Selected whenever the template arguments do not form a valid vec type;
  // the enabled specializations (below) are constrained on __vectorizable
  // element types and __abi_tag ABI types.  All special member functions are
  // deleted with a diagnostic message (C++26 "= delete(reason)") so that an
  // accidental use produces one readable error.
  template <typename _Tp, typename _Ap>
    class basic_vec
    {
    public:
      using value_type = _Tp;

      using abi_type = _Ap;

      using mask_type = basic_mask<0, void>; // disabled

#define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \
                             "of template arguments to basic_vec."

      basic_vec() = delete(_GLIBCXX_DELETE_SIMD);

      ~basic_vec() = delete(_GLIBCXX_DELETE_SIMD);

      basic_vec(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);

      basic_vec& operator=(const basic_vec&) = delete(_GLIBCXX_DELETE_SIMD);

#undef _GLIBCXX_DELETE_SIMD
    };
74
  /** @internal
   * @brief Base class of the enabled basic_vec specializations.
   *
   * Provides the member types, the iterator interface, the size constant, the
   * deleted cross-size/cross-type converting constructors, and all binary
   * operators.  Every binary operator is implemented by copying the left
   * operand and applying the corresponding compound assignment operator of
   * the derived type _Vp, so the per-target implementations only need to
   * provide the compound assignments.
   */
  template <typename _Tp, typename _Ap>
    class _VecBase
    {
      using _Vp = basic_vec<_Tp, _Ap>;

    public:
      using value_type = _Tp;

      using abi_type = _Ap;

      using mask_type = basic_mask<sizeof(_Tp), abi_type>;

      using iterator = __iterator<_Vp>;

      using const_iterator = __iterator<const _Vp>;

      // Iteration over the elements of the derived vec; end() is a
      // default_sentinel_t because __iterator carries its own end condition.
      constexpr iterator
      begin() noexcept
      { return {static_cast<_Vp&>(*this), 0}; }

      constexpr const_iterator
      begin() const noexcept
      { return cbegin(); }

      constexpr const_iterator
      cbegin() const noexcept
      { return {static_cast<const _Vp&>(*this), 0}; }

      constexpr default_sentinel_t
      end() const noexcept
      { return {}; }

      constexpr default_sentinel_t
      cend() const noexcept
      { return {}; }

      // Number of elements, as an integral-constant-like object.
      static constexpr auto size = __simd_size_c<_Ap::_S_size>;

      _VecBase() = default;

      // Converting from a vec of different size is ill-formed; deleted with a
      // message for better diagnostics.
      // LWG issue from 2026-03-04 / P4042R0
      template <typename _Up, typename _UAbi>
        requires (_Ap::_S_size != _UAbi::_S_size)
        _VecBase(const basic_vec<_Up, _UAbi>&) = delete("size mismatch");

      // Same size but element types that are not even explicitly convertible:
      // also deleted with a message.
      template <typename _Up, typename _UAbi>
        requires (_Ap::_S_size == _UAbi::_S_size) && (!__explicitly_convertible_to<_Up, _Tp>)
        explicit
        _VecBase(const basic_vec<_Up, _UAbi>&)
        = delete("the value types are not convertible");

      // Arithmetic operators: copy left-hand side, apply compound assignment.
      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator+(const _Vp& __x, const _Vp& __y) noexcept
      {
        _Vp __r = __x;
        __r += __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator-(const _Vp& __x, const _Vp& __y) noexcept
      {
        _Vp __r = __x;
        __r -= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator*(const _Vp& __x, const _Vp& __y) noexcept
      {
        _Vp __r = __x;
        __r *= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator/(const _Vp& __x, const _Vp& __y) noexcept
      {
        _Vp __r = __x;
        __r /= __y;
        return __r;
      }

      // The remaining operators only participate in overload resolution when
      // the scalar value_type supports the corresponding operation.
      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator%(const _Vp& __x, const _Vp& __y) noexcept
      requires requires (_Tp __a) { __a % __a; }
      {
        _Vp __r = __x;
        __r %= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator&(const _Vp& __x, const _Vp& __y) noexcept
      requires requires (_Tp __a) { __a & __a; }
      {
        _Vp __r = __x;
        __r &= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator|(const _Vp& __x, const _Vp& __y) noexcept
      requires requires (_Tp __a) { __a | __a; }
      {
        _Vp __r = __x;
        __r |= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator^(const _Vp& __x, const _Vp& __y) noexcept
      requires requires (_Tp __a) { __a ^ __a; }
      {
        _Vp __r = __x;
        __r ^= __y;
        return __r;
      }

      // NOTE(review): the shift operators use _GLIBCXX_SIMD_NOEXCEPT instead
      // of plain noexcept — presumably because out-of-range shift counts are
      // a precondition violation; confirm against the macro's definition.
      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator<<(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires (_Tp __a) { __a << __a; }
      {
        _Vp __r = __x;
        __r <<= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator<<(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires (_Tp __a, __simd_size_type __b) { __a << __b; }
      {
        _Vp __r = __x;
        __r <<= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator>>(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires (_Tp __a) { __a >> __a; }
      {
        _Vp __r = __x;
        __r >>= __y;
        return __r;
      }

      [[__gnu__::__always_inline__]]
      friend constexpr _Vp
      operator>>(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires (_Tp __a, __simd_size_type __b) { __a >> __b; }
      {
        _Vp __r = __x;
        __r >>= __y;
        return __r;
      }
    };
242
  /** @internal
   * Tag type selecting the basic_vec constructor that loads from memory
   * (see its uses as basic_vec(_LoadCtorTag(), __mem) in _S_partial_load).
   */
  struct _LoadCtorTag
  {};
245
  /** @internal
   * Number of value bits relevant for shifts of _Tp.  Shift operands smaller
   * than int undergo integral promotion, so the width of the promoted type
   * (at least int) is used.
   */
  template <integral _Tp>
    inline constexpr _Tp __max_shift
      = (sizeof(_Tp) < sizeof(int) ? sizeof(int) : sizeof(_Tp)) * __CHAR_BIT__;
249
  // Specialization of basic_vec for a vectorizable element type whose ABI tag
  // maps the whole vec onto a single SIMD register (_S_nreg == 1).
  template <__vectorizable _Tp, __abi_tag _Ap>
    requires (_Ap::_S_nreg == 1)
    class basic_vec<_Tp, _Ap>
    : public _VecBase<_Tp, _Ap>
    {
      template <typename, typename>
        friend class basic_vec;

      template <size_t, typename>
        friend class basic_mask;

      // Number of elements visible to the user.
      static constexpr int _S_size = _Ap::_S_size;

      // _S_size rounded up to the next power of two: the lane count of the
      // underlying register, including padding lanes.
      static constexpr int _S_full_size = __bit_ceil(unsigned(_S_size));

      static constexpr bool _S_is_scalar = _S_size == 1;

      // Whether masks for this vec are stored as bitmasks (per the ABI tag);
      // scalar vecs never use a bitmask.
      static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask && !_S_is_scalar;

      using _DataType = typename _Ap::template _DataType<_Tp>;

      /** @internal
       * @brief Underlying vector data storage.
       *
       * This member holds the vector object using a GNU vector type or a platform-specific vector
       * type determined by the ABI tag. For size 1 vectors, this is a single value (_Tp).
       */
      _DataType _M_data;

      // True if the register has padding lanes beyond the last element.
      static constexpr bool _S_is_partial = sizeof(_M_data) > sizeof(_Tp) * _S_size;

      using __canon_value_type = __canonical_vec_type_t<_Tp>;

    public:
      using value_type = _Tp;

      using mask_type = _VecBase<_Tp, _Ap>::mask_type;

      // internal but public API ----------------------------------------------
289 [[__gnu__::__always_inline__]]
290 static constexpr basic_vec
291 _S_init(_DataType __x)
292 {
293 basic_vec __r;
294 __r._M_data = __x;
295 return __r;
296 }
297
      /** @internal
       * @brief Read-only access to the underlying vector data.
       */
      [[__gnu__::__always_inline__]]
      constexpr const _DataType&
      _M_get() const
      { return _M_data; }
302
      /** @internal
       * @brief True if the value of @p __x is known to the compiler
       * (via __builtin_constant_p, i.e. after inlining and constant folding).
       */
      [[__gnu__::__always_inline__]]
      friend constexpr bool
      __is_const_known(const basic_vec& __x)
      { return __builtin_constant_p(__x._M_data); }
307
      /** @internal
       * @brief The data as a vector builtin, suitable for concatenation.
       *
       * Scalar (size 1) vecs are wrapped into a one-element vector builtin of
       * the canonical value type; otherwise _M_data is returned unchanged.
       * @param __do_sanitize Unused in this single-register specialization.
       */
      [[__gnu__::__always_inline__]]
      constexpr auto
      _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
      {
        if constexpr (_S_is_scalar)
          return __vec_builtin_type<__canon_value_type, 1>{_M_data};
        else
          return _M_data;
      }
317
      /** @internal
       * @brief Construct a basic_vec by statically permuting elements of @p __x.
       *
       * @tparam _Size   Logical size forwarded to sized index-permutation
       *                 functions (defaults to the destination size).
       * @tparam _Offset Offset added to each destination index before invoking
       *                 @p __idxmap.
       * @param __x      Source vec (same value_type, possibly different ABI).
       * @param __idxmap Constant-evaluated map from destination index to source
       *                 index.  May return simd::zero_element (the destination
       *                 element becomes zero) or simd::uninit_element (the
       *                 destination element's value is unspecified).
       */
      template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
        [[__gnu__::__always_inline__]]
        static constexpr basic_vec
        _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
        {
          using _Xp = basic_vec<value_type, _A0>;
          basic_vec __r;
          if constexpr (_S_is_scalar)
            {
              constexpr __simd_size_type __j = [&] consteval {
                if constexpr (__index_permutation_function_sized<_Fp>)
                  return __idxmap(_Offset, _Size);
                else
                  return __idxmap(_Offset);
              }();
              // zero_element and uninit_element both yield a value-initialized
              // (zero) scalar here.
              if constexpr (__j == simd::zero_element || __j == simd::uninit_element)
                return basic_vec();
              else
                static_assert(__j >= 0 && __j < _Xp::_S_size);
              __r._M_data = __x[__j];
            }
          else
            {
              // Extend __idxmap to the full (padded) register width; padding
              // lanes are don't-care.
              auto __idxmap2 = [=](auto __i) consteval {
                if constexpr (int(__i + _Offset) >= _Size) // _S_full_size > _Size
                  return __simd_size_c<simd::uninit_element>;
                else if constexpr (__index_permutation_function_sized<_Fp>)
                  return __simd_size_c<__idxmap(__i + _Offset, _Size)>;
                else
                  return __simd_size_c<__idxmap(__i + _Offset)>;
              };
              // Translate the special indices into __builtin_shufflevector
              // operands: zero_element selects the first lane of the second
              // (zero) source operand, uninit_element becomes -1 (don't-care).
              constexpr auto __adj_idx = [](auto __i) {
                constexpr int __j = __i;
                if constexpr (__j == simd::zero_element)
                  return __simd_size_c<__bit_ceil(unsigned(_Xp::_S_size))>;
                else if constexpr (__j == simd::uninit_element)
                  return __simd_size_c<-1>;
                else
                  {
                    static_assert(__j >= 0 && __j < _Xp::_S_size);
                    return __simd_size_c<__j>;
                  }
              };
              constexpr auto [...__is0] = _IotaArray<_S_size>;
              constexpr bool __needs_zero_element
                = ((__idxmap2(__simd_size_c<__is0>).value == simd::zero_element) || ...);
              constexpr auto [...__is_full] = _IotaArray<_S_full_size>;
              if constexpr (_A0::_S_nreg == 2 && !__needs_zero_element)
                {
                  // Two-register source and no zero lanes needed: shuffle
                  // directly from the two halves.
                  __r._M_data = __builtin_shufflevector(
                                  __x._M_data0._M_data, __x._M_data1._M_data,
                                  __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
                }
              else
                {
                  // Shuffle from the concatenated data; the value-initialized
                  // second operand supplies the zero_element lanes.
                  __r._M_data = __builtin_shufflevector(
                                  __x._M_concat_data(), decltype(__x._M_concat_data())(),
                                  __adj_idx(__idxmap2(__simd_size_c<__is_full>)).value...);
                }
            }
          return __r;
        }
380
      /** @internal
       * @brief Split *this into consecutive chunks of type _Vp.
       *
       * @return array<_Vp, n> when _Vp::_S_size divides _S_size; otherwise a
       * tuple of n full _Vp chunks followed by one smaller remainder vec.
       */
      template <typename _Vp>
        [[__gnu__::__always_inline__]]
        constexpr auto
        _M_chunk() const noexcept
        {
          constexpr int __n = _S_size / _Vp::_S_size;
          constexpr int __rem = _S_size % _Vp::_S_size;
          constexpr auto [...__is] = _IotaArray<__n>;
          if constexpr (__rem == 0)
            return array<_Vp, __n> {__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...};
          else
            {
              using _Rest = resize_t<__rem, _Vp>;
              return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, *this)...,
                           __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, *this));
            }
        }
398
      /** @internal
       * @brief Concatenation of a single vec is the identity.
       */
      [[__gnu__::__always_inline__]]
      static constexpr basic_vec
      _S_concat(const basic_vec& __x0) noexcept
      { return __x0; }
403
      /** @internal
       * @brief Concatenate two or more vecs of the same value_type into this
       * type.  The element counts of the inputs must sum to _S_size.
       */
      template <typename... _As>
        requires (sizeof...(_As) > 1)
        [[__gnu__::__always_inline__]]
        static constexpr basic_vec
        _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
        {
          static_assert(_S_size == (_As::_S_size + ...));
          return __extract_simd_at<basic_vec>(cw<0>, __xs...);
        }
413
      /** @internal
       * Shifts elements to the front by @p _Shift positions (or to the back for negative @p
       * _Shift).
       *
       * This function moves elements towards lower indices (front of the vector).
       * Elements that would shift beyond the vector bounds are replaced with zero. Negative shift
       * values shift in the opposite direction.
       *
       * @warning The naming can be confusing due to little-endian byte order:
       * - Despite the name "shifted_to_front", the underlying hardware instruction
       *   shifts bits to the right (psrl...)
       * - The function name refers to element indices, not bit positions
       *
       * @tparam _Shift Number of positions to shift elements towards the front.
       *                Must be -size() < _Shift < size().
       *
       * @return A new vector with elements shifted to front or back.
       *
       * Example:
       * @code
       * __iota<vec<int, 4>>._M_elements_shifted_to_front<2>();  // {2, 3, 0, 0}
       * __iota<vec<int, 4>>._M_elements_shifted_to_front<-2>(); // {0, 0, 0, 1}
       * @endcode
       */
      template <int _Shift, _ArchTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr basic_vec
        _M_elements_shifted_to_front() const
        {
          static_assert(_Shift < _S_size && -_Shift < _S_size);
          if constexpr (_Shift == 0)
            return *this;
#ifdef __SSE2__
          // For compile-time-known values, fall through to the generic permute
          // below so the result can be constant-folded.
          else if (!__is_const_known(*this))
            {
              // Whole-register byte shifts (psrldq/pslldq); the shift count is
              // given in bits to these builtins.
              if constexpr (sizeof(_M_data) == 16 && _Shift > 0)
                return reinterpret_cast<_DataType>(
                         __builtin_ia32_psrldqi128(__vec_bit_cast<long long>(_M_data),
                                                   _Shift * sizeof(value_type) * 8));
              else if constexpr (sizeof(_M_data) == 16 && _Shift < 0)
                return reinterpret_cast<_DataType>(
                         __builtin_ia32_pslldqi128(__vec_bit_cast<long long>(_M_data),
                                                   -_Shift * sizeof(value_type) * 8));
              else if constexpr (sizeof(_M_data) < 16)
                {
                  // Narrower registers: zero-pad to 128 bits, shift, extract.
                  auto __x = reinterpret_cast<__vec_builtin_type_bytes<long long, 16>>(
                               __vec_zero_pad_to_16(_M_data));
                  if constexpr (_Shift > 0)
                    __x = __builtin_ia32_psrldqi128(__x, _Shift * sizeof(value_type) * 8);
                  else
                    __x = __builtin_ia32_pslldqi128(__x, -_Shift * sizeof(value_type) * 8);
                  return _VecOps<_DataType>::_S_extract(__vec_bit_cast<__canon_value_type>(__x));
                }
            }
#endif
          // Generic fallback: a static permute that maps out-of-range source
          // indices to zero_element.
          return _S_static_permute(*this, [](int __i) consteval {
                   int __off = __i + _Shift;
                   return __off >= _S_size || __off < 0 ? zero_element : __off;
                 });
        }
474
      /** @internal
       * @brief Set padding elements to @p __id; add more padding elements if necessary.
       *
       * Widens (or reinterprets) *this as a full (non-partial) vec _Vp in which
       * every lane beyond _S_size holds @p __id, typically the identity element
       * of a subsequent reduction.
       *
       * @note This function can rearrange the element order since the result is only used for
       * reductions.
       */
      template <typename _Vp, __canon_value_type __id>
        [[__gnu__::__always_inline__]]
        constexpr _Vp
        _M_pad_to_T_with_value() const noexcept
        {
          static_assert(!_Vp::_S_is_partial);
          static_assert(_Ap::_S_nreg == 1);
          if constexpr (sizeof(_Vp) == 32)
            { // when we need to reduce from a 512-bit register
              static_assert(sizeof(_M_data) == 32);
              constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
              return __select_impl(__k, _Vp::_S_init(_M_data), __id);
            }
          else
            {
              static_assert(sizeof(_Vp) <= 16); // => max. 7 Bytes need to be zeroed
              static_assert(sizeof(_M_data) <= sizeof(_Vp));
              _Vp __v1 = __vec_zero_pad_to<sizeof(_Vp)>(_M_data);
              if constexpr (__id == 0 && _S_is_partial)
                // cheapest solution: shift values to the back while shifting in zeros
                // This is valid because we shift out padding elements and use all elements in a
                // subsequent reduction.
                __v1 = __v1.template _M_elements_shifted_to_front<-(_Vp::_S_size - _S_size)>();
              else if constexpr (_Vp::_S_size - _S_size == 1)
                // if a single element needs to be changed, use an insert instruction
                __vec_set(__v1._M_data, _Vp::_S_size - 1, __id);
              else if constexpr (__has_single_bit(unsigned(_Vp::_S_size - _S_size)))
                { // if 2^n elements need to be changed, use a single insert instruction
                  // (insert one wider integer that holds n copies of __id)
                  constexpr int __n = _Vp::_S_size - _S_size;
                  using _Ip = __integer_from<__n * sizeof(__canon_value_type)>;
                  constexpr auto [...__is] = _IotaArray<__n>;
                  constexpr __canon_value_type __idn[__n] = {((void)__is, __id)...};
                  auto __vn = __vec_bit_cast<_Ip>(__v1._M_data);
                  __vec_set(__vn, _Vp::_S_size / __n - 1, __builtin_bit_cast(_Ip, __idn));
                  __v1._M_data = reinterpret_cast<typename _Vp::_DataType>(__vn);
                }
              else if constexpr (__id != 0 && !_S_is_partial)
                { // if __vec_zero_pad_to added zeros in all the places where we need __id, a
                  // bitwise or is sufficient (needs a vector constant for the __id vector, which
                  // isn't optimal)
                  constexpr _Vp __idn([](int __i) {
                                 return __i >= _S_size ? __id : __canon_value_type();
                               });
                  __v1._M_data = __vec_or(__v1._M_data, __idn._M_data);
                }
              else if constexpr (__id != 0 || _S_is_partial)
                { // fallback: masked blend of __id into the padding lanes
                  constexpr auto __k = _Vp::mask_type::_S_partial_mask_of_n(_S_size);
                  __v1 = __select_impl(__k, __v1, __id);
                }
              return __v1;
            }
        }
534
      /** @internal
       * @brief Split *this into two halves and combine them with @p __binary_op.
       * Requires a power-of-two element count; returns a vec of half the size.
       */
      [[__gnu__::__always_inline__]]
      constexpr auto
      _M_reduce_to_half(auto __binary_op) const
      {
        static_assert(__has_single_bit(unsigned(_S_size)));
        auto [__a, __b] = chunk<_S_size / 2>(*this);
        return __binary_op(__a, __b);
      }
543
      /** @internal
       * @brief Continue a reduction of *this with the remaining elements in @p __rest.
       *
       * @param __rest      Vec holding the not-yet-reduced tail elements; its
       *                    size may differ from _S_size.
       * @param __binary_op Reduction operation.
       * @return Scalar reduction over all elements of *this and @p __rest.
       */
      template <typename _Rest, typename _BinaryOp>
        [[__gnu__::__always_inline__]]
        constexpr value_type
        _M_reduce_tail(const _Rest& __rest, _BinaryOp __binary_op) const
        {
          if constexpr (_S_is_scalar)
            return __binary_op(*this, __rest)._M_data;
          else if constexpr (_Rest::_S_size == _S_size)
            // equal sizes: combine element-wise, then reduce the result
            return __binary_op(*this, __rest)._M_reduce(__binary_op);
          else if constexpr (_Rest::_S_size > _S_size)
            { // larger tail: peel one chunk of our size off the tail
              auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
              return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
            }
          else if constexpr (_Rest::_S_size == 1)
            // single remaining element: fold it into the finished reduction
            return __binary_op(_Rest(_M_reduce(__binary_op)), __rest)[0];
          else if constexpr (sizeof(_M_data) <= 16
                               && requires { __default_identity_element<__canon_value_type, _BinaryOp>(); })
            { // extend __rest with identity element for more parallelism
              constexpr __canon_value_type __id
                = __default_identity_element<__canon_value_type, _BinaryOp>();
              return __binary_op(_M_data, __rest.template _M_pad_to_T_with_value<basic_vec, __id>())
                       ._M_reduce(__binary_op);
            }
          else
            return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
        }
571
      /** @internal
       * @brief Reduction over @p __binary_op of all (non-padding) elements.
       *
       * @note The implementation assumes it is most efficient to first reduce to one 128-bit SIMD
       * register and then shuffle elements while sticking to 128-bit registers.
       */
      template <typename _BinaryOp, _ArchTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr value_type
        _M_reduce(_BinaryOp __binary_op) const
        {
          // Whether _BinaryOp has a known identity element (enables padding
          // tricks via _M_pad_to_T_with_value).
          constexpr bool __have_id_elem
            = requires { __default_identity_element<__canon_value_type, _BinaryOp>(); };
          if constexpr (_S_size == 1)
            return operator[](0);
          else if constexpr (_Traits.template _M_eval_as_f32<value_type>()
                               && (is_same_v<_BinaryOp, plus<>>
                                     || is_same_v<_BinaryOp, multiplies<>>))
            // targets where this value_type is evaluated as float anyway:
            // reduce in float and convert back
            return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
#ifdef __SSE2__
          else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
                               && is_same_v<decltype(__binary_op), multiplies<>>)
            {
              // convert to unsigned short because of missing 8-bit mul instruction
              // we don't need to preserve the order of elements
              //
              // The left columns under Latency and Throughput show bit-cast to ushort with shift by
              // 8. The right column uses the alternative in the else branch.
              // Benchmark on Intel Ultra 7 165U (AVX2)
              // TYPE            Latency       Throughput
              //               [cycles/call]  [cycles/call]
              //schar,  2      9.11     7.73  3.17     3.21
              //schar,  4      31.6     34.9  5.11     6.97
              //schar,  8      35.7     41.5  7.77     7.17
              //schar, 16      36.7     44.1  6.66     8.96
              //schar, 32      42.2     61.1  8.82     10.1
              if constexpr (!_S_is_partial)
                { // If all elements participate in the reduction we can take this shortcut
                  using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
                  auto __a = __builtin_bit_cast(_V16, *this);
                  return __binary_op(__a, __a >> 8)._M_reduce(__binary_op);
                }
              else
                {
                  using _V16 = rebind_t<unsigned short, basic_vec>;
                  return _V16(*this)._M_reduce(__binary_op);
                }
            }
#endif
          else if constexpr (__has_single_bit(unsigned(_S_size)))
            { // power-of-two sizes: halve repeatedly
              if constexpr (sizeof(_M_data) > 16)
                return _M_reduce_to_half(__binary_op)._M_reduce(__binary_op);
              else if constexpr (_S_size == 2)
                return _M_reduce_to_half(__binary_op)[0];
              else
                {
                  static_assert(_S_size <= 16);
                  auto __x = *this;
#ifdef __SSE2__
                  if constexpr (sizeof(_M_data) <= 16 && is_integral_v<value_type>)
                    { // integer elements: combine with byte-shifted copies
                      if constexpr (_S_size > 8)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<8>());
                      if constexpr (_S_size > 4)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<4>());
                      if constexpr (_S_size > 2)
                        __x = __binary_op(__x, __x.template _M_elements_shifted_to_front<2>());
                      // We could also call __binary_op with vec<T, 1> arguments. However,
                      // micro-benchmarking on Intel Ultra 7 165U showed this to be more efficient:
                      return __binary_op(__x, __x.template _M_elements_shifted_to_front<1>())[0];
                    }
#endif
                  // generic path: swap-neighbor permutes of decreasing stride
                  if constexpr (_S_size > 8)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<8>()));
                  if constexpr (_S_size > 4)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<4>()));
#ifdef __SSE2__
                  // avoid pshufb by "promoting" to int
                  if constexpr (is_integral_v<value_type> && sizeof(value_type) <= 1)
                    return value_type(resize_t<4, rebind_t<int, basic_vec>>(chunk<4>(__x)[0])
                                        ._M_reduce(__binary_op));
#endif
                  if constexpr (_S_size > 2)
                    __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<2>()));
                  if constexpr (is_integral_v<value_type> && sizeof(value_type) == 2)
                    return __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<1>()))[0];
                  else
                    return __binary_op(vec<value_type, 1>(__x[0]), vec<value_type, 1>(__x[1]))[0];
                }
            }
          else if constexpr (sizeof(_M_data) == 32)
            { // non-power-of-two in a 256-bit register: split into a
              // power-of-two head and a tail
              const auto [__lo, __hi] = chunk<__bit_floor(unsigned(_S_size))>(*this);
              return __lo._M_reduce_tail(__hi, __binary_op);
            }
          else if constexpr (sizeof(_M_data) == 64)
            {
              // e.g. _S_size = 16 + 16 + 15 (vec<char, 47>)
              // -> 8 + 8 + 7 -> 4 + 4 + 3 -> 2 + 2 + 1 -> 1
              auto __chunked = chunk<__bit_floor(unsigned(_S_size)) / 2>(*this);
              using _Cp = decltype(__chunked);
              if constexpr (tuple_size_v<_Cp> == 4)
                {
                  const auto& [__a, __b, __c, __rest] = __chunked;
                  constexpr bool __amd_cpu = _Traits._M_have_sse4a();
                  if constexpr (__have_id_elem && __rest._S_size > 1 && __amd_cpu)
                    { // do one 256-bit op -> one 128-bit op
                      // 4 cycles on Zen4/5 until _M_reduce (short, 26, plus<>)
                      // 9 cycles on Skylake-AVX512 until _M_reduce
                      // 9 cycles on Zen4/5 until _M_reduce (short, 27, multiplies<>)
                      // 17 cycles on Skylake-AVX512 until _M_reduce (short, 27, multiplies<>)
                      const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
                      using _Vp = remove_cvref_t<decltype(__a)>;
                      constexpr __canon_value_type __id
                        = __default_identity_element<__canon_value_type, _BinaryOp>();
                      const _Vp __b = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
                      return __binary_op(__a, __b)._M_reduce(__binary_op);
                    }
                  else if constexpr (__have_id_elem && __rest._S_size > 1)
                    { // do two 128-bit ops -> one 128-bit op
                      // 5 cycles on Zen4/5 until _M_reduce (short, 26, plus<>)
                      // 7 cycles on Skylake-AVX512 until _M_reduce (short, 26, plus<>)
                      // 9 cycles on Zen4/5 until _M_reduce (short, 27, multiplies<>)
                      // 16 cycles on Skylake-AVX512 until _M_reduce (short, 27, multiplies<>)
                      using _Vp = remove_cvref_t<decltype(__a)>;
                      constexpr __canon_value_type __id
                        = __default_identity_element<__canon_value_type, _BinaryOp>();
                      const _Vp __d = __rest.template _M_pad_to_T_with_value<_Vp, __id>();
                      return __binary_op(__binary_op(__a, __b), __binary_op(__c, __d))
                               ._M_reduce(__binary_op);
                    }
                  else
                    return __binary_op(__binary_op(__a, __b), __c)
                             ._M_reduce_tail(__rest, __binary_op);
                }
              else if constexpr (tuple_size_v<_Cp> == 3)
                {
                  const auto& [__a, __b, __rest] = __chunked;
                  return __binary_op(__a, __b)._M_reduce_tail(__rest, __binary_op);
                }
              else
                static_assert(false);
            }
          else if constexpr (__have_id_elem)
            { // pad up to a power of two with the identity element
              constexpr __canon_value_type __id
                = __default_identity_element<__canon_value_type, _BinaryOp>();
              using _Vp = resize_t<__bit_ceil(unsigned(_S_size)), basic_vec>;
              return _M_pad_to_T_with_value<_Vp, __id>()._M_reduce(__binary_op);
            }
          else
            { // no identity element: power-of-two head plus explicit tail
              const auto& [__a, __rest] = chunk<__bit_floor(unsigned(_S_size))>(*this);
              return __a._M_reduce_tail(__rest, __binary_op);
            }
        }
729
      // [simd.math] ----------------------------------------------------------
      //
      // ISO/IEC 60559 on the classification operations (5.7.2 General Operations):
      // "They are never exceptional, even for signaling NaNs."
      //
      /** @internal
       * @brief Element-wise isnan.
       */
      template <_OptTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isnan() const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: NaNs assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isnan(_M_data));
          else if constexpr (_S_use_bitmask)
            return _M_isunordered(*this); // NaN is unordered with itself
          else if constexpr (!_Traits._M_support_snan())
            return !(*this == *this); // self-comparison is false only for NaN
          else if (__is_const_known(_M_data))
            return mask_type([&](int __i) { return std::isnan(_M_data[__i]); });
          else
            {
              // 60559: NaN is represented as Inf + non-zero mantissa bits
              // (bit-pattern compare; never raises, even for signaling NaNs)
              using _Ip = __integer_from<sizeof(value_type)>;
              return __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity())
                       < __builtin_bit_cast(rebind_t<_Ip, basic_vec>, _M_fabs());
            }
        }
758
      /** @internal
       * @brief Element-wise isinf.
       */
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isinf() const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: infinities assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isinf(_M_data));
          else if (__is_const_known(_M_data))
            // compile-time-known values: per-element, constant-foldable
            return mask_type([&](int __i) { return std::isinf(_M_data[__i]); });
#ifdef _GLIBCXX_X86
          else if constexpr (_S_use_bitmask)
            return mask_type::_S_init(__x86_bitmask_isinf(_M_data));
          else if constexpr (_Traits._M_have_avx512dq())
            return __x86_bit_to_vecmask<typename mask_type::_DataType>(
                     __x86_bitmask_isinf(_M_data));
#endif
          else
            {
              // |x| has exactly the bit pattern of infinity iff x is +-inf
              using _Ip = __integer_from<sizeof(value_type)>;
              return __vec_bit_cast<_Ip>(_M_fabs()._M_data)
                       == __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity());
            }
        }
784
      /** @internal
       * @brief Element-wise absolute value for signed integral elements.
       *
       * Uses ?: which, for vector operands, selects element-wise (GNU vector
       * extension); for the scalar case it is an ordinary conditional.
       * NOTE(review): the result for the minimum value follows the underlying
       * negation — confirm the intended wrap-around semantics.
       */
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      _M_abs() const requires signed_integral<value_type>
      { return _M_data < 0 ? -_M_data : _M_data; }
789
      /** @internal
       * @brief Element-wise absolute value for floating-point elements,
       * implemented by clearing the sign bit (andnot with the sign mask).
       */
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      _M_fabs() const requires floating_point<value_type>
      {
        if constexpr (_S_is_scalar)
          return std::fabs(_M_data);
        else
          return __vec_and(__vec_not(_S_signmask<_DataType>), _M_data);
      }
799
      /** @internal
       * @brief Element-wise isunordered: true where either operand is NaN.
       */
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
        {
          if constexpr (_Traits._M_finite_math_only())
            return mask_type(false); // -ffinite-math-only: NaNs assumed absent
          else if constexpr (_S_is_scalar)
            return mask_type(std::isunordered(_M_data, __y._M_data));
#ifdef _GLIBCXX_X86
          else if constexpr (_S_use_bitmask)
            return _M_bitmask_cmp<_X86Cmp::_Unord>(__y._M_data);
#endif
          else
            return mask_type([&](int __i) {
                     return std::isunordered(_M_data[__i], __y._M_data[__i]);
                   });
        }
818
819 /** @internal
820 * Implementation of @ref partial_load.
821 *
822 * @param __mem A pointer to an array of @p __n values. Can be complex or real.
823 * @param __n Read no more than @p __n values from memory. However, depending on @p __mem
824 * alignment, out of bounds reads are benign.
825 */
826 template <typename _Up, _ArchTraits _Traits = {}>
827 static inline basic_vec
828 _S_partial_load(const _Up* __mem, size_t __n)
829 {
830 if constexpr (_S_is_scalar)
831 return __n == 0 ? basic_vec() : basic_vec(static_cast<value_type>(*__mem));
832 else if (__is_const_known_equal_to(__n >= size_t(_S_size), true))
833 return basic_vec(_LoadCtorTag(), __mem);
834 else if constexpr (!__converts_trivially<_Up, value_type>)
835 return static_cast<basic_vec>(rebind_t<_Up, basic_vec>::_S_partial_load(__mem, __n));
836 else
837 {
838#if _GLIBCXX_X86
839 if constexpr (_Traits._M_have_avx512f()
840 || (_Traits._M_have_avx() && sizeof(_Up) >= 4))
841 {
842 const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
843 : mask_type(true);
844 return _S_masked_load(__mem, mask_type::_S_partial_mask_of_n(int(__n)));
845 }
846#endif
847 if (__n >= size_t(_S_size)) [[unlikely]]
848 return basic_vec(_LoadCtorTag(), __mem);
849#if _GLIBCXX_X86 // TODO: where else is this "safe"?
850 // allow out-of-bounds read when it cannot lead to a #GP
851 else if (__is_const_known_equal_to(
852 is_sufficiently_aligned<sizeof(_Up) * _S_full_size>(__mem), true))
853 return __select_impl(mask_type::_S_partial_mask_of_n(int(__n)),
854 basic_vec(_LoadCtorTag(), __mem), basic_vec());
855#endif
856 else if constexpr (_S_size > 4)
857 {
858 alignas(_DataType) byte __dst[sizeof(_DataType)] = {};
859 const byte* __src = reinterpret_cast<const byte*>(__mem);
860 __memcpy_chunks<sizeof(_Up), sizeof(_DataType)>(__dst, __src, __n);
861 return __builtin_bit_cast(_DataType, __dst);
862 }
863 else if (__n == 0) [[unlikely]]
864 return basic_vec();
865 else if constexpr (_S_size == 2)
866 return _DataType {static_cast<value_type>(__mem[0]), 0};
867 else
868 {
869 constexpr auto [...__is] = _IotaArray<_S_size - 2>;
870 return _DataType{
871 static_cast<value_type>(__mem[0]),
872 static_cast<value_type>(__is + 1 < __n ? __mem[__is + 1] : 0)...
873 };
874 }
875 }
876 }
877
      /** @internal
       * Loads elements from @p __mem according to mask @p __k.
       *
       * @param __mem Pointer (in)to array.
       * @param __k Mask controlling which elements to load. For each bit i in the mask:
       * - If bit i is 1: copy __mem[i] into result[i]
       * - If bit i is 0: result[i] is default initialized
       *
       * @note This function assumes it's called after determining that no other method
       * (like full load) is more appropriate. Calling with all mask bits set to 1
       * is suboptimal for performance but still correct.
       */
      template <typename _Up, _ArchTraits _Traits = {}>
        static inline basic_vec
        _S_masked_load(const _Up* __mem, mask_type __k)
        {
          if constexpr (_S_size == 1)
            return __k[0] ? static_cast<value_type>(__mem[0]) : value_type();
#if _GLIBCXX_X86
          else if constexpr (_Traits._M_have_avx512f())
            return __x86_masked_load<_DataType>(__mem, __k._M_data);
          else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
            {
              if constexpr (__converts_trivially<_Up, value_type>)
                return __x86_masked_load<_DataType>(__mem, __k._M_data);
              else
                { // masked-load as _Up first, then convert element types
                  using _UV = rebind_t<_Up, basic_vec>;
                  return basic_vec(_UV::_S_masked_load(__mem, typename _UV::mask_type(__k)));
                }
            }
#endif
          else if (__k._M_none_of()) [[unlikely]]
            return basic_vec();
          else if constexpr (_S_is_scalar)
            return basic_vec(static_cast<value_type>(*__mem));
          else
            {
              // Use at least 4-byte __bits in __bit_foreach for better code-gen
              _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
              [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
              if constexpr (__converts_trivially<_Up, value_type>)
                { // gather the selected elements directly into the result
                  _DataType __r = {};
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __r[__i] = __mem[__i];
                  });
                  return __r;
                }
              else
                { // gather into a zeroed _Up buffer, then convert as a whole vec
                  using _UV = rebind_t<_Up, basic_vec>;
                  alignas(_UV) _Up __tmp[sizeof(_UV) / sizeof(_Up)] = {};
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __tmp[__i] = __mem[__i];
                  });
                  return basic_vec(__builtin_bit_cast(_UV, __tmp));
                }
            }
        }
938
      /** @internal
       * Stores all size() elements to @p __mem (unmasked, full store).
       *
       * @param __mem Destination pointer; must point to storage for size() objects of _Up.
       */
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        inline void
        _M_store(_Up* __mem) const
        {
          if constexpr (__converts_trivially<value_type, _Up>)
            // identical bit representation: a single memcpy of the active elements
            __builtin_memcpy(__mem, &_M_data, sizeof(_Up) * _S_size);
          else
            // convert to a vec of _Up first, then store trivially via the branch above
            rebind_t<_Up, basic_vec>(*this)._M_store(__mem);
        }
949
      /** @internal
       * Implementation of @ref partial_store: stores the first min(__n, size()) elements
       * of @p __v to @p __mem.
       *
       * @param __v Values to store.
       * @param __mem Destination pointer.
       * @param __n Number of elements to store; values >= size() store everything.
       *
       * @note This is a static function to allow passing @p __v via register in case the function
       * is not inlined.
       *
       * @note The function is not marked @c __always_inline__ since code-gen can become fairly
       * long.
       */
      template <typename _Up, _ArchTraits _Traits = {}>
        static inline void
        _S_partial_store(const basic_vec __v, _Up* __mem, size_t __n)
        {
          // if the optimizer can prove __n covers everything, emit a plain full store
          if (__is_const_known_equal_to(__n >= _S_size, true))
            __v._M_store(__mem);
#if _GLIBCXX_X86
          else if constexpr (_Traits._M_have_avx512f() && !_S_is_scalar)
            {
              // AVX-512: turn the length into a mask and delegate to the masked store
              const auto __k = __n < _S_size ? mask_type::_S_partial_mask_of_n(int(__n))
                                             : mask_type(true);
              return _S_masked_store(__v, __mem, __k);
            }
#endif
          else if (__n >= _S_size) [[unlikely]]
            __v._M_store(__mem);
          else if (__n == 0) [[unlikely]]
            return;
          else if constexpr (__converts_trivially<value_type, _Up>)
            {
              // copy the leading __n elements as raw bytes in power-of-two chunks
              byte* __dst = reinterpret_cast<byte*>(__mem);
              const byte* __src = reinterpret_cast<const byte*>(&__v._M_data);
              __memcpy_chunks<sizeof(_Up), sizeof(_M_data)>(__dst, __src, __n);
            }
          else
            {
              // element types differ: convert first, then partial-store in _Up's vec type
              using _UV = rebind_t<_Up, basic_vec>;
              _UV::_S_partial_store(_UV(__v), __mem, __n);
            }
        }
989
      /** @internal
       * Stores elements of @p __v to @p __mem according to mask @p __k.
       *
       * @param __v Values to store to @p __mem.
       * @param __mem Pointer (in)to array.
       * @param __k Mask controlling which elements to store. For each bit i in the mask:
       *   - If bit i is 1: store __v[i] to __mem[i]
       *   - If bit i is 0: __mem[i] is left unchanged
       *
       * @note This function assumes it's called after determining that no other method
       * (like full store) is more appropriate. Calling with all mask bits set to 1
       * is suboptimal for performance but still correct.
       */
      template <typename _Up, _ArchTraits _Traits = {}>
        //[[__gnu__::__always_inline__]]
        static inline void
        _S_masked_store(const basic_vec __v, _Up* __mem, const mask_type __k)
        {
#if _GLIBCXX_X86
          if constexpr (_Traits._M_have_avx512f())
            {
              // AVX-512: hardware masked store on the bitmask
              __x86_masked_store(__v._M_data, __mem, __k._M_data);
              return;
            }
          else if constexpr (_Traits._M_have_avx() && (sizeof(_Up) == 4 || sizeof(_Up) == 8))
            {
              if constexpr (__converts_trivially<value_type, _Up>)
                __x86_masked_store(__v._M_data, __mem, __k._M_data);
              else
                {
                  // element types differ: convert, then masked-store in _Up's vec type
                  using _UV = rebind_t<_Up, basic_vec>;
                  _UV::_S_masked_store(_UV(__v), __mem, typename _UV::mask_type(__k));
                }
              return;
            }
#endif
          if (__k._M_none_of()) [[unlikely]]
            return;
          else if constexpr (_S_is_scalar)
            // at least one bit is set (checked above), so for scalar it must be bit 0
            __mem[0] = __v._M_data;
          else
            {
              // Use at least 4-byte __bits in __bit_foreach for better code-gen
              _Bitmask<_S_size < 32 ? 32 : _S_size> __bits = __k._M_to_uint();
              [[assume(__bits != 0)]]; // because of '__k._M_none_of()' branch above
              if constexpr (__converts_trivially<value_type, _Up>)
                {
                  // generic path: scatter the active elements one at a time
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __mem[__i] = __v[__i];
                  });
                }
              else
                {
                  // convert the whole vector once, then scatter the active elements
                  const rebind_t<_Up, basic_vec> __cvted(__v);
                  __bit_foreach(__bits, [&] [[__gnu__::__always_inline__]] (int __i) {
                    __mem[__i] = __cvted[__i];
                  });
                }
            }
        }
1050
      // [simd.overview] default constructor ----------------------------------
      // Trivial default constructor; elements are default-initialized.
      basic_vec() = default;

      // [simd.overview] p2 impl-def conversions ------------------------------
      // The GCC vector builtin type corresponding to this basic_vec: a one-element
      // vector builtin when size() == 1, otherwise _DataType itself.
      using _NativeVecType = decltype([] {
        if constexpr (_S_is_scalar)
          return __vec_builtin_type<__canon_value_type, 1>();
        else
          return _DataType();
      }());
      /**
       * @brief Converting constructor from GCC vector builtins.
       *
       * This constructor enables direct construction from GCC vector builtins
       * (`[[gnu::vector_size(N)]]`).
       *
       * @param __x GCC vector builtin to convert from.
       *
       * @note This constructor is not available when size() equals 1.
       * NOTE(review): the _S_is_scalar branch below does handle a one-element
       * vector builtin — confirm whether this note is stale.
       *
       * @see operator _NativeVecType() for the reverse conversion.
       */
      constexpr
      basic_vec(_NativeVecType __x)
      : _M_data([&] [[__gnu__::__always_inline__]] {
          if constexpr (_S_is_scalar)
            // unwrap the single element from the one-element vector builtin
            return __x[0];
          else
            return __x;
        }())
      {}
1082
1083 /**
1084 * @brief Conversion operator to GCC vector builtins.
1085 *
1086 * This operator enables implicit conversion from basic_vec to GCC vector builtins.
1087 *
1088 * @note This operator is not available when size() equals 1.
1089 *
1090 * @see basic_vec(_NativeVecType) for the reverse conversion.
1091 */
1092 constexpr
1093 operator _NativeVecType() const
1094 {
1095 if constexpr (_S_is_scalar)
1096 return _NativeVecType{_M_data};
1097 else
1098 return _M_data;
1099 }
1100
#if _GLIBCXX_X86
      /**
       * @brief Converting constructor from Intel Intrinsics (__m128, __m128i, ...).
       *
       * Constrained to intrinsic vector types of the same total size as _DataType
       * (at least 16 bytes) whose element type matches the canonical intrinsic
       * element type for value_type, excluding _DataType itself.
       */
      template <__vec_builtin _IV>
        requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
          && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
                && !is_same_v<_IV, _DataType>)
        constexpr
        basic_vec(_IV __x)
        : _M_data(reinterpret_cast<_DataType>(__x))
        {}

      /**
       * @brief Conversion operator to Intel Intrinsics (__m128, __m128i, ...).
       *
       * Same constraints as the converting constructor above; a pure bit
       * reinterpretation, no value conversion.
       */
      template <__vec_builtin _IV>
        requires same_as<__x86_intel_intrin_value_type<value_type>, __vec_value_type<_IV>>
          && (sizeof(_IV) == sizeof(_DataType) && sizeof(_IV) >= 16
                && !is_same_v<_IV, _DataType>)
        constexpr
        operator _IV() const
        { return reinterpret_cast<_IV>(_M_data); }
#endif
1125
      // [simd.ctor] broadcast constructor ------------------------------------
      /**
       * @brief Broadcast constructor from scalar value.
       *
       * Constructs a vector where all elements are initialized to the same scalar value.
       * The scalar value is converted to the vector's element type.
       *
       * @param __x Scalar value to broadcast to all vector elements.
       * @tparam _Up Type of scalar value (must be explicitly convertible to value_type).
       *
       * @note The constructor is implicit if the conversion (if any) is value-preserving.
       */
      template <__explicitly_convertible_to<value_type> _Up>
        [[__gnu__::__always_inline__]]
        constexpr explicit(!__broadcast_constructible<_Up, value_type>)
        basic_vec(_Up&& __x) noexcept
        // '_DataType() == _DataType()' is an always-true vector expression; using it as
        // the condition broadcasts the converted scalar into every element
        : _M_data(_DataType() == _DataType() ? static_cast<value_type>(__x) : value_type())
        {}

      /**
       * @brief Consteval broadcast overload for constant arguments.
       *
       * Selected for arguments usable in constant expressions; uses a
       * value-preserving cast so narrowing is diagnosed at compile time.
       */
      template <__simd_vec_bcast_consteval<value_type> _Up>
        consteval
        basic_vec(_Up&& __x)
        : _M_data(_DataType() == _DataType()
                    ? __value_preserving_cast<value_type>(__x) : value_type())
        {}
1151
      // [simd.ctor] conversion constructor -----------------------------------
      /**
       * @brief Converting constructor from a basic_vec of equal size() with different
       * element type and/or ABI.
       *
       * explicit unless the element conversion is value-preserving and does not
       * increase the conversion rank.
       */
      template <typename _Up, typename _UAbi, _TargetTraits _Traits = {}>
        requires (_S_size == _UAbi::_S_size)
          && __explicitly_convertible_to<_Up, value_type>
        [[__gnu__::__always_inline__]]
        constexpr
        explicit(!__value_preserving_convertible_to<_Up, value_type>
                   || __higher_rank_than<_Up, value_type>)
        basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
        : _M_data([&] [[__gnu__::__always_inline__]] {
            if constexpr (_S_is_scalar)
              return static_cast<value_type>(__x[0]);
            else if constexpr (_UAbi::_S_nreg >= 2)
              // __builtin_convertvector (__vec_cast) is inefficient for over-sized inputs.
              // Also e.g. vec<float, 12> -> vec<char, 12> (with SSE2) would otherwise emit 4
              // vcvttps2dq instructions, where only 3 are needed
              return _S_concat(resize_t<__x._N0, basic_vec>(__x._M_data0),
                               resize_t<__x._N1, basic_vec>(__x._M_data1))._M_data;
            else
              return __vec_cast<_DataType>(__x._M_concat_data());
          }())
        {}

      // inherit further constructors from the base class
      using _VecBase<_Tp, _Ap>::_VecBase;
1176
      // [simd.ctor] generator constructor ------------------------------------
      /**
       * @brief Generator constructor: element i is initialized to
       * static_cast<value_type>(__gen(integral_constant-like index i)).
       *
       * @param __gen Callable invoked with a compile-time index for each of the
       * size() elements.
       */
      template <__simd_generator_invokable<value_type, _S_size> _Fp>
        [[__gnu__::__always_inline__]]
        constexpr explicit
        basic_vec(_Fp&& __gen)
        : _M_data([&] [[__gnu__::__always_inline__]] {
            // expand a structured-binding pack of indices 0 ... size()-1
            constexpr auto [...__is] = _IotaArray<_S_size>;
            return _DataType{static_cast<value_type>(__gen(__simd_size_c<__is>))...};
          }())
        {}
1187
      // [simd.ctor] load constructor -----------------------------------------
      /** @internal
       * Tagged load constructor: reads size() elements of _Up starting at @p __ptr
       * and converts them to value_type.
       */
      template <typename _Up>
        [[__gnu__::__always_inline__]]
        constexpr
        basic_vec(_LoadCtorTag, const _Up* __ptr)
        : _M_data()
        {
          if constexpr (_S_is_scalar)
            _M_data = static_cast<value_type>(__ptr[0]);
          else if consteval
            {
              // constant evaluation cannot use memcpy; read element-wise instead
              constexpr auto [...__is] = _IotaArray<_S_size>;
              _M_data = _DataType{static_cast<value_type>(__ptr[__is])...};
            }
          else
            {
              if constexpr (__converts_trivially<_Up, value_type>)
                // This assumes std::floatN_t to be bitwise equal to float/double
                __builtin_memcpy(&_M_data, __ptr, sizeof(value_type) * _S_size);
              else
                {
                  // load into a temporary _Up vector, then convert to _DataType
                  __vec_builtin_type<_Up, _S_full_size> __tmp = {};
                  __builtin_memcpy(&__tmp, __ptr, sizeof(_Up) * _S_size);
                  _M_data = __vec_cast<_DataType>(__tmp);
                }
            }
        }
1215
      /**
       * @brief Load constructor from a statically sized contiguous range.
       *
       * @param __range Contiguous range of exactly size() vectorizable elements.
       * @param __flags Load flags (e.g. alignment hints); used to adjust the pointer.
       */
      template <ranges::contiguous_range _Rg, typename... _Flags>
        requires __static_sized_range<_Rg, _S_size>
          && __vectorizable<ranges::range_value_t<_Rg>>
          && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
        [[__gnu__::__always_inline__]]
        constexpr
        basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
        : basic_vec(_LoadCtorTag(), __flags.template _S_adjust_pointer<basic_vec>(
                                      ranges::data(__range)))
        {
          // diagnose conversions the given flags do not allow for loads/stores
          static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
                                                   _Flags...>);
        }
1229
      // [simd.subscr] --------------------------------------------------------
      /**
       * @brief Return the value of the element at index @p __i.
       *
       * @pre __i >= 0 && __i < size().
       * @return A copy of the element (read-only access; no reference is exposed).
       */
      [[__gnu__::__always_inline__]]
      constexpr value_type
      operator[](__simd_size_type __i) const
      {
        __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
        if constexpr (_S_is_scalar)
          // scalar storage has no subscript; __i can only be 0 here
          return _M_data;
        else
          return _M_data[__i];
      }
1246
      // [simd.unary] unary operators -----------------------------------------
      // increment and decrement are implemented in terms of operator+=/-= which avoids UB on
      // padding elements while not breaking UBsan

      /// @brief Pre-increment: adds 1 to every element; returns *this.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec&
      operator++() noexcept requires requires(value_type __a) { ++__a; }
      { return *this += value_type(1); }

      /// @brief Post-increment: adds 1 to every element; returns the previous value.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      operator++(int) noexcept requires requires(value_type __a) { __a++; }
      {
        basic_vec __r = *this;
        *this += value_type(1);
        return __r;
      }

      /// @brief Pre-decrement: subtracts 1 from every element; returns *this.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec&
      operator--() noexcept requires requires(value_type __a) { --__a; }
      { return *this -= value_type(1); }

      /// @brief Post-decrement: subtracts 1 from every element; returns the previous value.
      [[__gnu__::__always_inline__]]
      constexpr basic_vec
      operator--(int) noexcept requires requires(value_type __a) { __a--; }
      {
        basic_vec __r = *this;
        *this -= value_type(1);
        return __r;
      }

      /// @brief Logical NOT: returns a mask that is true where the element equals zero.
      [[__gnu__::__always_inline__]]
      constexpr mask_type
      operator!() const noexcept requires requires(value_type __a) { !__a; }
      { return *this == value_type(); }
1282
1283 /**
1284 * @brief Unary plus operator (no-op).
1285 *
1286 * Returns an unchanged copy of the object.
1287 */
1288 [[__gnu__::__always_inline__]]
1289 constexpr basic_vec
1290 operator+() const noexcept requires requires(value_type __a) { +__a; }
1291 { return *this; }
1292
1293 /**
1294 * @brief Unary negation operator.
1295 *
1296 * Returns a new SIMD vector after element-wise negation.
1297 */
1298 [[__gnu__::__always_inline__]]
1299 constexpr basic_vec
1300 operator-() const noexcept requires requires(value_type __a) { -__a; }
1301 { return _S_init(-_M_data); }
1302
1303 /**
1304 * @brief Bitwise NOT / complement operator.
1305 *
1306 * Returns a new SIMD vector after element-wise complement.
1307 */
1308 [[__gnu__::__always_inline__]]
1309 constexpr basic_vec
1310 operator~() const noexcept requires requires(value_type __a) { ~__a; }
1311 { return _S_init(~_M_data); }
1312
      // [simd.cassign] binary operators
      /**
       * @brief Bitwise AND compound assignment.
       *
       * Applies element-wise AND of @p __y into @p __x and returns a reference
       * to @p __x.
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator&=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a & __a; }
      {
        __x._M_data &= __y._M_data;
        return __x;
      }

      /**
       * @brief Bitwise OR compound assignment.
       *
       * Applies element-wise OR of @p __y into @p __x and returns a reference
       * to @p __x.
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator|=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a | __a; }
      {
        __x._M_data |= __y._M_data;
        return __x;
      }

      /**
       * @brief Bitwise XOR compound assignment.
       *
       * Applies element-wise XOR of @p __y into @p __x and returns a reference
       * to @p __x.
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator^=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a ^ __a; }
      {
        __x._M_data ^= __y._M_data;
        return __x;
      }
1355
1356 /**
1357 * @brief Applies the compound assignment operator element-wise.
1358 *
1359 * @pre If @c value_type is a signed integral type, the result is representable by @c
1360 * value_type. (This does not apply to padding elements the implementation might add for
1361 * non-power-of-2 widths.) UBsan will only see a call to @c unreachable() on overflow.
1362 *
1363 * @note The overflow detection code is discarded unless UBsan is active.
1364 */
1365 [[__gnu__::__always_inline__]]
1366 friend constexpr basic_vec&
1367 operator+=(basic_vec& __x, const basic_vec& __y) noexcept
1368 requires requires(value_type __a) { __a + __a; }
1369 {
1370 if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
1371 { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
1372 // remove UB of the active elements (so that UBsan can still do its job).
1373 //
1374 // This check is essentially free (at runtime) because DCE removes everything except
1375 // the final change to _M_data. The overflow check is only emitted if UBsan is active.
1376 //
1377 // The alternative would be to always zero padding elements after operations that can
1378 // produce non-zero values. However, right now:
1379 // - auto f(simd::mask<int, 3> k) { return +k; } is a single VPABSD and would have to
1380 // sanitize
1381 // - bit_cast to basic_vec with non-zero padding elements is fine
1382 // - conversion from intrinsics can create non-zero padding elements
1383 // - shuffles are allowed to put whatever they want into padding elements for
1384 // optimization purposes (e.g. for better instruction selection)
1385 using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
1386 const _DataType __result
1387 = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
1388 + reinterpret_cast<_UV>(__y._M_data));
1389 const auto __positive = __y > value_type();
1390 const auto __overflow = __positive != (__result > __x);
1391 if (__overflow._M_any_of())
1392 __builtin_unreachable(); // trigger UBsan
1393 __x._M_data = __result;
1394 }
1395 else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
1396 __x = basic_vec(rebind_t<float, basic_vec>(__x) + __y);
1397 else
1398 __x._M_data += __y._M_data;
1399 return __x;
1400 }
1401
      /** @copydoc operator+=
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator-=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a - __a; }
      {
        if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
          { // see comment on operator+=
            using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
            const _DataType __result
              = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
                                              - reinterpret_cast<_UV>(__y._M_data));
            // if __y[i] > 0, absence of overflow implies __result[i] < __x[i];
            // a mismatch signals overflow
            const auto __positive = __y > value_type();
            const auto __overflow = __positive != (__result < __x);
            if (__overflow._M_any_of())
              __builtin_unreachable(); // trigger UBsan
            __x._M_data = __result;
          }
        else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
          __x = basic_vec(rebind_t<float, basic_vec>(__x) - __y);
        else
          __x._M_data -= __y._M_data;
        return __x;
      }
1427
      /** @copydoc operator+=
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator*=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a * __a; }
      {
        if constexpr (_S_is_partial && is_integral_v<value_type> && is_signed_v<value_type>)
          { // see comment on operator+=
            // per-element overflow check (only materializes under UBsan) ...
            for (int __i = 0; __i < _S_size; ++__i)
              {
                if (__builtin_mul_overflow_p(__x._M_data[__i], __y._M_data[__i], value_type()))
                  __builtin_unreachable();
              }
            // ... and the actual multiplication in the unsigned type (defined wraparound)
            using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
            __x._M_data = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
                                                        * reinterpret_cast<_UV>(__y._M_data));
          }

        // 'uint16 * uint16' promotes to int and can therefore lead to UB. The standard does not
        // require to avoid the undefined behavior. It's unnecessary and easy to avoid. It's also
        // unexpected because there's no UB on the vector types (which don't promote).
        else if constexpr (_S_is_scalar && is_unsigned_v<value_type>
                             && is_signed_v<decltype(value_type() * value_type())>)
          __x._M_data = unsigned(__x._M_data) * unsigned(__y._M_data);

        else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
          __x = basic_vec(rebind_t<float, basic_vec>(__x) * __y);

        else
          __x._M_data *= __y._M_data;
        return __x;
      }
1461
      /**
       * @brief Element-wise division compound assignment.
       *
       * @pre No element of @p __y is zero (and, for signed integers, no
       * INT_MIN / -1); the per-element division below has the usual C++
       * preconditions.
       */
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        friend constexpr basic_vec&
        operator/=(basic_vec& __x, const basic_vec& __y) noexcept
        requires requires(value_type __a) { __a / __a; }
        {
          // element-wise reference result; if the optimizer can fold it, use it directly
          const basic_vec __result([&](int __i) -> value_type { return __x[__i] / __y[__i]; });
          if (__is_const_known(__result))
            // the optimizer already knows the values of the result
            return __x = __result;

#ifdef __SSE2__
          // x86 doesn't have integral SIMD division instructions
          // While division is faster, the required conversions are still a problem:
          // see PR121274, PR121284, and PR121296 for missed optimizations wrt. conversions
          //
          // With only 1 or 2 divisions, the conversion to and from fp is too expensive.
          if constexpr (is_integral_v<value_type> && _S_size > 2
                          && __value_preserving_convertible_to<value_type, double>)
            {
              // If the denominator (y) is known to the optimizer, don't convert to fp because the
              // integral division can be translated into shifts/multiplications.
              if (!__is_const_known(__y))
                {
                  // With AVX512FP16 use vdivph for 8-bit integers
                  if constexpr (_Traits._M_have_avx512fp16()
                                  && __value_preserving_convertible_to<value_type, _Float16>)
                    return __x = basic_vec(rebind_t<_Float16, basic_vec>(__x) / __y);
                  else if constexpr (__value_preserving_convertible_to<value_type, float>)
                    return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
                  else
                    return __x = basic_vec(rebind_t<double, basic_vec>(__x) / __y);
                }
            }
#endif
          if constexpr (_Traits._M_eval_as_f32<value_type>())
            return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);

          basic_vec __y1 = __y;
          if constexpr (_S_is_partial)
            {
              if constexpr (is_integral_v<value_type>)
                {
                  // Assume integral division doesn't have SIMD instructions and must be done per
                  // element anyway. Partial vectors should skip their padding elements.
                  for (int __i = 0; __i < _S_size; ++__i)
                    __x._M_data[__i] /= __y._M_data[__i];
                  return __x;
                }
              else
                // replace padding elements of the divisor with 1 to avoid dividing by 0
                __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
                                     __y, basic_vec(value_type(1)));
            }
          __x._M_data /= __y1._M_data;
          return __x;
        }
1518
      /**
       * @brief Element-wise remainder compound assignment (integral value_type only).
       *
       * @pre No element of @p __y is zero; usual C++ preconditions of % apply
       * per element.
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator%=(basic_vec& __x, const basic_vec& __y) noexcept
      requires requires(value_type __a) { __a % __a; }
      {
        static_assert(is_integral_v<value_type>);
        if constexpr (_S_is_partial)
          {
            // replace padding elements of the divisor with 1 to avoid % by 0
            const basic_vec __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
                                                 __y, basic_vec(value_type(1)));
            if (__is_const_known(__y1))
              __x._M_data %= __y1._M_data;
            else
              {
                // Assume integral division doesn't have SIMD instructions and must be done per
                // element anyway. Partial vectors should skip their padding elements.
                for (int __i = 0; __i < _S_size; ++__i)
                  __x._M_data[__i] %= __y._M_data[__i];
              }
          }
        else
          __x._M_data %= __y._M_data;
        return __x;
      }
1543
      /**
       * @brief Element-wise left shift by per-element counts.
       *
       * @pre Every shift count is non-negative and smaller than the maximum
       * valid shift for value_type (checked via preconditions below).
       */
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator<<=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a) { __a << __a; }
      {
        __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
                                    "negative shift is undefined behavior");
        __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
                                    "too large shift invokes undefined behavior");
        __x._M_data <<= __y._M_data;
        return __x;
      }

      /// @brief Element-wise right shift by per-element counts (same preconditions).
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator>>=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a) { __a >> __a; }
      {
        __glibcxx_simd_precondition(is_unsigned_v<value_type> || all_of(__y >= value_type()),
                                    "negative shift is undefined behavior");
        __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
                                    "too large shift invokes undefined behavior");
        __x._M_data >>= __y._M_data;
        return __x;
      }

      /// @brief Left shift of all elements by the same scalar count (same preconditions).
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
      {
        __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
        __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
                                    "too large shift invokes undefined behavior");
        __x._M_data <<= __y;
        return __x;
      }

      /// @brief Right shift of all elements by the same scalar count (same preconditions).
      [[__gnu__::__always_inline__]]
      friend constexpr basic_vec&
      operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
      requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
      {
        __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
        __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
                                    "too large shift invokes undefined behavior");
        __x._M_data >>= __y;
        return __x;
      }
1593
      // [simd.comparison] ----------------------------------------------------
#if _GLIBCXX_X86
      /** @internal
       * Compares *this against @p __y with the x86 bitmask compare predicate
       * @p _Cmp and returns the resulting bitmask as mask_type.
       *
       * For operands the optimizer can fully evaluate, the comparison is done
       * element-wise at compile time instead of emitting the intrinsic.
       */
      template <_X86Cmp _Cmp>
        [[__gnu__::__always_inline__]]
        constexpr mask_type
        _M_bitmask_cmp(_DataType __y) const
        {
          static_assert(_S_use_bitmask);
          if (__is_const_known(_M_data, __y))
            {
              constexpr auto [...__is] = _IotaArray<_S_size>;
              // scalar equivalent of each supported x86 compare predicate
              constexpr auto __cmp_op = [] [[__gnu__::__always_inline__]]
                                          (value_type __a, value_type __b) {
                if constexpr (_Cmp == _X86Cmp::_Eq)
                  return __a == __b;
                else if constexpr (_Cmp == _X86Cmp::_Lt)
                  return __a < __b;
                else if constexpr (_Cmp == _X86Cmp::_Le)
                  return __a <= __b;
                else if constexpr (_Cmp == _X86Cmp::_Unord)
                  return std::isunordered(__a, __b);
                else if constexpr (_Cmp == _X86Cmp::_Neq)
                  return __a != __b;
                else if constexpr (_Cmp == _X86Cmp::_Nlt)
                  return !(__a < __b);
                else if constexpr (_Cmp == _X86Cmp::_Nle)
                  return !(__a <= __b);
                else
                  static_assert(false);
              };
              // OR together one bit per element that compares true
              const _Bitmask<_S_size> __bits
                = ((__cmp_op(__vec_get(_M_data, __is), __vec_get(__y, __is))
                      ? (1ULL << __is) : 0) | ...);
              return mask_type::_S_init(__bits);
            }
          else
            return mask_type::_S_init(__x86_bitmask_cmp<_Cmp>(_M_data, __y));
        }
#endif
1633
      /// @brief Element-wise equality; returns a mask, true where elements are equal.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator==(const basic_vec& __x, const basic_vec& __y) noexcept
      {
#if _GLIBCXX_X86
        if constexpr (_S_use_bitmask)
          return __x._M_bitmask_cmp<_X86Cmp::_Eq>(__y._M_data);
        else
#endif
        return mask_type::_S_init(__x._M_data == __y._M_data);
      }

      /// @brief Element-wise inequality.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
      {
#if _GLIBCXX_X86
        if constexpr (_S_use_bitmask)
          return __x._M_bitmask_cmp<_X86Cmp::_Neq>(__y._M_data);
        else
#endif
        return mask_type::_S_init(__x._M_data != __y._M_data);
      }

      /// @brief Element-wise less-than.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator<(const basic_vec& __x, const basic_vec& __y) noexcept
      {
#if _GLIBCXX_X86
        if constexpr (_S_use_bitmask)
          return __x._M_bitmask_cmp<_X86Cmp::_Lt>(__y._M_data);
        else
#endif
        return mask_type::_S_init(__x._M_data < __y._M_data);
      }

      /// @brief Element-wise less-or-equal.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
      {
#if _GLIBCXX_X86
        if constexpr (_S_use_bitmask)
          return __x._M_bitmask_cmp<_X86Cmp::_Le>(__y._M_data);
        else
#endif
        return mask_type::_S_init(__x._M_data <= __y._M_data);
      }

      /// @brief Element-wise greater-than; implemented via the reversed operator<.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator>(const basic_vec& __x, const basic_vec& __y) noexcept
      { return __y < __x; }

      /// @brief Element-wise greater-or-equal; implemented via the reversed operator<=.
      [[__gnu__::__always_inline__]]
      friend constexpr mask_type
      operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
      { return __y <= __x; }
1691
      // [simd.cond] ---------------------------------------------------------
      /** @internal
       * Element-wise blend: result[i] = __k[i] ? __t[i] : __f[i].
       *
       * Special-cases constant all-zero / all-one operands to produce cheaper
       * bitwise forms, and works around code-gen issues on x86.
       */
      template <_TargetTraits _Traits = {}>
        [[__gnu__::__always_inline__]]
        friend constexpr basic_vec
        __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
        {
          if constexpr (_S_size == 1)
            return __k[0] ? __t : __f;
          else if constexpr (_S_use_bitmask)
            {
#if _GLIBCXX_X86
              if (__is_const_known(__k, __t, __f))
                // fully constant operands: evaluate element-wise at compile time
                return basic_vec([&](int __i) { return __k[__i] ? __t[__i] : __f[__i]; });
              else
                return __x86_bitmask_blend(__k._M_data, __t._M_data, __f._M_data);
#else
              static_assert(false, "TODO");
#endif
            }
          else if consteval
            {
              return __k._M_data ? __t._M_data : __f._M_data;
            }
          else
            {
              constexpr bool __uses_simd_register = sizeof(_M_data) >= 8;
              using _VO = _VecOps<_DataType>;
              if (_VO::_S_is_const_known_equal_to(__f._M_data, 0))
                {
                  if (is_integral_v<value_type> && __uses_simd_register
                        && _VO::_S_is_const_known_equal_to(__t._M_data, 1))
                    // This is equivalent to converting the mask into a vec of 0s and 1s. So +__k.
                    // However, basic_mask::operator+ arrives here; returning +__k would be
                    // recursive. Instead we use -__k (which is a no-op for vector-masks) and then
                    // flip all -1 elements to +1 by taking the absolute value.
                    return basic_vec((-__k)._M_abs());
                  else
                    // blend against zero reduces to a bitwise AND with the mask
                    return __vec_and(reinterpret_cast<_DataType>(__k._M_data), __t._M_data);
                }
              else if (_VecOps<_DataType>::_S_is_const_known_equal_to(__t._M_data, 0))
                {
                  if (is_integral_v<value_type> && __uses_simd_register
                        && _VO::_S_is_const_known_equal_to(__f._M_data, 1))
                    return value_type(1) + basic_vec(-__k);
                  else
                    // blend against zero: AND with the complemented mask
                    return __vec_and(reinterpret_cast<_DataType>(__vec_not(__k._M_data)), __f._M_data);
                }
              else
                {
#if _GLIBCXX_X86
                  // this works around bad code-gen when the compiler can't see that __k is a vector-mask.
                  // This pattern, is recognized to match the x86 blend instructions, which only consider
                  // the sign bit of the mask register. Also, without SSE4, if the compiler knows that __k
                  // is a vector-mask, then the '< 0' is elided.
                  return __k._M_data < 0 ? __t._M_data : __f._M_data;
#endif
                  return __k._M_data ? __t._M_data : __f._M_data;
                }
            }
        }
1752 };
1753
  // basic_vec spanning more than one machine register: stored recursively as a
  // low half (_M_data0) and a high half (_M_data1).
  template <__vectorizable _Tp, __abi_tag _Ap>
    requires (_Ap::_S_nreg > 1)
    class basic_vec<_Tp, _Ap>
    : public _VecBase<_Tp, _Ap>
    {
      template <typename, typename>
        friend class basic_vec;

      template <size_t, typename>
        friend class basic_mask;

      // total number of elements
      static constexpr int _S_size = _Ap::_S_size;

      // elements in the low half: __bit_ceil(_S_size) / 2 (always a power of two)
      static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;

      // remaining elements in the high half
      static constexpr int _N1 = _S_size - _N0;

      using _DataType0 = __similar_vec<_Tp, _N0, _Ap>;

      // the implementation (and users) depend on elements being contiguous in memory
      static_assert(_N0 * sizeof(_Tp) == sizeof(_DataType0));

      using _DataType1 = __similar_vec<_Tp, _N1, _Ap>;

      // the two halves together must cover exactly the registers of this ABI
      static_assert(_DataType0::abi_type::_S_nreg + _DataType1::abi_type::_S_nreg == _Ap::_S_nreg);

      static constexpr bool _S_is_scalar = _DataType0::_S_is_scalar;

      _DataType0 _M_data0; // elements [0, _N0)

      _DataType1 _M_data1; // elements [_N0, _S_size)

      static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask;

      // partial iff the high half is partial (the low half is always full)
      static constexpr bool _S_is_partial = _DataType1::_S_is_partial;

    public:
      using value_type = _Tp;

      using mask_type = _VecBase<_Tp, _Ap>::mask_type;
1794
1795 [[__gnu__::__always_inline__]]
1796 static constexpr basic_vec
1797 _S_init(const _DataType0& __x, const _DataType1& __y)
1798 {
1799 basic_vec __r;
1800 __r._M_data0 = __x;
1801 __r._M_data1 = __y;
1802 return __r;
1803 }
1804
      /// @brief Access the low half (elements [0, _N0)).
      [[__gnu__::__always_inline__]]
      constexpr const _DataType0&
      _M_get_low() const
      { return _M_data0; }

      /// @brief Access the high half (elements [_N0, size())).
      [[__gnu__::__always_inline__]]
      constexpr const _DataType1&
      _M_get_high() const
      { return _M_data1; }
1814
      /// @brief True iff the optimizer knows the values of both halves of @p __x.
      [[__gnu__::__always_inline__]]
      friend constexpr bool
      __is_const_known(const basic_vec& __x)
      { return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); }
1819
      /** @internal
       * Concatenates the two halves into one vector builtin; the high half is
       * zero-padded to the size of the low half before concatenation.
       *
       * @param __do_sanitize Forwarded only to the high half's concatenation.
       */
      [[__gnu__::__always_inline__]]
      constexpr auto
      _M_concat_data([[maybe_unused]] bool __do_sanitize = false) const
      {
        return __vec_concat(_M_data0._M_concat_data(false),
                            __vec_zero_pad_to<sizeof(_M_data0)>(
                              _M_data1._M_concat_data(__do_sanitize)));
      }
1828
      /** @internal
       * Builds a basic_vec by applying the compile-time index map @p __idxmap
       * to @p __x, recursing into the two halves with the appropriate offsets.
       */
      template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
        [[__gnu__::__always_inline__]]
        static constexpr basic_vec
        _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
        {
          return _S_init(
                   _DataType0::template _S_static_permute<_Size, _Offset>(__x, __idxmap),
                   _DataType1::template _S_static_permute<_Size, _Offset + _N0>(__x, __idxmap));
        }
1838
1839 template <typename _Vp>
1840 [[__gnu__::__always_inline__]]
1841 constexpr auto
1842 _M_chunk() const noexcept
1843 {
1844 constexpr int __n = _S_size / _Vp::_S_size;
1845 constexpr int __rem = _S_size % _Vp::_S_size;
1846 constexpr auto [...__is] = _IotaArray<__n>;
1847 if constexpr (__rem == 0)
1848 return array<_Vp, __n>{__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>,
1849 _M_data0, _M_data1)...};
1850 else
1851 {
1852 using _Rest = resize_t<__rem, _Vp>;
1853 return tuple(__extract_simd_at<_Vp>(cw<_Vp::_S_size * __is>, _M_data0, _M_data1)...,
1854 __extract_simd_at<_Rest>(cw<_Vp::_S_size * __n>, _M_data0, _M_data1));
1855 }
1856 }
1857
// Concatenation of a single vec is the identity (returned by reference).
1858 [[__gnu__::__always_inline__]]
1859 static constexpr const basic_vec&
1860 _S_concat(const basic_vec& __x0) noexcept
1861 { return __x0; }
1862
// Concatenate two or more vecs of matching total size into one basic_vec by
// extracting the low _N0 and high _N1 element ranges from the argument pack.
1863 template <typename... _As>
1864 requires (sizeof...(_As) >= 2)
1865 [[__gnu__::__always_inline__]]
1866 static constexpr basic_vec
1867 _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
1868 {
1869 static_assert(_S_size == (_As::_S_size + ...));
1870 return _S_init(__extract_simd_at<_DataType0>(cw<0>, __xs...),
1871 __extract_simd_at<_DataType1>(cw<_N0>, __xs...));
1872 }
1873
// Fold the two (equal-sized) halves into one half-width vec; only valid when
// _N0 == _N1, i.e. no remainder elements exist.
1874 [[__gnu__::__always_inline__]]
1875 constexpr auto
1876 _M_reduce_to_half(auto __binary_op) const requires (_N0 == _N1)
1877 { return __binary_op(_M_data0, _M_data1); }
1878
// Continue a reduction with a remaining part __rest of different size:
// recursively chunk/halve until both operands have equal size, then finish
// with _M_reduce.
1879 [[__gnu__::__always_inline__]]
1880 constexpr value_type
1881 _M_reduce_tail(const auto& __rest, auto __binary_op) const
1882 {
1883 if constexpr (__rest.size() > _S_size)
1884 {
1885 auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
1886 return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
1887 }
1888 else if constexpr (__rest.size() == _S_size)
1889 return __binary_op(*this, __rest)._M_reduce(__binary_op);
1890 else
1891 return _M_reduce_to_half(__binary_op)._M_reduce_tail(__rest, __binary_op);
1892 }
1893
// Horizontal reduction over all elements with __binary_op.
// Two target-specific fast paths exist before the generic recursion:
// - if the target evaluates this value_type as f32 anyway (per _TargetTraits)
//   and the op is plus/multiplies, reduce in float and convert back;
// - on SSE2, 8-bit integer multiplies are widened to unsigned short because
//   there is no 8-bit mul instruction (see benchmark numbers below).
1894 template <typename _BinaryOp, _TargetTraits _Traits = {}>
1895 [[__gnu__::__always_inline__]]
1896 constexpr value_type
1897 _M_reduce(_BinaryOp __binary_op) const
1898 {
1899 if constexpr (_Traits.template _M_eval_as_f32<value_type>()
1900 && (is_same_v<_BinaryOp, plus<>>
1901 || is_same_v<_BinaryOp, multiplies<>>))
1902 return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
1903#ifdef __SSE2__
1904 else if constexpr (is_integral_v<value_type> && sizeof(value_type) == 1
1905 && is_same_v<decltype(__binary_op), multiplies<>>)
1906 {
1907 // convert to unsigned short because of missing 8-bit mul instruction
1908 // we don't need to preserve the order of elements
1909 //
1910 // The left columns under Latency and Throughput show bit-cast to ushort with shift by
1911 // 8. The right column uses the alternative in the else branch.
1912 // Benchmark on Intel Ultra 7 165U (AVX2)
1913 // TYPE Latency Throughput
1914 // [cycles/call] [cycles/call]
1915 //schar, 64 59.9 70.7 10.5 13.3
1916 //schar, 128 81.4 97.2 12.2 21
1917 //schar, 256 92.4 129 17.2 35.2
// Scalar high part: fold the reduced low part with the single trailing
// element directly.
1918 if constexpr (_DataType1::_S_is_scalar)
1919 return __binary_op(_DataType1(_M_data0._M_reduce(__binary_op)), _M_data1)[0];
1920 // TODO: optimize trailing scalar (e.g. (8+8)+(8+1))
1921 else if constexpr (_S_size % 2 == 0)
1922 { // If all elements participate in the reduction we can take this shortcut
// Bit-cast pairs of bytes to ushort; shifting right by CHAR_BIT brings the
// odd-indexed bytes down so one op combines even with odd elements.
1923 using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
1924 auto __a = __builtin_bit_cast(_V16, *this);
1925 return __binary_op(__a, __a >> __CHAR_BIT__)._M_reduce(__binary_op);
1926 }
1927 else
1928 {
// Odd element count: widen every element to ushort via conversion instead.
1929 using _V16 = rebind_t<unsigned short, basic_vec>;
1930 return _V16(*this)._M_reduce(__binary_op);
1931 }
1932 }
1933#endif
1934 else
// Generic path: reduce the low half against the (smaller) high half.
1935 return _M_data0._M_reduce_tail(_M_data1, __binary_op);
1936 }
1937
// Element-wise NaN test; combines the per-half results into one mask.
1938 [[__gnu__::__always_inline__]]
1939 constexpr mask_type
1940 _M_isnan() const requires is_floating_point_v<value_type>
1941 { return mask_type::_S_init(_M_data0._M_isnan(), _M_data1._M_isnan()); }
1942
// Element-wise infinity test.
1943 [[__gnu__::__always_inline__]]
1944 constexpr mask_type
1945 _M_isinf() const requires is_floating_point_v<value_type>
1946 { return mask_type::_S_init(_M_data0._M_isinf(), _M_data1._M_isinf()); }
1947
// Element-wise unordered comparison (true where either operand is NaN).
1948 [[__gnu__::__always_inline__]]
1949 constexpr mask_type
1950 _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
1951 {
1952 return mask_type::_S_init(_M_data0._M_isunordered(__y._M_data0),
1953 _M_data1._M_isunordered(__y._M_data1));
1954 }
1955
// Element-wise absolute value for signed integers.
1956 [[__gnu__::__always_inline__]]
1957 constexpr basic_vec
1958 _M_abs() const requires signed_integral<value_type>
1959 { return _S_init(_M_data0._M_abs(), _M_data1._M_abs()); }
1960
// Element-wise absolute value for floating-point types.
1961 [[__gnu__::__always_inline__]]
1962 constexpr basic_vec
1963 _M_fabs() const requires floating_point<value_type>
1964 { return _S_init(_M_data0._M_fabs(), _M_data1._M_fabs()); }
1965
// Load only the first __n elements from __mem; the rest are value-initialized.
// If the low half is fully covered it uses the plain load constructor and the
// high half continues the partial load; otherwise the high half stays empty.
1966 template <typename _Up>
1967 [[__gnu__::__always_inline__]]
1968 static inline basic_vec
1969 _S_partial_load(const _Up* __mem, size_t __n)
1970 {
1971 if (__n >= _N0)
1972 return _S_init(_DataType0(_LoadCtorTag(), __mem),
1973 _DataType1::_S_partial_load(__mem + _N0, __n - _N0));
1974 else
1975 return _S_init(_DataType0::_S_partial_load(__mem, __n),
1976 _DataType1());
1977 }
1978
// Load from __mem only at positions where mask __k is set; both halves use
// the corresponding half of the mask.
1979 template <typename _Up, _ArchTraits _Traits = {}>
1980 static inline basic_vec
1981 _S_masked_load(const _Up* __mem, mask_type __k)
1982 {
1983 return _S_init(_DataType0::_S_masked_load(__mem, __k._M_data0),
1984 _DataType1::_S_masked_load(__mem + _N0, __k._M_data1));
1985 }
1986
// Store all elements contiguously to __mem (low half first).
1987 template <typename _Up>
1988 [[__gnu__::__always_inline__]]
1989 inline void
1990 _M_store(_Up* __mem) const
1991 {
1992 _M_data0._M_store(__mem);
1993 _M_data1._M_store(__mem + _N0);
1994 }
1995
// Store only the first __n elements of __v to __mem (mirror of
// _S_partial_load).
1996 template <typename _Up>
1997 [[__gnu__::__always_inline__]]
1998 static inline void
1999 _S_partial_store(const basic_vec& __v, _Up* __mem, size_t __n)
2000 {
2001 if (__n >= _N0)
2002 {
2003 __v._M_data0._M_store(__mem);
2004 _DataType1::_S_partial_store(__v._M_data1, __mem + _N0, __n - _N0);
2005 }
2006 else
2007 {
2008 _DataType0::_S_partial_store(__v._M_data0, __mem, __n);
2009 }
2010 }
2011
// Store elements of __v to __mem only where mask __k is set.
2012 template <typename _Up>
2013 [[__gnu__::__always_inline__]]
2014 static inline void
2015 _S_masked_store(const basic_vec& __v, _Up* __mem, const mask_type& __k)
2016 {
2017 _DataType0::_S_masked_store(__v._M_data0, __mem, __k._M_data0);
2018 _DataType1::_S_masked_store(__v._M_data1, __mem + _N0, __k._M_data1);
2019 }
2020
2021 basic_vec() = default;
2022
2023 // [simd.overview] p2 impl-def conversions ------------------------------
// Implementation-defined conversions to/from the native GCC vector builtin
// of the next power-of-two size.
2024 using _NativeVecType = __vec_builtin_type<value_type, __bit_ceil(unsigned(_S_size))>;
2025
// Construct from a native vector builtin by extracting the low _N0 lanes and
// the (bit-ceiled) high lanes starting at offset _N0.
2026 [[__gnu__::__always_inline__]]
2027 constexpr
2028 basic_vec(const _NativeVecType& __x)
2029 : _M_data0(_VecOps<__vec_builtin_type<value_type, _N0>>::_S_extract(__x)),
2030 _M_data1(_VecOps<__vec_builtin_type<value_type, __bit_ceil(unsigned(_N1))>>
2031 ::_S_extract(__x, integral_constant<int, _N0>()))
2032 {}
2033
// Convert back to the native builtin by concatenating both halves.
2034 [[__gnu__::__always_inline__]]
2035 constexpr
2036 operator _NativeVecType() const
2037 { return _M_concat_data(); }
2038
2039 // [simd.ctor] broadcast constructor ------------------------------------
// Broadcast a scalar to all elements; explicit unless the conversion from
// _Up to value_type is broadcast-constructible (value-preserving).
2040 template <__explicitly_convertible_to<value_type> _Up>
2041 [[__gnu__::__always_inline__]]
2042 constexpr explicit(!__broadcast_constructible<_Up, value_type>)
2043 basic_vec(_Up&& __x) noexcept
2044 : _M_data0(static_cast<value_type>(__x)), _M_data1(static_cast<value_type>(__x))
2045 {}
2046
// consteval broadcast overload: accepts values whose value-preserving
// conversion can only be verified at compile time.
2047 template <__simd_vec_bcast_consteval<value_type> _Up>
2048 consteval
2049 basic_vec(_Up&& __x)
2050 : _M_data0(__value_preserving_cast<value_type>(__x)),
2051 _M_data1(__value_preserving_cast<value_type>(__x))
2052 {}
2053
2054 // [simd.ctor] conversion constructor -----------------------------------
// Element-wise converting constructor from an equal-sized vec of another
// element type; explicit unless the conversion is value-preserving and not
// to a lower-rank type.
// NOTE(review): chunk<_N0>(__x) is spelled twice — presumably the compiler
// folds the duplicate; verify this doesn't pessimize codegen.
2055 template <typename _Up, typename _UAbi>
2056 requires (_S_size == _UAbi::_S_size)
2057 && __explicitly_convertible_to<_Up, value_type>
2058 [[__gnu__::__always_inline__]]
2059 constexpr
2060 explicit(!__value_preserving_convertible_to<_Up, value_type>
2061 || __higher_rank_than<_Up, value_type>)
2062 basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
2063 : _M_data0(get<0>(chunk<_N0>(__x))),
2064 _M_data1(get<1>(chunk<_N0>(__x)))
2065 {}
2066
2067 using _VecBase<_Tp, _Ap>::_VecBase;
2068
2069 // [simd.ctor] generator constructor ------------------------------------
// Invoke __gen with each index; the high half's generator shifts the index
// by _N0 so element i of *this is __gen(simd_size_c<i>).
2070 template <__simd_generator_invokable<value_type, _S_size> _Fp>
2071 [[__gnu__::__always_inline__]]
2072 constexpr explicit
2073 basic_vec(_Fp&& __gen)
2074 : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
2075 return __gen(__simd_size_c<__i + _N0>);
2076 })
2077 {}
2078
2079 // [simd.ctor] load constructor -----------------------------------------
// Internal tagged load: both halves load contiguously from __ptr.
2080 template <typename _Up>
2081 [[__gnu__::__always_inline__]]
2082 constexpr
2083 basic_vec(_LoadCtorTag, const _Up* __ptr)
2084 : _M_data0(_LoadCtorTag(), __ptr),
2085 _M_data1(_LoadCtorTag(), __ptr + _N0)
2086 {}
2087
// Public range load constructor: requires a statically-sized contiguous
// range of exactly _S_size vectorizable elements; __flags may adjust the
// pointer (e.g. for alignment) before the tagged load above runs.
2088 template <ranges::contiguous_range _Rg, typename... _Flags>
2089 requires __static_sized_range<_Rg, _S_size>
2090 && __vectorizable<ranges::range_value_t<_Rg>>
2091 && __explicitly_convertible_to<ranges::range_value_t<_Rg>, value_type>
2092 constexpr
2093 basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
2094 : basic_vec(_LoadCtorTag(),
2095 __flags.template _S_adjust_pointer<basic_vec>(ranges::data(__range)))
2096 {
2097 static_assert(__loadstore_convertible_to<ranges::range_value_t<_Rg>, value_type,
2098 _Flags...>);
2099 }
2100
2101 // [simd.subscr] --------------------------------------------------------
// Element access. With a compile-time-known index, pick the element from the
// correct half directly; otherwise read through a may_alias pointer into the
// object representation (valid because the static_assert above guarantees
// elements are contiguous in memory).
2102 [[__gnu__::__always_inline__]]
2103 constexpr value_type
2104 operator[](__simd_size_type __i) const
2105 {
2106 __glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
2107 if (__is_const_known(__i))
2108 return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
2109 else
2110 {
2111 using _AliasingT [[__gnu__::__may_alias__]] = value_type;
2112 return reinterpret_cast<const _AliasingT*>(this)[__i];
2113 }
2114 }
2115
2116 // [simd.unary] unary operators -----------------------------------------
// All unary operators apply the operation to both halves independently;
// each is constrained on the corresponding scalar operation being valid.
2117 [[__gnu__::__always_inline__]]
2118 constexpr basic_vec&
2119 operator++() noexcept requires requires(value_type __a) { ++__a; }
2120 {
2121 ++_M_data0;
2122 ++_M_data1;
2123 return *this;
2124 }
2125
2126 [[__gnu__::__always_inline__]]
2127 constexpr basic_vec
2128 operator++(int) noexcept requires requires(value_type __a) { __a++; }
2129 {
2130 basic_vec __r = *this;
2131 ++_M_data0;
2132 ++_M_data1;
2133 return __r;
2134 }
2135
2136 [[__gnu__::__always_inline__]]
2137 constexpr basic_vec&
2138 operator--() noexcept requires requires(value_type __a) { --__a; }
2139 {
2140 --_M_data0;
2141 --_M_data1;
2142 return *this;
2143 }
2144
2145 [[__gnu__::__always_inline__]]
2146 constexpr basic_vec
2147 operator--(int) noexcept requires requires(value_type __a) { __a--; }
2148 {
2149 basic_vec __r = *this;
2150 --_M_data0;
2151 --_M_data1;
2152 return __r;
2153 }
2154
// Logical negation yields a mask, not a vec.
2155 [[__gnu__::__always_inline__]]
2156 constexpr mask_type
2157 operator!() const noexcept requires requires(value_type __a) { !__a; }
2158 { return mask_type::_S_init(!_M_data0, !_M_data1); }
2159
2160 [[__gnu__::__always_inline__]]
2161 constexpr basic_vec
2162 operator+() const noexcept requires requires(value_type __a) { +__a; }
2163 { return *this; }
2164
2165 [[__gnu__::__always_inline__]]
2166 constexpr basic_vec
2167 operator-() const noexcept requires requires(value_type __a) { -__a; }
2168 { return _S_init(-_M_data0, -_M_data1); }
2169
2170 [[__gnu__::__always_inline__]]
2171 constexpr basic_vec
2172 operator~() const noexcept requires requires(value_type __a) { ~__a; }
2173 { return _S_init(~_M_data0, ~_M_data1); }
2174
2175 // [simd.cassign] -------------------------------------------------------
// Compound-assignment operators are stamped out by macro: each forwards the
// operation to both halves and returns __x.
2176#define _GLIBCXX_SIMD_DEFINE_OP(sym) \
2177 [[__gnu__::__always_inline__]] \
2178 friend constexpr basic_vec& \
2179 operator sym##=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT \
2180 { \
2181 __x._M_data0 sym##= __y._M_data0; \
2182 __x._M_data1 sym##= __y._M_data1; \
2183 return __x; \
2184 }
2185
2186 _GLIBCXX_SIMD_DEFINE_OP(+)
2187 _GLIBCXX_SIMD_DEFINE_OP(-)
2188 _GLIBCXX_SIMD_DEFINE_OP(*)
2189 _GLIBCXX_SIMD_DEFINE_OP(/)
2190 _GLIBCXX_SIMD_DEFINE_OP(%)
2191 _GLIBCXX_SIMD_DEFINE_OP(&)
2192 _GLIBCXX_SIMD_DEFINE_OP(|)
2193 _GLIBCXX_SIMD_DEFINE_OP(^)
2194 _GLIBCXX_SIMD_DEFINE_OP(<<)
2195 _GLIBCXX_SIMD_DEFINE_OP(>>)
2196
2197#undef _GLIBCXX_SIMD_DEFINE_OP
2198
// Shift by a single scalar count: applied uniformly to both halves.
2199 [[__gnu__::__always_inline__]]
2200 friend constexpr basic_vec&
2201 operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
2202 requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
2203 {
2204 __x._M_data0 <<= __y;
2205 __x._M_data1 <<= __y;
2206 return __x;
2207 }
2208
2209 [[__gnu__::__always_inline__]]
2210 friend constexpr basic_vec&
2211 operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
2212 requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
2213 {
2214 __x._M_data0 >>= __y;
2215 __x._M_data1 >>= __y;
2216 return __x;
2217 }
2218
2219 // [simd.comparison] ----------------------------------------------------
// Element-wise comparisons: compare both halves and combine the results
// into a mask with mask_type::_S_init.
2220 [[__gnu__::__always_inline__]]
2221 friend constexpr mask_type
2222 operator==(const basic_vec& __x, const basic_vec& __y) noexcept
2223 { return mask_type::_S_init(__x._M_data0 == __y._M_data0, __x._M_data1 == __y._M_data1); }
2224
2225 [[__gnu__::__always_inline__]]
2226 friend constexpr mask_type
2227 operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
2228 { return mask_type::_S_init(__x._M_data0 != __y._M_data0, __x._M_data1 != __y._M_data1); }
2229
2230 [[__gnu__::__always_inline__]]
2231 friend constexpr mask_type
2232 operator<(const basic_vec& __x, const basic_vec& __y) noexcept
2233 { return mask_type::_S_init(__x._M_data0 < __y._M_data0, __x._M_data1 < __y._M_data1); }
2234
2235 [[__gnu__::__always_inline__]]
2236 friend constexpr mask_type
2237 operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
2238 { return mask_type::_S_init(__x._M_data0 <= __y._M_data0, __x._M_data1 <= __y._M_data1); }
2239
2240 [[__gnu__::__always_inline__]]
2241 friend constexpr mask_type
2242 operator>(const basic_vec& __x, const basic_vec& __y) noexcept
2243 { return mask_type::_S_init(__x._M_data0 > __y._M_data0, __x._M_data1 > __y._M_data1); }
2244
2245 [[__gnu__::__always_inline__]]
2246 friend constexpr mask_type
2247 operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
2248 { return mask_type::_S_init(__x._M_data0 >= __y._M_data0, __x._M_data1 >= __y._M_data1); }
2249
2250 // [simd.cond] ---------------------------------------------------------
// Blend: element i is taken from __t where __k is set, from __f otherwise;
// implemented per half.
2251 [[__gnu__::__always_inline__]]
2252 friend constexpr basic_vec
2253 __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
2254 {
2255 return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
2256 __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
2257 }
2258 };
2259
2260 // [simd.overview] deduction guide ------------------------------------------
// Deduce element type and size from a statically-sized contiguous range.
// The span-extent form works around GCC PR117849 (ranges::size in a
// deduction guide); switch back to the #if 0 form once fixed.
2261 template <ranges::contiguous_range _Rg, typename... _Ts>
2262 requires __static_sized_range<_Rg>
2263 basic_vec(_Rg&& __r, _Ts...)
2264 -> basic_vec<ranges::range_value_t<_Rg>,
2265 __deduce_abi_t<ranges::range_value_t<_Rg>,
2266#if 0 // PR117849
2267 static_cast<__simd_size_type>(ranges::size(__r))>>;
2268#else
2269 static_cast<__simd_size_type>(decltype(std::span(__r))::extent)>>;
2270#endif
2271
// Constructing a vec from a mask deduces the integer type matching the
// mask's element byte width, with a compatible ABI of the same size.
2272 template <size_t _Bytes, typename _Ap>
2273 basic_vec(basic_mask<_Bytes, _Ap>)
2274 -> basic_vec<__integer_from<_Bytes>,
2275 decltype(__abi_rebind<__integer_from<_Bytes>, basic_mask<_Bytes, _Ap>::size.value,
2276 _Ap>())>;
2277
2278 // [P3319R5] ----------------------------------------------------------------
// iota object for arithmetic scalars: a plain zero-initialized value.
2279 template <__vectorizable _Tp>
2280 requires is_arithmetic_v<_Tp>
2281 inline constexpr _Tp
2282 __iota<_Tp> = _Tp();
2283
// iota object for basic_vec: element i holds the value i, generated via the
// generator constructor; rejects element types too narrow to represent the
// largest index.
2284 template <typename _Tp, typename _Ap>
2285 inline constexpr basic_vec<_Tp, _Ap>
2286 __iota<basic_vec<_Tp, _Ap>> = basic_vec<_Tp, _Ap>([](_Tp __i) -> _Tp {
2287 static_assert(_Ap::_S_size - 1 <= numeric_limits<_Tp>::max(),
2288 "iota object would overflow");
2289 return __i;
2290 });
2291} // namespace simd
2292_GLIBCXX_END_NAMESPACE_VERSION
2293} // namespace std
2294
2295#pragma GCC diagnostic pop
2296#endif // C++26
2297#endif // _GLIBCXX_SIMD_VEC_H
constexpr bool operator<=(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:859
constexpr bool operator>=(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:873
constexpr bool operator<(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:826
constexpr bool operator>(const duration< _Rep1, _Period1 > &__lhs, const duration< _Rep2, _Period2 > &__rhs)
Definition chrono.h:866
constexpr complex< _Tp > operator-(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x minus y.
Definition complex:404
constexpr complex< _Tp > operator+(const complex< _Tp > &__x, const complex< _Tp > &__y)
Return new complex value x plus y.
Definition complex:374
bool is_sufficiently_aligned(_Tp *__ptr)
Is __ptr aligned to an _Align byte boundary?
Definition align.h:118
ISO C++ entities toplevel namespace is std.
_Tp fabs(const std::complex< _Tp > &__z)
fabs(__z) TR1 8.1.8 [tr.c99.cmplx.fabs]
Definition complex:2525
constexpr auto data(_Container &__cont) noexcept(noexcept(__cont.data())) -> decltype(__cont.data())
Return the data pointer of a container.
static constexpr _Tp max() noexcept
Definition limits:328
static constexpr _Tp infinity() noexcept
Definition limits:348