libstdc++
bits/simd_x86.h
1// Implementation of <simd> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_SIMD_X86_H
26#define _GLIBCXX_SIMD_X86_H 1
27
28#ifdef _GLIBCXX_SYSHDR
29#pragma GCC system_header
30#endif
31
32#if __cplusplus >= 202400L
33
34#include "vec_ops.h"
35
36#if !_GLIBCXX_X86
37#error "wrong include for this target"
38#endif
39
40#pragma GCC push_options
41// ensure GCC knows about the __builtin_ia32_* calls
42#pragma GCC target("avx2,bmi,bmi2,avx512vl,avx512bw,avx512dq,avx10.2")
43#pragma GCC pop_options
44
45// psabi warnings are bogus because the ABI of the internal types never leaks into user code
46#pragma GCC diagnostic push
47#pragma GCC diagnostic ignored "-Wpsabi"
48
49namespace std _GLIBCXX_VISIBILITY(default)
50{
51_GLIBCXX_BEGIN_NAMESPACE_VERSION
52namespace simd
53{
  // Size in Bytes of the widest general-purpose register on the target:
  // 8 on x86-64, 4 on 32-bit x86.  Vector(-mask) objects no larger than this
  // can be handled with a single integer operation in a GPR instead of a
  // vector instruction.
  static constexpr size_t __x86_max_general_register_size
#ifdef __x86_64__
    = 8;
#else
    = 4;
#endif
60
61 /** @internal
62 * Return a bit-mask for the given vector-mask.
63 *
64 * Caveats:
65 * 1. The bit-mask of 2-Byte vector-masks has duplicated entries (because of missing instruction)
66 * 2. The return type internally is 'int', but that fails on conversion to uint64 if the MSB of a
67 * YMM 1/2-Byte vector-mask is set (sign extension). Therefore these helper functions return
68 * unsigned instead.
69 * 3. ZMM inputs are not supported.
70 */
71 [[__gnu__::__always_inline__]]
72 inline unsigned
73 __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 16> __x)
74 { return __builtin_ia32_movmskpd(__vec_bit_cast<double>(__x)); }
75
76 [[__gnu__::__always_inline__]]
77 inline unsigned
78 __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 32> __x)
79 { return __builtin_ia32_movmskpd256(__vec_bit_cast<double>(__x)); }
80
81 [[__gnu__::__always_inline__]]
82 inline unsigned
83 __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 16> __x)
84 { return __builtin_ia32_movmskps(__vec_bit_cast<float>(__x)); }
85
  // 8-Byte vector of two 4-Byte elements.  With BMI2 a single PEXT on the
  // GPR image of the vector extracts the two sign bits directly; otherwise
  // zero-pad to 16 Bytes and use MOVMSKPS (the upper two result bits are
  // then zero because the padding is zero).
  template <_ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline _Bitmask<8>
  __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 8> __x)
  {
#if __has_builtin(__builtin_ia32_pext_di)
    if constexpr (_Traits._M_have_bmi2())
      // extract bit 31 and bit 63 (the MSB of each element)
      return _Bitmask<8>(__builtin_ia32_pext_di(
                           __builtin_bit_cast(unsigned long long, __x),
                           0x80000000'80000000ULL));
#endif
    return _Bitmask<8>(__x86_movmsk(__vec_zero_pad_to_16(__x)));
  }
99
100 [[__gnu__::__always_inline__]]
101 inline unsigned
102 __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 32> __x)
103 { return __builtin_ia32_movmskps256(__vec_bit_cast<float>(__x)); }
104
  // Vector-masks with 1- or 2-Byte elements.  Uses (V)PMOVMSKB, which
  // produces one bit per *Byte*; consequently for 2-Byte elements every
  // mask bit appears twice in the result (caveat 1 of the comment above).
  // Sub-16-Byte inputs use BMI2 PEXT on the GPR image when available,
  // otherwise they are zero-padded to 16 Bytes.
  template <__vec_builtin _TV, auto _Traits = _ArchTraits()>
  requires (sizeof(__vec_value_type<_TV>) <= 2)
  [[__gnu__::__always_inline__]]
  inline unsigned
  __x86_movmsk(_TV __x)
  {
    static_assert(__width_of<_TV> > 1);
    if constexpr (sizeof(__x) == 32)
      return __builtin_ia32_pmovmskb256(__vec_bit_cast<char>(__x)); // VPMOVMSKB (AVX2)
    else if constexpr (sizeof(__x) == 16)
      return __builtin_ia32_pmovmskb128(__vec_bit_cast<char>(__x)); // PMOVMSKB (SSE2)
    else if constexpr (sizeof(__x) == 8)
      {
#if __has_builtin(__builtin_ia32_pext_di)
        // extract the MSB of each of the 8 Bytes with a single PEXT
        if constexpr (_Traits._M_have_bmi2())
          return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
                                        0x8080'8080'8080'8080ULL);
#endif
        return __x86_movmsk(__vec_zero_pad_to_16(__x));
      }
    else if constexpr (sizeof(__x) == 4)
      {
#if __has_builtin(__builtin_ia32_pext_si)
        // extract the MSB of each of the 4 Bytes
        if constexpr (_Traits._M_have_bmi2())
          return __builtin_ia32_pext_si(__builtin_bit_cast(unsigned int, __x), 0x80808080u);
#endif
        return __x86_movmsk(__vec_zero_pad_to_16(__x));
      }
    else if constexpr (sizeof(__x) == 2)
      {
        // two 1-Byte elements (width > 1 rules out a single 2-Byte element)
        auto __bits = __builtin_bit_cast(unsigned short, __x);
#if __has_builtin(__builtin_ia32_pext_si)
        if constexpr (_Traits._M_have_bmi2())
          return __builtin_ia32_pext_si(__bits, 0x00008080u);
#endif
        // bit 7 -> result bit 0, bit 15 -> result bit 1
        return ((__bits >> 7) & 1) | ((__bits & 0x8000) >> 14);
      }
    else
      static_assert(false);
  }
145
  // Return true iff every bit of __a is zero.  Small vectors are compared
  // as a single integer in a GPR; with SSE4.1/AVX the ZF result of
  // (V)PTEST is used; otherwise fall back to a MOVMSK-based test (which
  // only inspects the MSBs, valid because __a is an integral vector-mask).
  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline bool
  __x86_vec_is_zero(_TV __a)
  {
    using _Tp = __vec_value_type<_TV>;
    static_assert(is_integral_v<_Tp>);
    if constexpr (sizeof(_TV) <= __x86_max_general_register_size)
      // single integer compare in a GPR
      return __builtin_bit_cast(__integer_from<sizeof(_TV)>, __a) == 0;
    else if constexpr (_Traits._M_have_avx())
      {
        if constexpr (sizeof(_TV) == 32)
          return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
                                          __vec_bit_cast<long long>(__a));
        else if constexpr (sizeof(_TV) == 16)
          return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
                                          __vec_bit_cast<long long>(__a));
        else if constexpr (sizeof(_TV) < 16)
          return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
        else
          static_assert(false);
      }
    else if constexpr (_Traits._M_have_sse4_1())
      {
        if constexpr (sizeof(_TV) == 16)
          return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
                                          __vec_bit_cast<long long>(__a));
        else if constexpr (sizeof(_TV) < 16)
          return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
        else
          static_assert(false);
      }
    else
      // pre-SSE4.1: MSB of every element is set iff the mask element is set
      return __x86_movmsk(__a) == 0;
  }
181
182 template <__vec_builtin _TV, _ArchTraits _Traits = {}>
183 [[__gnu__::__always_inline__]]
184 inline int
185 __x86_vec_testz(_TV __a, _TV __b)
186 {
187 static_assert(sizeof(_TV) == 16 || sizeof(_TV) == 32);
188 static_assert(_Traits._M_have_sse4_1());
189 if constexpr (sizeof(_TV) == 32)
190 return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
191 __vec_bit_cast<long long>(__b));
192 else
193 return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
194 __vec_bit_cast<long long>(__b));
195 }
196
197 template <__vec_builtin _TV, _ArchTraits _Traits = {}>
198 [[__gnu__::__always_inline__]]
199 inline int
200 __x86_vec_testc(_TV __a, _TV __b)
201 {
202 static_assert(sizeof(_TV) == 16 || sizeof(_TV) == 32);
203 static_assert(_Traits._M_have_sse4_1());
204 if constexpr (sizeof(_TV) == 32)
205 return __builtin_ia32_ptestc256(__vec_bit_cast<long long>(__a),
206 __vec_bit_cast<long long>(__b));
207 else
208 return __builtin_ia32_ptestc128(__vec_bit_cast<long long>(__a),
209 __vec_bit_cast<long long>(__b));
210 }
211
  // Return true iff the first _Np elements of the vector-mask __k are all
  // true (all-ones).  _Np may be smaller than the full vector width; the
  // remaining elements are ignored.
  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline bool
  __x86_vecmask_all(_TV __k)
  {
    using _Tp = __vec_value_type<_TV>;
    static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
    constexpr int __width = __width_of<_TV>;
    static_assert(sizeof(__k) <= 32);
    if constexpr (_Np == __width)
      { // all elements participate
        if constexpr (sizeof(__k) <= __x86_max_general_register_size)
          { // compare the GPR image against all-ones
            using _Ip = __integer_from<sizeof(__k)>;
            return __builtin_bit_cast(_Ip, __k) == ~_Ip();
          }
        else if constexpr (!_Traits._M_have_sse4_1())
          {
            // PMOVMSKB duplicates each bit for 2-Byte elements, hence _Np * 2
            constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
            return __x86_movmsk(__k) == __valid_bits;
          }
        else if constexpr (sizeof(__k) < 16)
          return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
        else
          // PTEST CF: set iff every bit of ~_TV() (all-ones) is set in __k
          return 0 != __x86_vec_testc(__k, ~_TV());
      }
    else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
      { // mask the GPR image down to the _Np participating elements
        using _Ip = __integer_from<sizeof(__k)>;
        constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
        return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == __valid_bits;
      }
    else if constexpr (!_Traits._M_have_sse4_1())
      {
        // see above: 2-Byte elements produce two movmsk bits per element
        constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
        return (__x86_movmsk(__k) & __valid_bits) == __valid_bits;
      }
    else if constexpr (sizeof(__k) < 16)
      return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
    else
      // PTEST against a vector that is all-ones in the first _Np elements
      return 0 != __x86_vec_testc(__k, _S_vec_implicit_mask<_Np, _TV>);
  }
254
  // Return true iff at least one of the first _Np elements of the
  // vector-mask __k is true.
  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline bool
  __x86_vecmask_any(_TV __k)
  {
    using _Tp = __vec_value_type<_TV>;
    static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
    constexpr int __width = __width_of<_TV>;
    static_assert(sizeof(__k) <= 32);
    if constexpr (_Np == __width)
      // all elements participate: any set bit suffices
      return !__x86_vec_is_zero(__k);
    else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
      { // test the GPR image restricted to the _Np participating elements
        using _Ip = __integer_from<sizeof(__k)>;
        constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
        return (__builtin_bit_cast(_Ip, __k) & __valid_bits) != _Ip();
      }
    else if constexpr (!_Traits._M_have_sse4_1())
      {
        // PMOVMSKB duplicates each bit for 2-Byte elements, hence _Np * 2
        constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
        return (__x86_movmsk(__k) & __valid_bits) != 0;
      }
    else if constexpr (sizeof(__k) < 16)
      return __x86_vecmask_any<_Np>(__vec_zero_pad_to_16(__k));
    else
      // PTEST ZF is 0 iff __k has a set bit within the first _Np elements
      return 0 == __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
  }
282
  // Return true iff none of the first _Np elements of the vector-mask __k
  // is true (logical complement of __x86_vecmask_any).
  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline bool
  __x86_vecmask_none(_TV __k)
  {
    using _Tp = __vec_value_type<_TV>;
    static_assert(is_integral_v<_Tp> && is_signed_v<_Tp>);
    constexpr int __width = __width_of<_TV>;
    static_assert(sizeof(__k) <= 32);
    if constexpr (_Np == __width)
      // all elements participate
      return __x86_vec_is_zero(__k);
    else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
      { // test the GPR image restricted to the _Np participating elements
        using _Ip = __integer_from<sizeof(__k)>;
        constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
        return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == _Ip();
      }
    else if constexpr (!_Traits._M_have_sse4_1())
      {
        // PMOVMSKB duplicates each bit for 2-Byte elements, hence _Np * 2
        constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
        return (__x86_movmsk(__k) & __valid_bits) == 0;
      }
    else if constexpr (sizeof(__k) < 16)
      return __x86_vecmask_none<_Np>(__vec_zero_pad_to_16(__k));
    else
      // PTEST ZF: set iff __k has no set bit within the first _Np elements
      return 0 != __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
  }
310
  // Comparison predicates, encoded to match the immediate operand of the
  // x86 CMPPS/CMPPD and VPCMP/VCMP instruction families (passed through
  // unchanged by __x86_bitmask_cmp).
  enum class _X86Cmp
  {
    _Eq = 0,    // equal
    _Lt = 1,    // less-than
    _Le = 2,    // less-or-equal
    _Unord = 3, // unordered (at least one operand is NaN)
    _Neq = 4,   // not-equal
    _Nlt = 5,   // not-less-than
    _Nle = 6,   // not-less-or-equal
  };
321
  // Compare __x against __y element-wise with the predicate _Cmp using the
  // AVX-512 VCMP instructions and return the resulting bit-mask.  The -1
  // argument is the write-mask (all elements enabled); the trailing 4 in
  // the 512-bit calls is the rounding/SAE argument (current direction).
  template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
  requires is_floating_point_v<__vec_value_type<_TV>>
  [[__gnu__::__always_inline__]]
  inline auto
  __x86_bitmask_cmp(_TV __x, _TV __y)
  {
    constexpr int __c = int(_Cmp);
    using _Tp = __vec_value_type<_TV>;
    if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
      return __builtin_ia32_cmppd512_mask(__x, __y, __c, -1, 4);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
      return __builtin_ia32_cmpps512_mask(__x, __y, __c, -1, 4);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
      return __builtin_ia32_cmppd256_mask(__x, __y, __c, -1);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
      return __builtin_ia32_cmpps256_mask(__x, __y, __c, -1);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
      return __builtin_ia32_cmppd128_mask(__x, __y, __c, -1);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
      return __builtin_ia32_cmpps128_mask(__x, __y, __c, -1);
    else if constexpr (is_same_v<_Tp, _Float16>)
      {
        // _Float16 uses VCMPPH when AVX512_FP16 is available ...
        if constexpr (sizeof(_TV) == 64 && _Traits._M_have_avx512fp16())
          return __builtin_ia32_cmpph512_mask(__x, __y, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && _Traits._M_have_avx512fp16())
          return __builtin_ia32_cmpph256_mask(__x, __y, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && _Traits._M_have_avx512fp16())
          return __builtin_ia32_cmpph128_mask(__x, __y, __c, -1);
        else if constexpr (sizeof(_TV) < 16 && _Traits._M_have_avx512fp16())
          return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
        else
          {
            // ... otherwise widen to float and compare that.
            // without AVX512_FP16, float16_t size needs to match float32_t size
            // (cf. __native_abi())
            static_assert(sizeof(_TV) <= 32);
            return __x86_bitmask_cmp<_Cmp>(__vec_cast<float>(__x), __vec_cast<float>(__y));
          }
      }
    else if constexpr (sizeof(_TV) < 16)
      // partial vectors: pad with zeros; callers must ignore the extra bits
      return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
    else
      static_assert(false);
  }
365
366 template <typename _Tp>
367 using __x86_intrin_int
368 = decltype([] {
369 if constexpr (sizeof(_Tp) == 1)
370 return char();
371 else
372 return __integer_from<sizeof(_Tp)>();
373 }());
374
  // The element type to use when handing a vector of _Tp to an x86
  // builtin: integral (and all 1-/2-Byte) element types map to the
  // equally sized intrinsic integer type, float/double to their canonical
  // vector element type.
  template <typename _Tp>
    using __x86_intrin_type
      = decltype([] {
          if constexpr (is_integral_v<_Tp> || sizeof(_Tp) <= 2)
            return __x86_intrin_int<_Tp>();
          else
            return __canonical_vec_type_t<_Tp>();
        }());
383
  // The scalar value type corresponding to _Tp in the Intel intrinsic
  // interface: long long for all integers, double/float/_Float16 chosen by
  // size for floating-point types.  Note: sizes other than 8/4/2 yield
  // void (the lambda has no matching return).
  template <typename _Tp>
    using __x86_intel_intrin_value_type
      = decltype([] {
          if constexpr (is_integral_v<_Tp>)
            return 0ll;
          else if constexpr (sizeof(_Tp) == 8)
            return 0.;
          else if constexpr (sizeof(_Tp) == 4)
            return 0.f;
          else if constexpr (sizeof(_Tp) == 2)
            return 0.f16;
        }());
396
#if !_GLIBCXX_CLANG
  // overload __vec_andnot from simd_detail.h
  // Computes ~__a & __b using the dedicated ANDN instructions where
  // available.  Constant evaluation (and known-constant operands) uses
  // plain operators instead so the compiler can fold the expression.
  // NOTE(review): uses _TargetTraits here rather than the _ArchTraits
  // parameter the other helpers take — presumably the compile-time
  // baseline; confirm against simd_detail.h.
  template <__vec_builtin _TV>
  requires (sizeof(_TV) >= 16)
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_andnot(_TV __a, _TV __b)
  {
    constexpr _TargetTraits _Traits = {};
    using _Tp = __vec_value_type<_TV>;
    using _UV = __vec_builtin_type<_UInt<sizeof(_Tp)>, __width_of<_TV>>;
    if (__builtin_is_constant_evaluated()
          || (__builtin_constant_p(__a) && __builtin_constant_p(__b)))
      // operate on unsigned integers to allow ~ and & on FP vectors
      return reinterpret_cast<_TV>(~reinterpret_cast<_UV>(__a) & reinterpret_cast<_UV>(__b));
    else if constexpr (is_same_v<_Tp, _Float16>)
      // no dedicated half-precision ANDN; reuse the float flavor bit-wise
      return reinterpret_cast<_TV>(__vec_andnot(__vec_bit_cast<float>(__a),
                                                __vec_bit_cast<float>(__b)));
    else if constexpr (sizeof(_TV) == 16 && is_same_v<_Tp, float>)
      return __builtin_ia32_andnps(__a, __b);
    else if constexpr (sizeof(_TV) == 16 && is_same_v<_Tp, double>)
      return __builtin_ia32_andnpd(__a, __b);
    else if constexpr (sizeof(_TV) == 32 && is_same_v<_Tp, float>)
      return __builtin_ia32_andnps256(__a, __b);
    else if constexpr (sizeof(_TV) == 32 && is_same_v<_Tp, double>)
      return __builtin_ia32_andnpd256(__a, __b);
    else if constexpr (sizeof(_TV) == 64 && is_same_v<_Tp, float> && _Traits._M_have_avx512dq())
      return __builtin_ia32_andnps512_mask(__a, __b, _TV{}, -1);
    else if constexpr (sizeof(_TV) == 64 && is_same_v<_Tp, double> && _Traits._M_have_avx512dq())
      return __builtin_ia32_andnpd512_mask(__a, __b, _TV{}, -1);
    else
      { // integer vectors (and 64-Byte FP without AVX512DQ)
        auto __all = __vec_bit_cast<long long>(__a);
        auto __bll = __vec_bit_cast<long long>(__b);
        if constexpr (sizeof(_TV) == 16 && is_integral_v<_Tp>)
          return reinterpret_cast<_TV>(__builtin_ia32_pandn128(__all, __bll));
        else if constexpr (sizeof(_TV) == 32 && is_integral_v<_Tp> && _Traits._M_have_avx2())
          return reinterpret_cast<_TV>(__builtin_ia32_andnotsi256(__all, __bll));
        else if constexpr (sizeof(_TV) == 32 && is_integral_v<_Tp>)
          // AVX without AVX2: only the FP flavor exists for 32 Bytes
          return reinterpret_cast<_TV>(__builtin_ia32_andnpd256(__vec_bit_cast<double>(__a),
                                                                __vec_bit_cast<double>(__b)));
        else if constexpr (sizeof(_TV) == 64)
          {
            auto __ai = __vec_bit_cast<int>(__a);
            auto __bi = __vec_bit_cast<int>(__b);
            return reinterpret_cast<_TV>(
                     __builtin_ia32_pandnd512_mask(__ai, __bi, decltype(__ai)(), -1));
          }
      }
  }
#endif // not Clang
447
  // Integer overload: compare __x against __y element-wise with the
  // predicate _Cmp using the AVX-512 VPCMP (signed) / VPCMPU (unsigned)
  // instructions and return the resulting bit-mask.  -1 is the write-mask
  // (all elements enabled).
  template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
  requires is_integral_v<__vec_value_type<_TV>>
  [[__gnu__::__always_inline__]]
  inline auto
  __x86_bitmask_cmp(_TV __x, _TV __y)
  {
    constexpr int __c = int(_Cmp);
    using _Tp = __vec_value_type<_TV>;
    if constexpr (sizeof(_TV) < 16)
      // partial vectors: pad with zeros; callers must ignore the extra bits
      return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
    else if constexpr (is_signed_v<_Tp>)
      { // signed compare: VPCMP{B,W,D,Q}
        const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
        const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
        if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
          return __builtin_ia32_cmpq512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
          return __builtin_ia32_cmpd512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
          return __builtin_ia32_cmpw512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
          return __builtin_ia32_cmpb512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
          return __builtin_ia32_cmpq256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
          return __builtin_ia32_cmpd256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
          return __builtin_ia32_cmpw256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
          return __builtin_ia32_cmpb256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
          return __builtin_ia32_cmpq128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
          return __builtin_ia32_cmpd128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
          return __builtin_ia32_cmpw128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
          return __builtin_ia32_cmpb128_mask(__xi, __yi, __c, -1);
        else
          static_assert(false);
      }
    else
      { // unsigned compare: VPCMPU{B,W,D,Q}
        const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
        const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
        if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
          return __builtin_ia32_ucmpq512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
          return __builtin_ia32_ucmpd512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
          return __builtin_ia32_ucmpw512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
          return __builtin_ia32_ucmpb512_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
          return __builtin_ia32_ucmpq256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
          return __builtin_ia32_ucmpd256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
          return __builtin_ia32_ucmpw256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
          return __builtin_ia32_ucmpb256_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
          return __builtin_ia32_ucmpq128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
          return __builtin_ia32_ucmpd128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
          return __builtin_ia32_ucmpw128_mask(__xi, __yi, __c, -1);
        else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
          return __builtin_ia32_ucmpb128_mask(__xi, __yi, __c, -1);
        else
          static_assert(false);
      }
  }
521
  // Return a bit-mask with the bit set for every element of __x that is
  // +inf or -inf, using VFPCLASS.  The immediate 0x18 selects the
  // positive-infinity (0x08) and negative-infinity (0x10) classes; -1 is
  // the all-enabled write-mask.  Requires AVX512DQ (+FP16 for _Float16).
  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline auto
  __x86_bitmask_isinf(_TV __x)
  {
    static_assert(_Traits._M_have_avx512dq());
    using _Tp = __vec_value_type<_TV>;
    static_assert(is_floating_point_v<_Tp>);
    if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
      return __builtin_ia32_fpclasspd512_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
      return __builtin_ia32_fpclasspd256_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
      return __builtin_ia32_fpclasspd128_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
      return __builtin_ia32_fpclassps512_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
      return __builtin_ia32_fpclassps256_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
      return __builtin_ia32_fpclassps128_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
      return __builtin_ia32_fpclassph512_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
      return __builtin_ia32_fpclassph256_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2 && _Traits._M_have_avx512fp16())
      return __builtin_ia32_fpclassph128_mask(__x, 0x18, -1);
    else if constexpr (sizeof(_Tp) == 2 && !_Traits._M_have_avx512fp16())
      // widening to float preserves infinity classification exactly
      return __x86_bitmask_isinf(__vec_cast<float>(__x));
    else if constexpr (sizeof(_TV) < 16)
      // zero padding is finite, so the extra mask bits stay zero
      return __x86_bitmask_isinf(__vec_zero_pad_to_16(__x));
    else
      static_assert(false);
  }
555
  // Expand the bit-mask __bits into the vector-mask type _KV (each bit
  // becomes an all-ones/all-zeros element) using the AVX-512 VPMOVM2{B,W,D,Q}
  // instructions.  For vectors smaller than 16 Bytes the 128-bit expansion
  // is computed and the low part extracted.
  template <__vec_builtin _KV, _ArchTraits _Traits = {}>
  [[__gnu__::__always_inline__]]
  inline _KV
  __x86_bit_to_vecmask(std::integral auto __bits)
  {
    using _Kp = __vec_value_type<_KV>;
    // __bits must provide exactly one bit per element (or be a single Byte
    // covering a narrower vector)
    static_assert((sizeof(__bits) * __CHAR_BIT__ == __width_of<_KV>)
                    || (sizeof(__bits) == 1 && __CHAR_BIT__ > __width_of<_KV>));

    if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 64)
      return __builtin_ia32_cvtmask2b512(__bits);
    else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 32)
      return __builtin_ia32_cvtmask2b256(__bits);
    else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) == 16)
      return __builtin_ia32_cvtmask2b128(__bits);
    else if constexpr (sizeof(_Kp) == 1 && sizeof(_KV) <= 8)
      return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2b128(__bits));

    else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 64)
      return __builtin_ia32_cvtmask2w512(__bits);
    else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 32)
      return __builtin_ia32_cvtmask2w256(__bits);
    else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) == 16)
      return __builtin_ia32_cvtmask2w128(__bits);
    else if constexpr (sizeof(_Kp) == 2 && sizeof(_KV) <= 8)
      return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2w128(__bits));

    else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) == 64)
      return __builtin_ia32_cvtmask2d512(__bits);
    else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) == 32)
      return __builtin_ia32_cvtmask2d256(__bits);
    else if constexpr (sizeof(_Kp) == 4 && sizeof(_KV) <= 16)
      return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2d128(__bits));

    else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 64)
      return __builtin_ia32_cvtmask2q512(__bits);
    else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 32)
      return __builtin_ia32_cvtmask2q256(__bits);
    else if constexpr (sizeof(_Kp) == 8 && sizeof(_KV) == 16)
      return __builtin_ia32_cvtmask2q128(__bits);

    else
      static_assert(false);
  }
600
  // Element-wise select using the bit-mask __k: element i of the result is
  // __t[i] where bit i of __k is set, __f[i] otherwise (VPBLENDM{B,W,D,Q}).
  // Note the operand order of the builtins: the write-masked (second)
  // operand is selected where the mask bit is set, hence (__f, __t, __k).
  template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
  requires is_integral_v<__vec_value_type<_TV>>
  [[__gnu__::__always_inline__]]
  constexpr inline _TV
  __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
  {
    using _Tp = __vec_value_type<_TV>;
    using _Ip = __x86_intrin_int<_Tp>;
    if constexpr (!is_same_v<_Ip, _Tp>)
      // normalize the element type to what the builtins are declared with
      return reinterpret_cast<_TV>(__x86_bitmask_blend(__k, __vec_bit_cast<_Ip>(__t),
                                                       __vec_bit_cast<_Ip>(__f)));
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmq_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmd_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 2)
      return __builtin_ia32_blendmw_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 1)
      return __builtin_ia32_blendmb_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmq_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmd_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 2)
      return __builtin_ia32_blendmw_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 1)
      return __builtin_ia32_blendmb_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmq_128_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmd_128_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 2)
      return __builtin_ia32_blendmw_128_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 1)
      return __builtin_ia32_blendmb_128_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) < 16)
      // blend in a 128-bit register, then extract the low part
      return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
                                                          __vec_zero_pad_to_16(__f)));
    else
      static_assert(false);
  }
642
  // Floating-point overload: element-wise select using the bit-mask __k
  // via VBLENDMPS/VBLENDMPD.  _Float16 has no blend instruction of its own
  // and is blended bit-wise through the equally sized integer type.
  template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
  requires is_floating_point_v<__vec_value_type<_TV>>
  [[__gnu__::__always_inline__]]
  constexpr inline _TV
  __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
  {
    using _Tp = __vec_value_type<_TV>;
    if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmpd_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 64 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmps_512_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmpd_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmps_256_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
      return __builtin_ia32_blendmpd_128_mask (__f, __t, __k);
    else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
      return __builtin_ia32_blendmps_128_mask (__f, __t, __k);
    else if constexpr (is_same_v<_Tp, _Float16>)
      {
        // blending is a bit-wise operation, so the integer flavor is exact
        using _Up = __integer_from<sizeof(_Tp)>;
        return __vec_bit_cast<_Float16>(__x86_bitmask_blend(__k, __vec_bit_cast<_Up>(__t),
                                                            __vec_bit_cast<_Up>(__f)));
      }
    else if constexpr (sizeof(_TV) < 16)
      // blend in a 128-bit register, then extract the low part
      return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
                                                          __vec_zero_pad_to_16(__f)));
    else
      static_assert(false);
  }
674
  // Compact the even-indexed bits (0, 2, 4, ...) of the low
  // 2 * _OutputBits bits of __x into the low _OutputBits result bits.
  // Uses PEXT with BMI2, otherwise a SWAR shift-and-mask reduction.
  template <int _OutputBits = 4, _ArchTraits _Traits = {}>
  constexpr _Bitmask<1>
  __bit_extract_even(_UInt<1> __x)
  {
    static_assert(_OutputBits <= 4);
    // 0b01 pattern restricted to the 2*_OutputBits low bits
    constexpr _UInt<1> __mask = 0x55u >> ((4 - _OutputBits) * 2);
#if __has_builtin(__builtin_ia32_pext_si)
    if constexpr (_Traits._M_have_bmi2())
      return __builtin_ia32_pext_si(__x, __mask);
#endif
    __x &= __mask;       // keep only the even bits
    __x |= __x >> 1;     // fold each bit pair into two bits
    __x &= 0x33u;
    __x |= __x >> 2;     // fold nibbles together
    __x &= 0x0Fu;
    return __x;
  }
692
  // 16-bit input overload of __bit_extract_even (see above); delegates to
  // the 8-bit overload when at most 4 output bits are requested.
  template <int _OutputBits = 8, _ArchTraits _Traits = {}>
  constexpr _Bitmask<1>
  __bit_extract_even(_UInt<2> __x)
  {
    if constexpr (_OutputBits <= 4)
      return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
    else
      {
        static_assert(_OutputBits <= 8);
        constexpr _UInt<2> __mask = 0x5555u >> ((8 - _OutputBits) * 2);
#if __has_builtin(__builtin_ia32_pext_si)
        if constexpr (_Traits._M_have_bmi2())
          return __builtin_ia32_pext_si(__x, __mask);
#endif
        // SWAR reduction; bits above the requested output may contain
        // residue — apparently discarded by the _Bitmask conversion
        // (TODO confirm)
        __x &= __mask;
        __x |= __x >> 1;
        __x &= 0x3333u;
        __x |= __x >> 2;
        __x &= 0x0F0Fu;
        __x |= __x >> 4;
        return __x;
      }
  }
716
  // 32-bit input overload of __bit_extract_even (see above); delegates to
  // the narrower overloads for small output sizes.
  template <int _OutputBits = 16, _ArchTraits _Traits = {}>
  constexpr _Bitmask<_OutputBits>
  __bit_extract_even(_UInt<4> __x)
  {
    if constexpr (_OutputBits <= 4)
      return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
    else if constexpr (_OutputBits <= 8)
      return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
    else
      {
        static_assert(_OutputBits <= 16);
        constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - _OutputBits) * 2);
#if __has_builtin(__builtin_ia32_pext_si)
        if constexpr (_Traits._M_have_bmi2())
          return __builtin_ia32_pext_si(__x, __mask);
#endif
        // SWAR reduction; residue above bit 15 is discarded by the
        // _Bitmask<_OutputBits> conversion
        __x &= __mask;
        __x |= __x >> 1;
        __x &= 0x3333'3333u;
        __x |= __x >> 2;
        __x &= 0x0F0F'0F0Fu;
        __x |= __x >> 4;
        __x &= 0x00FF'00FFu;
        __x |= __x >> 8;
        return __x;
      }
  }
744
  // 64-bit input overload of __bit_extract_even (see above); delegates to
  // the narrower overloads for small output sizes.  On 32-bit x86 (no
  // 64-bit PEXT) the extraction is split into two 32-bit PEXTs, each
  // producing up to 16 output bits.
  template <int _OutputBits = 32, _ArchTraits _Traits = {}>
  constexpr _Bitmask<_OutputBits>
  __bit_extract_even(_UInt<8> __x)
  {
    if constexpr (_OutputBits <= 4)
      return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
    else if constexpr (_OutputBits <= 8)
      return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
    else if constexpr (_OutputBits <= 16)
      return __bit_extract_even<_OutputBits>(_UInt<4>(__x));
    else
      {
        static_assert(_OutputBits <= 32);
        constexpr _UInt<8> __mask = 0x5555'5555'5555'5555ull >> ((32 - _OutputBits) * 2);
#if __has_builtin(__builtin_ia32_pext_si)
        if constexpr (_Traits._M_have_bmi2())
          {
#if __has_builtin(__builtin_ia32_pext_di)
            return __builtin_ia32_pext_di(__x, __mask);
#else
            // _OutputBits > 16, so the low mask word is full and always
            // yields exactly 16 bits — hence the shift by 16
            return __builtin_ia32_pext_si(__x, static_cast<unsigned>(__mask))
                     | (__builtin_ia32_pext_si(__x >> 32, __mask >> 32) << 16);
#endif
          }
#endif
        // SWAR reduction; residue above bit 31 is discarded by the
        // _Bitmask<_OutputBits> conversion
        __x &= __mask;
        __x |= __x >> 1;
        __x &= 0x3333'3333'3333'3333ull;
        __x |= __x >> 2;
        __x &= 0x0F0F'0F0F'0F0F'0F0Full;
        __x |= __x >> 4;
        __x &= 0x00FF'00FF'00FF'00FFull;
        __x |= __x >> 8;
        __x &= 0x0000'FFFF'0000'FFFFull;
        __x |= __x >> 16;
        return __x;
      }
  }
783
  // input bits must be 0 for all bits > _InputBits
  // Widen each of the low __input_bits bits of __x to two adjacent result
  // bits (bit i -> bits 2i and 2i+1), e.g. 0b10 -> 0b1100.  Used to turn an
  // element mask into a Byte mask for 2-Byte elements.  With BMI2 this is
  // PDEP into the 0b01 pattern followed by "* 3" to duplicate; otherwise a
  // SWAR interleave spreads the bits apart first.  Inputs wider than 32
  // bits are processed as two halves returned in a __trivial_pair.
  template <int _InputBits = -1, _ArchTraits _Traits = {}>
  constexpr auto
  __duplicate_each_bit(unsigned_integral auto __x)
  {
    constexpr int __input_bits = _InputBits == -1 ? sizeof(__x) * __CHAR_BIT__ : _InputBits;
    static_assert(__input_bits >= 1);
    static_assert(sizeof(__x) * __CHAR_BIT__ >= __input_bits);
    if constexpr (__input_bits <= 8)
      {
        constexpr _UInt<2> __mask = 0x5555u >> ((8 - __input_bits) * 2);
        if constexpr (__input_bits == 1)
          return _UInt<1>(__x * 3u); // single bit: 0b1 -> 0b11
#if __has_builtin(__builtin_ia32_pdep_si)
        else if constexpr (_Traits._M_have_bmi2())
          // deposit into every other bit, then duplicate with * 3
          return _Bitmask<__input_bits * 2>(3u * __builtin_ia32_pdep_si(__x, __mask));
#endif
        else if constexpr (__input_bits == 2) // 0000'00BA
          // '+ 0b0010' moves B up one position without disturbing A
          return _UInt<1>(((__x + 0b0010u) & 0b0101u) * 3u); // 0B?A -> 0B0A -> BBAA
        else if constexpr (__input_bits <= 4) // 0000'DCBA
          {
            __x = ((__x << 2) | __x ) & 0b0011'0011u;             // 00DC'??BA -> 00DC'00BA
            return _UInt<1>(((__x + 0b0010'0010u) & __mask) * 3u); // -> DDCC'BBAA
          }
        else
          { // HGFE'DCBA
            _UInt<2> __y = ((__x << 4) | __x) & 0x0F0Fu; // HGFE'0000'DCBA
            __y |= __y << 2;                             // 00HG'??FE'00DC'??BA
            __y &= 0x3333u;                              // 00HG'00FE'00DC'00BA
            __y += 0x2222u;                              // 0H?G'0F?E'0D?C'0B?A
            return _UInt<2>((__y & __mask) * 3u);        // HHGG'FFEE'DDCC'BBAA
          }
      }
    else if constexpr (__input_bits <= 16)
      {
        constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - __input_bits) * 2);
#if __has_builtin(__builtin_ia32_pdep_si)
        if constexpr (_Traits._M_have_bmi2())
          return 3u * __builtin_ia32_pdep_si(__x, __mask);
#endif
        // spread the 16 bits over 32 bits (interleave with zeros), then
        // duplicate each with the '+ 0x2...2, & mask, * 3' trick from above
        _UInt<4> __y = ((__x << 8) | __x) & 0x00FF00FFu;
        __y |= __y << 4;
        __y &= 0x0F0F'0F0Fu;
        __y |= __y << 2;
        __y &= 0x3333'3333u;
        return ((__y + 0x2222'2222u) & __mask) * 3;
      }
    else if constexpr (__input_bits <= 32)
      {
        constexpr _UInt<8> __mask = 0x5555'5555'5555'5555u >> ((32 - __input_bits) * 2);
#if __has_builtin(__builtin_ia32_pdep_si)
        if constexpr (_Traits._M_have_bmi2())
          {
#if __has_builtin(__builtin_ia32_pdep_di)
            return 3ull * __builtin_ia32_pdep_di(__x, __mask);
#else
            // no 64-bit PDEP (32-bit x86): handle 16 input bits per PDEP
            const _UInt<8> __hi = 3 * __builtin_ia32_pdep_si(__x >> 16, __mask >> 32);
            return (3u * __builtin_ia32_pdep_si(__x, static_cast<unsigned>(__mask))) | __hi << 32;
#endif
          }
#endif
        // spread the 32 bits over 64 bits, then duplicate as above
        _UInt<8> __y = ((__x & 0xFFFF'0000ull) << 16) | (__x & 0x0000'FFFFu);
        __y |= __y << 8;
        __y &= 0x00FF'00FF'00FF'00FFull;
        __y |= __y << 4;
        __y &= 0x0F0F'0F0F'0F0F'0F0Full;
        __y |= __y << 2;
        __y &= 0x3333'3333'3333'3333ull;
        return ((__y + 0x2222'2222'2222'2222ull) & __mask) * 3;
      }
    else
      // > 32 input bits: return the two 64-bit halves as a pair
      return __trivial_pair { __duplicate_each_bit(_UInt<4>(__x)),
                              __duplicate_each_bit<__input_bits - 32>(
                                _Bitmask<__input_bits - 32>(__x >> 32)) };
  }
859
860 template <int _InputBits = -1, typename _U0, typename _U1>
861 constexpr auto
862 __duplicate_each_bit(const __trivial_pair<_U0, _U1>& __x)
863 {
864 static_assert(_InputBits != -1 || is_unsigned_v<_U1>);
865 constexpr int __input_bits = _InputBits == -1 ? (sizeof(_U0) + sizeof(_U1)) * __CHAR_BIT__
866 : _InputBits;
867 constexpr int __in0 = min(int(sizeof(_U0)) * __CHAR_BIT__, __input_bits);
868 constexpr int __in1 = __input_bits - __in0;
869 if constexpr (__in1 == 0)
870 return __duplicate_each_bit<__in0>(__x._M_first);
871 else
872 return __trivial_pair { __duplicate_each_bit<__in0>(__x._M_first),
873 __duplicate_each_bit<__in1>(__x._M_second) };
874 }
875
  /** @internal
   * Element-wise complex multiplication for vectors of interleaved complex values
   * (even lanes = real parts, odd lanes = imaginary parts).
   *
   * With xr = dup_even(x), xi = dup_odd(x), and ys = swap_neighbors(y), the FMADDSUB
   * instruction computes xr*y -/+ xi*ys (subtract in even lanes, add in odd lanes),
   * which yields (xr*yr - xi*yi) in the even lanes and (xr*yi + xi*yr) in the odd
   * lanes — exactly the complex product.
   */
  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
    [[__gnu__::__always_inline__]]
    inline _TV
    __x86_complex_multiplies(_TV __x, _TV __y)
    {
      using _Tp = __vec_value_type<_TV>;
      using _VO = _VecOps<_TV>;

      static_assert(_Traits._M_have_fma());
      static_assert(is_floating_point_v<_Tp>);

      // without AVX512FP16 there is no _Float16 FMADDSUB: widen to float and convert back
      if constexpr (!_Traits._M_have_avx512fp16() && sizeof(_Tp) == 2)
        return __vec_cast<_Tp>(__x86_complex_multiplies(__vec_cast<float>(__x),
                                                        __vec_cast<float>(__y)));
      else if constexpr (sizeof(_TV) < 16)
        // pad to a full XMM register, compute there, then extract the low part
        return _VO::_S_extract(__x86_complex_multiplies(__vec_zero_pad_to_16(__x),
                                                        __vec_zero_pad_to_16(__y)));

      else
        {
          _TV __x_real = _VO::_S_dup_even(__x);
          _TV __x_imag = _VO::_S_dup_odd(__x);
          _TV __y_swapped = _VO::_S_swap_neighbors(__y);

          // dispatch on register size and element width; -1 is the all-ones write mask,
          // 0x04 selects _MM_FROUND_CUR_DIRECTION for the rounding-capable 512-bit forms
          if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 2)
            return __builtin_ia32_vfmaddsubph128_mask(__x_real, __y, __x_imag * __y_swapped, -1);
          else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 2)
            return __builtin_ia32_vfmaddsubph256_mask(__x_real, __y, __x_imag * __y_swapped, -1);
          else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 2)
            return __builtin_ia32_vfmaddsubph512_mask(
                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);

          else if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 4)
            return __builtin_ia32_vfmaddsubps(__x_real, __y, __x_imag * __y_swapped);
          else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
            return __builtin_ia32_vfmaddsubps256(__x_real, __y, __x_imag * __y_swapped);
          else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
            return __builtin_ia32_vfmaddsubps512_mask(
                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);

          else if constexpr (sizeof(__x) == 16 && sizeof(_Tp) == 8)
            return __builtin_ia32_vfmaddsubpd(__x_real, __y, __x_imag * __y_swapped);
          else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
            return __builtin_ia32_vfmaddsubpd256(__x_real, __y, __x_imag * __y_swapped);
          else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
            return __builtin_ia32_vfmaddsubpd512_mask(
                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);

          else
            static_assert(false);
        }
    }
928
  /** @internal
   * Conversion to/from _Float16 via the F16C / AVX512F conversion instructions.
   * Converts @p __v to a vector of _UV's value type. Non-float sources/destinations are
   * routed through float first. The immediate 4 passed to VCVTPS2PH selects rounding
   * according to MXCSR (_MM_FROUND_CUR_DIRECTION).
   */
  // FIXME: Work around PR121688
  template <__vec_builtin _UV, __vec_builtin _TV>
    [[__gnu__::__always_inline__]]
    inline _UV
    __x86_cvt_f16c(_TV __v)
    {
      constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
      constexpr bool __to_f16 = !__from_f16;
      if constexpr (__to_f16 && !is_same_v<__vec_value_type<_TV>, float>)
        // e.g. double -> _Float16: go through float first
        return __x86_cvt_f16c<_UV>(__vec_cast<float>(__v));
      else if constexpr (__from_f16 && !is_same_v<__vec_value_type<_UV>, float>)
        // e.g. _Float16 -> double: convert to float here, then cast to the final type
        return __vec_cast<_UV>(__x86_cvt_f16c<__vec_builtin_type<float, __width_of<_TV>>>(__v));
      else if constexpr (__from_f16)
        { // _Float16 -> float; the builtins take an integer vector of the same bits
          const auto __vi = __vec_bit_cast<__x86_intrin_int<_Float16>>(__v);
          if constexpr (sizeof(_TV) == 4)
            return __vec_split_lo(__builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi)));
          else if constexpr (sizeof(_TV) == 8)
            return __builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi));
          else if constexpr (sizeof(_TV) == 16)
            return __builtin_ia32_vcvtph2ps256(__vi);
          else if constexpr (sizeof(_TV) == 32)
            return __builtin_ia32_vcvtph2ps512_mask(__vi, __vec_builtin_type<float, 16>(), -1, 4);
          else if constexpr (sizeof(_TV) >= 64)
            // wider than the largest single conversion: recurse on the two halves
            return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
                                __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
          else
            static_assert(false);
        }
      // float -> _Float16 from here on; results come back as short vectors, hence the casts
      else if constexpr (sizeof(_TV) == 8)
        return reinterpret_cast<_UV>(
                 __vec_split_lo(__vec_split_lo(__builtin_ia32_vcvtps2ph(
                   __vec_zero_pad_to_16(__v), 4))));
      else if constexpr (sizeof(_TV) == 16)
        return reinterpret_cast<_UV>(__vec_split_lo(__builtin_ia32_vcvtps2ph(__v, 4)));
      else if constexpr (sizeof(_TV) == 32)
        return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph256(__v, 4));
      else if constexpr (sizeof(_TV) == 64)
        return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph512_mask(
                 __v, 4, __vec_builtin_type<short, 16>(), -1));
      else if constexpr (sizeof(_TV) >= 128)
        // wider than one ZMM worth of floats: recurse on the two halves
        return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
                            __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
      else
        static_assert(false);
    }
975
976 /** @internal
977 * AVX instructions typically work per 128-bit chunk. Horizontal operations thus produce vectors
978 * where the two 128-bit chunks in the center are swapped. This function works as a fix-up step.
979 */
980 template <__vec_builtin _TV>
981 [[__gnu__::__always_inline__]]
982 inline _TV
983 __x86_swizzle4x64_acbd(_TV __x)
984 {
985 static_assert(sizeof(_TV) == 32);
986 using _UV = __vec_builtin_type_bytes<long long, 32>;
987 return reinterpret_cast<_TV>(__builtin_shufflevector(reinterpret_cast<_UV>(__x), _UV(),
988 0, 2, 1, 3));
989 }
990
  /** @internal
   * Like __builtin_convertvector but with a precondition that input values are either 0 or -1.
   * This allows using saturating pack instructions (which preserve 0/-1 exactly) instead of
   * a general element-wise conversion.
   */
  template <__vec_builtin _To, __vec_builtin _From>
    [[__gnu__::__always_inline__]]
    inline _To
    __x86_cvt_vecmask(_From __k)
    {
      using _T0 = __vec_value_type<_From>;
      using _T1 = __vec_value_type<_To>;
      if constexpr (sizeof(_From) > sizeof(_To) && sizeof(_From) < 16)
        {
          // sub-XMM input: pad to 16 bytes, convert, and extract the relevant part;
          // the padded _To type grows by the same factor as the padded input
          using _ToPadded = __vec_builtin_type_bytes<_T1, sizeof(_To) * 16 / sizeof(_From)>;
          return _VecOps<_To>::_S_extract(__x86_cvt_vecmask<_ToPadded>(__vec_zero_pad_to_16(__k)));
        }
      else if constexpr (sizeof(_T0) == 2 && sizeof(_T1) == 1) // -> packsswb
        {
          // pack the input against itself and keep the low half (the packed input)
          if constexpr (sizeof(__k) == 16)
            return reinterpret_cast<_To>(__vec_split_lo(__builtin_ia32_packsswb128(__k, __k)));
          else if constexpr (sizeof(__k) == 32)
            // packsswb256 works per 128-bit lane; swizzle fixes the 64-bit chunk order
            return reinterpret_cast<_To>(
                     __vec_split_lo(__x86_swizzle4x64_acbd(
                       __builtin_ia32_packsswb256(__k, __k))));
          else
            static_assert(false);
        }
      else
        static_assert(false, "TODO");
    }
1020
1021 /** @internal
1022 * Overload that concatenates @p __k0 and @p __k1 while converting.
1023 */
1024 template <__vec_builtin _To, __vec_builtin _From>
1025 [[__gnu__::__always_inline__]]
1026 inline _To
1027 __x86_cvt_vecmask(_From __k0, _From __k1)
1028 {
1029 using _T0 = __vec_value_type<_From>;
1030 using _T1 = __vec_value_type<_To>;
1031 static_assert(sizeof(_From) >= 16);
1032 if constexpr (sizeof(_T0) == 2 && sizeof(_T1) == 1) // -> packsswb
1033 {
1034 if constexpr (sizeof(__k0) == 16)
1035 return reinterpret_cast<_To>(__builtin_ia32_packsswb128(__k0, __k1));
1036 else if constexpr (sizeof(__k0) == 32)
1037 return reinterpret_cast<_To>(__x86_swizzle4x64_acbd(
1038 __builtin_ia32_packsswb256(__k0, __k1)));
1039 else
1040 static_assert(false);
1041 }
1042 else
1043 static_assert(false, "TODO");
1044 }
1045
  /** @internal
   * AVX512 masked (converting) loads: load the elements of @p __mem selected by the
   * bitmask @p __k into a _TV, zeroing unselected lanes; converts from _Up where the
   * element types differ.
   *
   * @note AVX512VL and AVX512BW are required
   */
  template <__vec_builtin _TV, typename _Up, _ArchTraits _Traits = {}>
    [[__gnu__::__always_inline__]]
    inline _TV
    __x86_masked_load(const _Up* __mem, unsigned_integral auto __k)
    {
      static_assert(_Traits._M_have_avx512vl() && _Traits._M_have_avx512bw());
      using _Tp = __vec_value_type<_TV>;
      constexpr int __n = __width_of<_TV>;
      if constexpr (!__converts_trivially<_Up, _Tp>)
        {
          // non-trivial conversion: masked-load in the source's canonical type, then convert
          const auto __uvec
            = __x86_masked_load<__vec_builtin_type<__canonical_vec_type_t<_Up>, __n>>(__mem, __k);
          return __vec_cast<_TV>(__uvec);
        }
      else if constexpr (sizeof(_TV) < 16)
        {
          // smaller than one XMM register: load as 16 bytes and extract; the mask's
          // unset high bits keep the padding lanes zero
          return _VecOps<_TV>::_S_extract(
                   __x86_masked_load<__vec_builtin_type_bytes<_Tp, 16>>(__mem, __k));
        }
      else if constexpr (sizeof(_TV) > 64)
        {
          // larger than one ZMM register: recurse on the two halves, splitting the mask
          return __vec_concat(
                   __x86_masked_load<__vec_builtin_type<_Tp, __n / 2>>(__mem, __k),
                   __x86_masked_load<__vec_builtin_type<_Tp, __n / 2>>(__mem + __n / 2, __k >> __n / 2)
                 );
        }
      else if constexpr (sizeof(_TV) == 64)
        {
          // zero-masking forms: __z supplies the zeros for unselected lanes
          const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
          const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 64> __z = {};
          if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
            return __builtin_ia32_loadups512_mask(__src, __z, __k);
          else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
            return __builtin_ia32_loadupd512_mask(__src, __z, __k);
          else if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi512_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi512_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi512_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi512_mask(__src, __z, __k));
          else
            static_assert(false);
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
          const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 32> __z = {};
          if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
            return __builtin_ia32_loadups256_mask(__src, __z, __k);
          else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
            return __builtin_ia32_loadupd256_mask(__src, __z, __k);
          else if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi256_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi256_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi256_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi256_mask(__src, __z, __k));
          else
            static_assert(false);
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          const auto* __src = reinterpret_cast<const __x86_intrin_type<_Up>*>(__mem);
          const __vec_builtin_type_bytes<__x86_intrin_type<_Up>, 16> __z = {};
          if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
            return __builtin_ia32_loadups128_mask(__src, __z, __k);
          else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
            return __builtin_ia32_loadupd128_mask(__src, __z, __k);
          else if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquqi128_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddquhi128_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqusi128_mask(__src, __z, __k));
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(__builtin_ia32_loaddqudi128_mask(__src, __z, __k));
          else
            static_assert(false);
        }
      else
        static_assert(false);
    }
1137
  /** @internal
   * AVX(2) masked loads (only trivial conversions).
   *
   * The mask @p __k is a vector mask; VMASKMOV uses the most significant bit of each
   * 32/64-bit mask element to select lanes (unselected lanes read as zero).
   * Only 4- and 8-byte elements are supported by these instructions.
   */
  template <__vec_builtin _TV, typename _Up, __vec_builtin _KV, _ArchTraits _Traits = {}>
    [[__gnu__::__always_inline__]]
    inline _TV
    __x86_masked_load(const _Up* __mem, const _KV __k)
    {
      using _Tp = __vec_value_type<_TV>;
      static_assert(_Traits._M_have_avx() && __converts_trivially<_Up, _Tp> && sizeof(_Up) >= 4);
      constexpr int __n = __width_of<_TV>;
      // mask reinterpreted as the integer vector type the builtins expect
      using _IV = __vec_builtin_type<__x86_intrin_int<_Tp>, __n>;
      const auto __vk = reinterpret_cast<_IV>(__k);
      if constexpr (sizeof(_TV) < 16)
        // pad vector and mask to one XMM register; padding mask lanes are zero
        return _VecOps<_TV>::_S_extract(__x86_masked_load<__vec_builtin_type_bytes<_Tp, 16>>(
                 __mem, __vec_zero_pad_to_16(__k)));
      else if constexpr (_Traits._M_have_avx2() && is_integral_v<_Up>)
        { // AVX2: integer maskload forms
          const auto* __src
            = reinterpret_cast<const __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
          if constexpr (sizeof(_Up) == 4 && sizeof(_TV) == 32)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadd256(__src, __vk));
          else if constexpr (sizeof(_Up) == 4 && sizeof(_TV) == 16)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadd(__src, __vk));
          else if constexpr (sizeof(_Up) == 8 && sizeof(_TV) == 32)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadq256(__src, __vk));
          else if constexpr (sizeof(_Up) == 8 && sizeof(_TV) == 16)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadq(__src, __vk));
          else
            static_assert(false);
        }
      else if constexpr (sizeof(_Up) == 4)
        { // AVX1 (or FP data): use the ps form; memory is reinterpreted as float
          const auto* __src = reinterpret_cast<const __vec_builtin_type<float, __n>*>(__mem);
          if constexpr (sizeof(_TV) == 32)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadps256(__src, __vk));
          else if constexpr (sizeof(_TV) == 16)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadps(__src, __vk));
          else
            static_assert(false);
        }
      else
        { // 8-byte elements: pd form; memory is reinterpreted as double
          const auto* __src = reinterpret_cast<const __vec_builtin_type<double, __n>*>(__mem);
          if constexpr (sizeof(_TV) == 32)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadpd256(__src, __vk));
          else if constexpr (sizeof(_TV) == 16)
            return reinterpret_cast<_TV>(__builtin_ia32_maskloadpd(__src, __vk));
          else
            static_assert(false);
        }
    }
1190
  /** @internal
   * AVX512 masked stores: store the elements of @p __v selected by the bitmask @p __k to
   * @p __mem, converting from _Tp to _Up on the fly. Integral narrowing uses the
   * down-converting VPMOV* store forms; other conversions recurse after a __vec_cast;
   * widening conversions that would exceed 64 bytes split the vector in halves.
   *
   * @note AVX512VL is required
   */
  template <__vec_builtin _TV, typename _Up>
    [[__gnu__::__always_inline__]]
    inline void
    __x86_masked_store(const _TV __v, _Up* __mem, unsigned_integral auto __k)
    {
      using _Tp = __vec_value_type<_TV>;
      constexpr int __n = __width_of<_TV>;
      // __v reinterpreted as the intrinsic element type the builtins expect
      [[maybe_unused]] const auto __w = __vec_bit_cast<__x86_intrin_type<_Tp>>(__v);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
            { // integral narrowing: single down-converting masked store (VPMOV*)
              auto* __dst = reinterpret_cast<
                              __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
              if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_pmovwb512mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
                __builtin_ia32_pmovdb512mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
                __builtin_ia32_pmovdw512mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
                __builtin_ia32_pmovqb512mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
                __builtin_ia32_pmovqw512mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
                __builtin_ia32_pmovqd512mem_mask(__dst, __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (__converts_trivially<_Tp, _Up>)
            { // same representation: plain masked store
              auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
              if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
                __builtin_ia32_storeups512_mask(__dst, __w, __k);
              else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
                __builtin_ia32_storeupd512_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 1)
                __builtin_ia32_storedquqi512_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_storedquhi512_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4)
                __builtin_ia32_storedqusi512_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8)
                __builtin_ia32_storedqudi512_mask(__dst, __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (sizeof(_Tp) >= sizeof(_Up))
            { // non-widening conversion: convert, then recurse into one of the cases above
              if constexpr (is_floating_point_v<_Tp> && is_integral_v<_Up>
                              && sizeof(_Tp) > sizeof(_Up))
                // FP -> narrower int: go via a same-size integer so the VPMOV* path applies
                __x86_masked_store(__vec_cast<__integer_from<sizeof(_Tp)>>(__v), __mem, __k);
              else
                __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
            }
          else
            { // widening conversion would exceed 64 bytes: split vector and mask in halves
              __x86_masked_store(__vec_split_lo(__v), __mem, _Bitmask<__n / 2>(__k));
              __x86_masked_store(__vec_split_hi(__v), __mem + __n / 2,
                                 _Bitmask<__n / 2>(__k >> (__n / 2)));
            }
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
            { // integral narrowing: VPMOV* down-converting store
              auto* __dst = reinterpret_cast<
                              __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
              if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_pmovwb256mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
                __builtin_ia32_pmovdb256mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
                __builtin_ia32_pmovdw256mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
                __builtin_ia32_pmovqb256mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
                __builtin_ia32_pmovqw256mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
                __builtin_ia32_pmovqd256mem_mask(__dst, __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (__converts_trivially<_Tp, _Up>)
            { // same representation: plain masked store
              auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
              if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
                __builtin_ia32_storeups256_mask(__dst, __w, __k);
              else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
                __builtin_ia32_storeupd256_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 1)
                __builtin_ia32_storedquqi256_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_storedquhi256_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4)
                __builtin_ia32_storedqusi256_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8)
                __builtin_ia32_storedqudi256_mask(__dst, __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (2 * sizeof(_Tp) >= sizeof(_Up))
            { // converted vector fits in 64 bytes: convert and recurse
              __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
            }
          else
            { // conversion would exceed 64 bytes: split vector and mask in halves
              __x86_masked_store(__vec_split_lo(__v), __mem, _Bitmask<__n / 2>(__k));
              __x86_masked_store(__vec_split_hi(__v), __mem + __n / 2,
                                 _Bitmask<__n / 2>(__k >> (__n / 2)));
            }
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) > sizeof(_Up) && is_integral_v<_Tp> && is_integral_v<_Up>)
            { // integral narrowing: VPMOV* down-converting store
              auto* __dst = reinterpret_cast<
                              __vec_builtin_type<__x86_intrin_int<_Up>, __n>*>(__mem);
              if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_pmovwb128mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1)
                __builtin_ia32_pmovdb128mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2)
                __builtin_ia32_pmovdw128mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1)
                __builtin_ia32_pmovqb128mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2)
                __builtin_ia32_pmovqw128mem_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4)
                // this builtin takes a different pointer type than its siblings
                __builtin_ia32_pmovqd128mem_mask(reinterpret_cast<unsigned long long*>(__mem),
                                                 __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (__converts_trivially<_Tp, _Up>)
            { // same representation: plain masked store
              auto* __dst = reinterpret_cast<__x86_intrin_type<_Up>*>(__mem);
              if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 4)
                __builtin_ia32_storeups128_mask(__dst, __w, __k);
              else if constexpr (is_floating_point_v<_Tp> && sizeof(_Tp) == 8)
                __builtin_ia32_storeupd128_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 1)
                __builtin_ia32_storedquqi128_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 2)
                __builtin_ia32_storedquhi128_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 4)
                __builtin_ia32_storedqusi128_mask(__dst, __w, __k);
              else if constexpr (sizeof(_Tp) == 8)
                __builtin_ia32_storedqudi128_mask(__dst, __w, __k);
              else
                static_assert(false);
            }
          else if constexpr (4 * sizeof(_Tp) >= sizeof(_Up))
            { // converted vector fits in 64 bytes: convert and recurse
              __x86_masked_store(__vec_cast<_Up>(__v), __mem, __k);
            }
          else
            { // conversion would exceed 64 bytes: convert and store each half separately
              __x86_masked_store(__vec_cast<_Up>(__vec_split_lo(__v)), __mem,
                                 _Bitmask<__n / 2>(__k));
              __x86_masked_store(__vec_cast<_Up>(__vec_split_hi(__v)), __mem + __n / 2,
                                 _Bitmask<__n / 2>(__k >> (__n / 2)));
            }
        }
      else
        // sub-XMM vector: pad to 16 bytes; mask bits above __n are expected to be zero,
        // so the padding lanes are not stored
        __x86_masked_store(__vec_zero_pad_to_16(__v), __mem, __k);
    }
1363
1364 /** @internal
1365 * AVX(2) masked stores
1366 */
1367 template <__vec_builtin _TV, typename _Up, __vec_builtin _KV, _ArchTraits _Traits = {}>
1368 [[__gnu__::__always_inline__]]
1369 inline void
1370 __x86_masked_store(const _TV __v, _Up* __mem, const _KV __k)
1371 {
1372 using _Tp = __vec_value_type<_TV>;
1373 constexpr int __n = __width_of<_TV>;
1374 static_assert(sizeof(_Tp) == 4 || sizeof(_Tp) == 8);
1375 auto* __dst = reinterpret_cast<
1376 __vec_builtin_type<__x86_intrin_type<_Up>, __n>*>(__mem);
1377 [[maybe_unused]] const auto __w = __vec_bit_cast<__x86_intrin_type<_Tp>>(__v);
1378 if constexpr (sizeof(_TV) < 16)
1379 __x86_masked_store(__vec_zero_pad_to_16(__v), __mem, __vec_zero_pad_to_16(__k));
1380 else if constexpr (_Traits._M_have_avx2() && is_integral_v<_Tp>)
1381 {
1382 if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
1383 __builtin_ia32_maskstored256(__dst, __k, __w);
1384 else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
1385 __builtin_ia32_maskstored(__dst, __k, __w);
1386 else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
1387 __builtin_ia32_maskstoreq256(__dst, __k, __w);
1388 else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
1389 __builtin_ia32_maskstoreq(__dst, __k, __w);
1390 else
1391 static_assert(false);
1392 }
1393 else
1394 {
1395 if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 4)
1396 __builtin_ia32_maskstoreps256(__dst, __k, __w);
1397 else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 4)
1398 __builtin_ia32_maskstoreps(__dst, __k, __w);
1399 else if constexpr (sizeof(_TV) == 32 && sizeof(_Tp) == 8)
1400 __builtin_ia32_maskstorepd256(__dst, __k, __w);
1401 else if constexpr (sizeof(_TV) == 16 && sizeof(_Tp) == 8)
1402 __builtin_ia32_maskstorepd(__dst, __k, __w);
1403 else
1404 static_assert(false);
1405 }
1406 }
1407} // namespace simd
1408_GLIBCXX_END_NAMESPACE_VERSION
1409} // namespace std
1410
1411#pragma GCC diagnostic pop
1412#endif // C++26
1413#endif // _GLIBCXX_SIMD_X86_H
ISO C++ entities toplevel namespace is std.