libstdc++
experimental/bits/simd_x86.h
// Simd x86 specific implementations -*- C++ -*-

// Copyright (C) 2020-2026 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// __to_masktype {{{
// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
// __vector_type_t.
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  {
    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
	     __x._M_data);
  }

template <typename _TV,
	  typename _TVT
	  = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
	  typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
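
// E.g. with _Tp = float and _Np = 4 the result is a _SimdWrapper of four
// 32-bit signed integers holding the same bits; this integer representation
// is what the mask implementations below operate on.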

// }}}
// __interleave128_lo {{{
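// Interleaves the elements from the low half of each 128-bit chunk of __a
// and __b, i.e. punpckl* semantics applied per 128-bit lane. E.g. 16-byte
// vectors with 4 elements yield {__a[0], __b[0], __a[1], __b[1]}.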
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
	  typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
		 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
		 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
		 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
		 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
		 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
		 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
		 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
		 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
		 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
		 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
		 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
		 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
		 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
		 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
		 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
		 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
		 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
		 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
		 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
		 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
		 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
		 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
		 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
		 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
		 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
		 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }

// }}}
// __is_zero{{{
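// Returns true iff all bits of __a are zero.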
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
	if constexpr (__have_avx)
	  {
	    if constexpr (_TVT::template _S_is<float, 8>)
	      return _mm256_testz_ps(__a, __a);
	    else if constexpr (_TVT::template _S_is<double, 4>)
	      return _mm256_testz_pd(__a, __a);
	    else if constexpr (sizeof(_Tp) == 32)
	      return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
	    else if constexpr (_TVT::template _S_is<float>)
	      return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
	    else if constexpr (_TVT::template _S_is<double, 2>)
	      return _mm_testz_pd(__a, __a);
	    else
	      return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
	  }
	else if constexpr (__have_sse4_1)
	  return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
				 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
    // Fall through: either constant evaluation with a vector wider than 8
    // bytes, or no PTEST available at run time. (This must not be an `else`
    // branch, otherwise the non-constant-evaluated pre-SSE4.1 path would
    // fall off the end of the function without returning.)
    const auto __b = __vector_bitcast<_LLong>(__a);
    if constexpr (sizeof(__b) == 16)
      return (__b[0] | __b[1]) == 0;
    else if constexpr (sizeof(__b) == 32)
      return __is_zero(__lo128(__b) | __hi128(__b));
    else if constexpr (sizeof(__b) == 64)
      return __is_zero(__lo256(__b) | __hi256(__b));
    else
      __assert_unreachable<_Tp>();
  }

// }}}
// __movemask{{{
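// Collects the most significant bit of each float/double element
// (movmskps/movmskpd) or of each byte (pmovmskb) into the low bits of the
// returned int.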
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
	if constexpr (_TVT::template _S_is<float>)
	  return _mm256_movemask_ps(__to_intrin(__a));
	else if constexpr (_TVT::template _S_is<double>)
	  return _mm256_movemask_pd(__to_intrin(__a));
	else
	  return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }

// }}}
// __testz{{{
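// Returns non-zero iff (__a & __b) is entirely zero, i.e. the ZF result of
// the PTEST/VTESTP[SD] instructions.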
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
						    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
	if constexpr (sizeof(_TI) == 32)
	  {
	    if constexpr (_TVT::template _S_is<float>)
	      return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
	    else if constexpr (_TVT::template _S_is<double>)
	      return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
	    else
	      return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
	  }
	else if constexpr (_TVT::template _S_is<float> && __have_avx)
	  return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
	else if constexpr (_TVT::template _S_is<double> && __have_avx)
	  return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
	else if constexpr (__have_sse4_1)
	  return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
				 __intrin_bitcast<__m128i>(__to_intrin(__b)));
	else
	  return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }

// }}}
// __testc{{{
// requires SSE4.1 or above
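// Returns non-zero iff (~__a & __b) is entirely zero (the CF result of
// PTEST), i.e. all bits set in __b are also set in __a.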
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
						    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
	if constexpr (_TVT::template _S_is<float>)
	  return _mm256_testc_ps(__a, __b);
	else if constexpr (_TVT::template _S_is<double>)
	  return _mm256_testc_pd(__a, __b);
	else
	  return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
	static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
	return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
			       __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }

// }}}
// __testnzc{{{
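// Returns non-zero iff (__a & __b) != 0 and (~__a & __b) != 0, i.e. the
// "not zero and not carry" result of PTEST.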
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
						    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
	if constexpr (sizeof(_TI) == 32)
	  {
	    if constexpr (_TVT::template _S_is<float>)
	      return _mm256_testnzc_ps(__a, __b);
	    else if constexpr (_TVT::template _S_is<double>)
	      return _mm256_testnzc_pd(__a, __b);
	    else
	      return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
	  }
	else if constexpr (_TVT::template _S_is<float> && __have_avx)
	  return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
	else if constexpr (_TVT::template _S_is<double> && __have_avx)
	  return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
	else if constexpr (__have_sse4_1)
	  return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
				   __intrin_bitcast<__m128i>(__to_intrin(__b)));
	else
	  return __movemask(0 == __and(__a, __b)) == 0
		 && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }

// }}}
// __xzyw{{{
// shuffles the complete vector, swapping the inner two quarters. Often useful
// for AVX for fixing up a shuffle result.
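// E.g. a 32-byte vector of 4 doubles {x0, x1, x2, x3} becomes
// {x0, x2, x1, x3}.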
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 16)
      {
	const auto __x = __vector_bitcast<conditional_t<
	  is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
	return reinterpret_cast<_Tp>(
	  decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
	const auto __x = __vector_bitcast<conditional_t<
	  is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
	return reinterpret_cast<_Tp>(
	  decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
	const auto __x = __vector_bitcast<conditional_t<
	  is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
	return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
						   __x[5], __x[2], __x[3],
						   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }

// }}}
// __maskload_epi32{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi32(const int* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi32(__ptr, __k);
    else
      return _mm256_maskload_epi32(__ptr, __k);
  }

// }}}
// __maskload_epi64{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi64(const _LLong* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi64(__ptr, __k);
    else
      return _mm256_maskload_epi64(__ptr, __k);
  }

// }}}
// __maskload_ps{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_ps(const float* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_ps(__ptr, __k);
    else
      return _mm256_maskload_ps(__ptr, __k);
  }

// }}}
// __maskload_pd{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_pd(const double* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_pd(__ptr, __k);
    else
      return _mm256_maskload_pd(__ptr, __k);
  }

// }}}

#ifdef _GLIBCXX_CLANG
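// Expands the bit mask __k into a vector mask with _Np elements of
// sizeof(_Tp) bytes each, using the AVX512 mask-to-vector (vpmovm2[bwdq])
// builtins.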
template <size_t _Np, typename _Tp, typename _Kp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __movm(_Kp __k) noexcept
  {
    static_assert(is_unsigned_v<_Kp>);
    if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
      {
	if constexpr (_Np <= 16 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2b128(__k);
	else if constexpr (_Np <= 32 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2b256(__k);
	else
	  return __builtin_ia32_cvtmask2b512(__k);
      }
    else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
      {
	if constexpr (_Np <= 8 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2w128(__k);
	else if constexpr (_Np <= 16 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2w256(__k);
	else
	  return __builtin_ia32_cvtmask2w512(__k);
      }
    else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
      {
	if constexpr (_Np <= 4 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2d128(__k);
	else if constexpr (_Np <= 8 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2d256(__k);
	else
	  return __builtin_ia32_cvtmask2d512(__k);
      }
    else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
      {
	if constexpr (_Np <= 2 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2q128(__k);
	else if constexpr (_Np <= 4 && __have_avx512vl)
	  return __builtin_ia32_cvtmask2q256(__k);
	else
	  return __builtin_ia32_cvtmask2q512(__k);
      }
    else
      __assert_unreachable<_Tp>();
  }
#endif // _GLIBCXX_CLANG

#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
#include "simd_x86_conversions.h"
#endif

// ISA & type detection {{{
template <typename _Tp>
  constexpr bool
  __is_x86_ps()
  {
    return is_same_v<_Tp, float>;
  }

template <typename _Tp>
  constexpr bool
  __is_x86_pd()
  {
#if __LDBL_MANT_DIG__ == __DBL_MANT_DIG__
    if constexpr (is_same_v<_Tp, long double>)
      return true;
#endif
    return is_same_v<_Tp, double>;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_ps()
  {
    return __have_sse
	     && __is_x86_ps<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_pd()
  {
    return __have_sse2
	     && __is_x86_pd<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_ps()
  {
    return __have_avx
	     && __is_x86_ps<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_pd()
  {
    return __have_avx
	     && __is_x86_pd<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_ps()
  {
    return __have_avx512f
	     && __is_x86_ps<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_pd()
  {
    return __have_avx512f
	     && __is_x86_pd<_Tp>()
	     && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

// }}}
struct _MaskImplX86Mixin;

// _CommonImplX86 {{{
struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  // _S_converts_via_decomposition {{{
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool
    _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
		    && sizeof(_From) == 8 && _ToSize == 16)
	return (sizeof(_To) == 2 && !__have_ssse3)
	       || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
	return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
		&& !__have_avx512dq)
	       || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
		   && _ToSize == 16);
      else if constexpr (is_integral_v<_From> && is_floating_point_v<_To>
			 && sizeof(_From) == 8 && !__have_avx512dq)
	return (sizeof(_To) == 4 && _ToSize == 16)
	       || (sizeof(_To) == 8 && _ToSize < 64);
      else
	return false;
    }
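
  // E.g. an unsigned long long -> unsigned short conversion with a 16-byte
  // destination and no SSSE3 hits the first branch above: without pshufb,
  // the 8-to-2 byte truncation is reported as converting via scalar
  // decomposition.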

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();

  // }}}
#endif
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      if (__builtin_is_constant_evaluated())
	_CommonImplBuiltin::_S_store(__x, __addr);
      else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
	{
	  const auto __v = __to_intrin(__x);

	  if constexpr (_Bytes & 1)
	    {
	      if constexpr (_Bytes < 16)
		_mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
				     __intrin_bitcast<__m128i>(__v));
	      else if constexpr (_Bytes < 32)
		_mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
					__intrin_bitcast<__m256i>(__v));
	      else
		_mm512_mask_storeu_epi8(__addr,
					0xffffffffffffffffull >> (64 - _Bytes),
					__intrin_bitcast<__m512i>(__v));
	    }
	  else if constexpr (_Bytes & 2)
	    {
	      if constexpr (_Bytes < 16)
		_mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
				      __intrin_bitcast<__m128i>(__v));
	      else if constexpr (_Bytes < 32)
		_mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
					 __intrin_bitcast<__m256i>(__v));
	      else
		_mm512_mask_storeu_epi16(__addr,
					 0xffffffffull >> (32 - _Bytes / 2),
					 __intrin_bitcast<__m512i>(__v));
	    }
	  else if constexpr (_Bytes & 4)
	    {
	      if constexpr (_Bytes < 16)
		_mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
				      __intrin_bitcast<__m128i>(__v));
	      else if constexpr (_Bytes < 32)
		_mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
					 __intrin_bitcast<__m256i>(__v));
	      else
		_mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
					 __intrin_bitcast<__m512i>(__v));
	    }
	  else
	    {
	      static_assert(
		_Bytes > 16,
		"_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
		"- 1)) != 0 is impossible");
	      if constexpr (_Bytes < 32)
		_mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
					 __intrin_bitcast<__m256i>(__v));
	      else
		_mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
					 __intrin_bitcast<__m512i>(__v));
	    }
	}
      else
	_CommonImplBuiltin::_S_store(__x, __addr);
    }

  // }}}
  // _S_store_bool_array(_BitMask) {{{
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if (__builtin_is_constant_evaluated())
	_CommonImplBuiltin::_S_store_bool_array(__x, __mem);
      else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
	_S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
			    [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
			      if constexpr (_Np <= 16)
				return _mm_movm_epi8(__x._M_to_bits());
			      else if constexpr (_Np <= 32)
				return _mm256_movm_epi8(__x._M_to_bits());
			      else if constexpr (_Np <= 64)
				return _mm512_movm_epi8(__x._M_to_bits());
			      else
				__assert_unreachable<_SizeConstant<_Np>>();
			    }()),
		      __mem);
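      // With BMI2, _pdep deposits one mask bit into each output byte. E.g.
      // _pdep_u32(0b1011, 0x01010101U) == 0x01000101, which stores the bytes
      // {1, 1, 0, 1} (little-endian), i.e. true, true, false, true.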
      else if constexpr (__have_bmi2)
	{
	  if constexpr (_Np <= 4)
	    _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
	  else
	    __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		constexpr size_t __offset = __i * sizeof(size_t);
		constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
		if constexpr (__todo == 1)
		  __mem[__offset] = __x[__offset];
		else
		  {
		    const auto __bools =
#ifdef __x86_64__
		      _pdep_u64(__x.template _M_extract<__offset>()._M_to_bits(),
				0x0101010101010101ULL);
#else  // __x86_64__
		      _pdep_u32(
			__x.template _M_extract<__offset>()._M_to_bits(),
			0x01010101U);
#endif // __x86_64__
		    _S_store<__todo>(__bools, __mem + __offset);
		  }
	      });
	}
      else if constexpr (__have_sse2 && _Np > 7)
	__execute_n_times<__div_roundup(_Np, 16)>(
	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
	    constexpr int __offset = __i * 16;
	    constexpr int __todo = std::min(16, int(_Np) - __offset);
	    const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
	    __vector_type16_t<_UChar> __bools;
	    if constexpr (__have_avx512f)
	      {
		auto __as32bits
		  = _mm512_maskz_mov_epi32(__bits, __to_intrin(
						     __vector_broadcast<16>(1)));
		auto __as16bits
		  = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
					      __todo > 8 ? __hi256(__as32bits)
							 : __m256i()));
		__bools = __vector_bitcast<_UChar>(
		  _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
	      }
	    else
	      {
		using _V = __vector_type_t<_UChar, 16>;
		auto __tmp = _mm_cvtsi32_si128(__bits);
		__tmp = _mm_unpacklo_epi8(__tmp, __tmp);
		__tmp = _mm_unpacklo_epi16(__tmp, __tmp);
		__tmp = _mm_unpacklo_epi32(__tmp, __tmp);
		_V __tmp2 = reinterpret_cast<_V>(__tmp);
		__tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
			     1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
		__bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
	      }
	    _S_store<__todo>(__bools, __mem + __offset);
	  });
      else
	_CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }

  // }}}
  // _S_blend_avx512 {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t matching the value type for the
  // bitmask __k
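  // E.g. with __k = 0b0110 and 4-element vectors the result is
  // {__a[0], __b[1], __b[2], __a[3]}: bit __i of __k selects __b[__i].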
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
#ifdef _GLIBCXX_CLANG
      return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
#else
      using _IntT
	= conditional_t<(sizeof(_Tp) > 2),
			conditional_t<sizeof(_Tp) == 4, int, long long>,
			conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
	{
	  if constexpr (sizeof(_Tp) == 1)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 2)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 4)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 8)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
	}
      else if constexpr (sizeof(_TV) == 32)
	{
	  if constexpr (sizeof(_Tp) == 1)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 2)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 4)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 8)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
	}
      else if constexpr (sizeof(_TV) == 16)
	{
	  if constexpr (sizeof(_Tp) == 1)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 2)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 4)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
	  else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
	    return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
	  else if constexpr (sizeof(_Tp) == 8)
	    return reinterpret_cast<_TV>(
	      __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
	}
#endif
    }

  // }}}
  // _S_blend_intrin {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
  // Bytes wide
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      constexpr struct
      {
	_GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
						  __m128 __k) const noexcept
	{
	  return __builtin_ia32_blendvps(__a, __b, __k);
	}
	_GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
						   __m128d __k) const noexcept
	{
	  return __builtin_ia32_blendvpd(__a, __b, __k);
	}
	_GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
						   __m128i __k) const noexcept
	{
	  return reinterpret_cast<__m128i>(
	    __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
				       reinterpret_cast<__v16qi>(__b),
				       reinterpret_cast<__v16qi>(__k)));
	}
	_GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
						  __m256 __k) const noexcept
	{
	  return __builtin_ia32_blendvps256(__a, __b, __k);
	}
	_GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
						   __m256d __k) const noexcept
	{
	  return __builtin_ia32_blendvpd256(__a, __b, __k);
	}
	_GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
						   __m256i __k) const noexcept
	{
	  if constexpr (__have_avx2)
	    return reinterpret_cast<__m256i>(
	      __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
					 reinterpret_cast<__v32qi>(__b),
					 reinterpret_cast<__v32qi>(__k)));
	  else
	    return reinterpret_cast<__m256i>(
	      __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
					 reinterpret_cast<__v8sf>(__b),
					 reinterpret_cast<__v8sf>(__k)));
	}
      } __eval;
      return __eval(__a, __b, __k);
    }

  // }}}
  // _S_blend {{{
  // Returns: __k ? __at1 : __at0
  // TODO: reverse __at0 and __at1 to match COND_EXPR
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
	     _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
	  && __at1._M_is_constprop())
	return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
		 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		   return __k[__i] ? __at1[__i] : __at0[__i];
		 });
      else if constexpr (sizeof(__at0) == 64
			 || (__have_avx512vl && sizeof(__at0) >= 16))
	return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
	{
	  static_assert((__have_avx512vl && sizeof(__at0) < 16)
			|| !__have_avx512vl);
	  constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
	  return __vector_bitcast<_Tp, _Np>(
	    _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
			    __vector_bitcast<_Tp, __size>(__at1)));
	}
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
	     _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
	  || (__kk._M_is_constprop() && __at0._M_is_constprop()
	      && __at1._M_is_constprop()))
	{
	  auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
	  if (__r._M_is_constprop())
	    return __r;
	}
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
		    && (sizeof(_Tp) >= 4 || __have_avx512bw))
	// convert to bitmask and call overload above
	return _S_blend(
	  _SimdWrapper<bool, _Np>(
	    __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
	      ._M_to_bits()),
	  __at0, __at1);
      else
	{
	  // Since GCC does not assume __k to be a mask, using the builtin
	  // conditional operator introduces an extra compare against 0 before
	  // blending. So we rather call the intrinsic here.
	  if constexpr (__have_sse4_1)
	    return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
				   __to_intrin(__at1));
	  else
	    return __or(__andnot(__kk, __at0), __and(__kk, __at1));
	}
    }

  // }}}
};

// }}}
// _SimdImplX86 {{{
template <typename _Abi, typename>
  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size
	= (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
	  : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
	  : 16;

    using _MaskImpl = typename _Abi::_MaskImpl;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
		     const _Up* __mem) noexcept
      {
	static_assert(_Np == _S_size<_Tp>);
	if constexpr (is_same_v<_Tp, _Up> || // no conversion
		      (sizeof(_Tp) == sizeof(_Up)
		       && is_integral_v<_Tp>
			    == is_integral_v<_Up>) // conversion via bit
						   // reinterpretation
		     )
	  {
	    [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
	    if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
			  && sizeof(_Tp) == 1)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_epi8(__intrin, __kk, __mem));
		else if constexpr (sizeof(__merge) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
		else if constexpr (sizeof(__merge) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
			       && sizeof(_Tp) == 2)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_epi16(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
			       && sizeof(_Tp) == 4 && is_integral_v<_Up>)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_epi32(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
			       && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_ps(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_ps(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_ps(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
			       && is_integral_v<_Up>)
	      {
		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
		__merge
		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
			 __vector_bitcast<_Tp, _Np>(
			   __maskload_epi32(reinterpret_cast<const int*>(__mem),
					    __to_intrin(__k))));
	      }
	    else if constexpr (__have_avx && sizeof(_Tp) == 4)
	      {
		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
		__merge
		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
			 __vector_bitcast<_Tp, _Np>(
			   __maskload_ps(reinterpret_cast<const float*>(__mem),
					 __to_intrin(__k))));
	      }
	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
			       && sizeof(_Tp) == 8 && is_integral_v<_Up>)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_epi64(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
			       && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
	      {
		const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
		if constexpr (sizeof(__intrin) == 16)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm_mask_loadu_pd(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 32)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm256_mask_loadu_pd(__intrin, __kk, __mem));
		else if constexpr (sizeof(__intrin) == 64)
		  __merge = __vector_bitcast<_Tp, _Np>(
		    _mm512_mask_loadu_pd(__intrin, __kk, __mem));
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
			       && is_integral_v<_Up>)
	      {
		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
		__merge
		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
			 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
			   reinterpret_cast<const _LLong*>(__mem),
			   __to_intrin(__k))));
	      }
	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
	      {
		static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
		__merge
		  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
			 __vector_bitcast<_Tp, _Np>(
			   __maskload_pd(reinterpret_cast<const double*>(__mem),
					 __to_intrin(__k))));
	      }
	    else
	      _BitOps::_S_bit_iteration(
		_MaskImpl::_S_to_bits(__k),
		[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
		});
	  }
	/* Very uncertain that the following improves anything. Needs
	 * benchmarking before it's activated.
	else if constexpr (sizeof(_Up) <= 8 && // no long double
			   !__converts_via_decomposition_v<
			     _Up, _Tp,
			     sizeof(__merge)> // conversion via decomposition
					      // is better handled via the
					      // bit_iteration fallback below
	)
	  {
	    // TODO: copy pattern from _S_masked_store, which doesn't resort to
	    // fixed_size
	    using _Ap = simd_abi::deduce_t<_Up, _Np>;
	    using _ATraits = _SimdTraits<_Up, _Ap>;
	    using _AImpl = typename _ATraits::_SimdImpl;
	    typename _ATraits::_SimdMember __uncvted{};
	    typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
	      _S_convert<_Up>(__k);
	    __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
	    _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
	    _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
	  }
	*/
	else
	  __merge = _Base::_S_masked_load(__merge, __k, __mem);
	return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
			    _SimdWrapper<bool, _Np> __k)
      {
	[[maybe_unused]] const auto __vi = __to_intrin(__v);
	if constexpr (sizeof(__vi) == 64)
	  {
	    static_assert(sizeof(__v) == 64 && __have_avx512f);
	    if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
	      _mm512_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
	      _mm512_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm512_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm512_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm512_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm512_mask_storeu_pd(__mem, __k, __vi);
	      }
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (sizeof(__vi) == 32)
	  {
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm256_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm256_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm256_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm256_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm256_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm256_mask_storeu_pd(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f
			       && (sizeof(_Tp) >= 4 || __have_avx512bw))
	      {
		// use a 512-bit maskstore, using zero-extension of the bitmask
		_S_masked_store_nocvt(
		  _SimdWrapper64<_Tp>(
		    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
	      }
	    else
	      _S_masked_store_nocvt(__v, __mem,
				    _MaskImpl::template _S_to_maskvector<
				      __int_for_sizeof_t<_Tp>, _Np>(__k));
	  }
	else if constexpr (sizeof(__vi) == 16)
	  {
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm_mask_storeu_pd(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f
			       && (sizeof(_Tp) >= 4 || __have_avx512bw))
	      {
		// use a 512-bit maskstore, using zero-extension of the bitmask
		_S_masked_store_nocvt(
		  _SimdWrapper64<_Tp>(
		    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
	      }
	    else
	      _S_masked_store_nocvt(__v, __mem,
				    _MaskImpl::template _S_to_maskvector<
				      __int_for_sizeof_t<_Tp>, _Np>(__k));
	  }
	else
	  __assert_unreachable<_Tp>();
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
			    _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
      {
	if constexpr (sizeof(__v) <= 16)
	  {
	    [[maybe_unused]] const auto __vi
	      = __intrin_bitcast<__m128i>(__as_vector(__v));
	    [[maybe_unused]] const auto __ki
	      = __intrin_bitcast<__m128i>(__as_vector(__k));
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
			       && is_integral_v<_Tp>)
	      _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 4)
	      _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
			       __vector_bitcast<float>(__vi));
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
			       && is_integral_v<_Tp>)
	      _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
	      _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
			       __vector_bitcast<double>(__vi));
	    else
	      _Base::_S_masked_store_nocvt(__v, __mem, __k);
	  }
	else if constexpr (sizeof(__v) == 32)
	  {
	    [[maybe_unused]] const auto __vi
	      = __intrin_bitcast<__m256i>(__as_vector(__v));
	    [[maybe_unused]] const auto __ki
	      = __intrin_bitcast<__m256i>(__as_vector(__k));
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
			       && is_integral_v<_Tp>)
	      _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
	    else if constexpr (sizeof(_Tp) == 4)
	      _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
				  __vector_bitcast<float>(__v));
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
			       && is_integral_v<_Tp>)
	      _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
				     __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
	      _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
				  __vector_bitcast<double>(__v));
	    else
	      _Base::_S_masked_store_nocvt(__v, __mem, __k);
	  }
	else
	  __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_masked_store {{{
    template <typename _Tp, size_t _Np, typename _Up>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
		      const _MaskMember<_Tp> __k) noexcept
      {
	if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
		      && sizeof(_Tp) > sizeof(_Up)
		      && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
		      && (sizeof(__v) == 64 || __have_avx512vl))
	  { // truncating store
	    const auto __vi = __to_intrin(__v);
	    const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
	    if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			  && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  _Base::_S_masked_store(__v, __mem, __k);
      }

    // }}}
    // _S_multiplies {{{
    template <typename _V, typename _VVT = _VectorTraits<_V>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _V
      _S_multiplies(_V __x, _V __y)
      {
	using _Tp = typename _VVT::value_type;
	if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
	    || __y._M_is_constprop())
	  return __as_vector(__x) * __as_vector(__y);
	else if constexpr (sizeof(_Tp) == 1)
	  {
	    if constexpr (sizeof(_V) == 2)
	      {
		const auto __xs = reinterpret_cast<short>(__x._M_data);
		const auto __ys = reinterpret_cast<short>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
		  ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
	      }
	    else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
	      {
		const auto __xi = reinterpret_cast<int>(__x._M_data);
		const auto __yi = reinterpret_cast<int>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 3>>(
		  ((__xi * __yi) & 0xff)
		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
		  | ((__xi >> 16) * (__yi & 0xff0000)));
	      }
	    else if constexpr (sizeof(_V) == 4)
	      {
		const auto __xi = reinterpret_cast<int>(__x._M_data);
		const auto __yi = reinterpret_cast<int>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 4>>(
		  ((__xi * __yi) & 0xff)
		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
		  | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
		  | ((__xi >> 24) * (__yi & 0xff000000u)));
	      }
	    else if constexpr (sizeof(_V) == 8 && __have_avx2
			       && is_signed_v<_Tp>)
	      return __convert<typename _VVT::type>(
		__vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
		* __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
	    else if constexpr (sizeof(_V) == 8 && __have_avx2
			       && is_unsigned_v<_Tp>)
	      return __convert<typename _VVT::type>(
		__vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
		* __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
	    else
	      {
		// codegen of `x*y` is suboptimal (as of GCC 9.0.1)
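		// Strategy: multiply even and odd byte lanes separately as
		// shorts. __even keeps each product's low byte in the even
		// position; __odd shifts the odd bytes down and multiplies
		// against the masked-off high bytes, so the product lands in
		// the high byte. The two halves are then blended per byte.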
		constexpr size_t __full_size = _VVT::_S_full_size;
		constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
		using _ShortW = _SimdWrapper<short, _Np>;
		const _ShortW __even = __vector_bitcast<short, _Np>(__x)
				       * __vector_bitcast<short, _Np>(__y);
		_ShortW __high_byte = _ShortW()._M_data - 256;
		//[&]() { asm("" : "+x"(__high_byte._M_data)); }();
		const _ShortW __odd
		  = (__vector_bitcast<short, _Np>(__x) >> 8)
		    * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
		if constexpr (__have_avx512bw && sizeof(_V) > 2)
		  return _CommonImplX86::_S_blend_avx512(
		    0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
		    __vector_bitcast<_Tp>(__odd));
		else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
		  return _CommonImplX86::_S_blend_intrin(
		    __to_intrin(__high_byte), __to_intrin(__even),
		    __to_intrin(__odd));
		else
		  return __to_intrin(
		    __or(__andnot(__high_byte, __even), __odd));
	      }
	  }
	else
	  return _Base::_S_multiplies(__x, __y);
      }

    // }}}
    // _S_divides {{{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	if (!__builtin_is_constant_evaluated()
	    && !__builtin_constant_p(__y._M_data))
	  if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
	    { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
	      // Note that using floating-point division is likely to raise the
	      // *Inexact* exception flag and thus appears like an invalid
	      // "as-if" transformation. However, C++ doesn't specify how the
	      // fpenv can be observed and points to C. C says that function
	      // calls are assumed to potentially raise fp exceptions, unless
	      // documented otherwise. Consequently, operator/, which is a
	      // function call, may raise fp exceptions.
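	      // The detour through floating point is exact for the values:
	      // for sizeof(_Tp) <= 4 the _Float type chosen below represents
	      // every value of _Tp exactly, and the correctly rounded
	      // quotient converts back to the exact integer quotient.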
	      /*const struct _CsrGuard
	      {
		const unsigned _M_data = _mm_getcsr();
		_CsrGuard()
		{
		  _mm_setcsr(0x9f80); // turn off FP exceptions and
				      // flush-to-zero
		}
		~_CsrGuard() { _mm_setcsr(_M_data); }
	      } __csr;*/
	      using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
	      constexpr size_t __n_intermediate
		= std::min(_Np, (__have_avx512f ? 64
				 : __have_avx   ? 32
						: 16)
				  / sizeof(_Float));
	      using _FloatV = __vector_type_t<_Float, __n_intermediate>;
	      constexpr size_t __n_floatv
		= __div_roundup(_Np, __n_intermediate);
	      using _R = __vector_type_t<_Tp, _Np>;
	      const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
	      const auto __yf = __convert_all<_FloatV, __n_floatv>(
		_Abi::__make_padding_nonzero(__as_vector(__y)));
	      return __call_with_n_evaluations<__n_floatv>(
		[](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		  return __vector_convert<_R>(__quotients...);
		},
		[&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
		  -> _SimdWrapper<_Float, __n_intermediate>
		{
#if __RECIPROCAL_MATH__
		  // If -freciprocal-math is active, using the `/` operator is
		  // incorrect because it may be translated to an imprecise
		  // multiplication with reciprocal. We need to use inline
		  // assembly to force a real division.
		  _FloatV __r;
		  if constexpr (__have_avx) // -mno-sse2avx is irrelevant
					    // because once -mavx is given, GCC
					    // emits VEX encoded vdivp[sd]
		    {
		      if constexpr (sizeof(_Tp) == 4)
			asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
			    : "=x"(__r)
			    : "x"(__xf[__i]), "x"(__yf[__i]));
		      else
			asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
			    : "=x"(__r)
			    : "x"(__xf[__i]), "x"(__yf[__i]));
		    }
		  else
		    {
		      __r = __xf[__i];
		      if constexpr (sizeof(_Tp) == 4)
			// tie __r to operand %0: the non-VEX divp[sd] is
			// read-modify-write
			asm("divpd\t{%1, %0|%0, %1}"
			    : "=x"(__r)
			    : "x"(__yf[__i]), "0"(__r));
		      else
			asm("divps\t{%1, %0|%0, %1}"
			    : "=x"(__r)
			    : "x"(__yf[__i]), "0"(__r));
1494 }
1495 return __r;
1496#else
1497 return __xf[__i] / __yf[__i];
1498#endif
1499 });
1500 }
1501 /* 64-bit int division is potentially optimizable via double division if
1502 * the value in __x is small enough and the conversion between
1503 * int<->double is efficient enough:
1504 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
1505 sizeof(_Tp) == 8)
1506 {
1507 if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1508 {
1509 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
1510 0xffe0'0000'0000'0000ull}))
1511 {
1512 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
1513 }
1514 }
1515 }
1516 */
1517 return _Base::_S_divides(__x, __y);
1518 }
1519#else
1520 using _Base::_S_divides;
1521#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
1522
1523 // }}}
1524 // _S_modulus {{{
1525 template <typename _Tp, size_t _Np>
1526 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1527 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1528 {
1529 if (__builtin_is_constant_evaluated()
1530 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1531 return _Base::_S_modulus(__x, __y);
1532 else
1533 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1534 }
1535
1536 // }}}
1537 // _S_bit_shift_left {{{
1538 // Notes on UB. C++2a [expr.shift] says:
1539 // -1- [...] The operands shall be of integral or unscoped enumeration type
1540 // and integral promotions are performed. The type of the result is that
1541 // of the promoted left operand. The behavior is undefined if the right
1542 // operand is negative, or greater than or equal to the width of the
1543 // promoted left operand.
1544 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1545 // 2^N, where N is the width of the type of the result.
1546 //
1547 // C++17 [expr.shift] says:
1548 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1549 // bits are zero-filled. If E1 has an unsigned type, the value of the
1550 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1551 // representable in the result type. Otherwise, if E1 has a signed type
1552 // and non-negative value, and E1 × 2^E2 is representable in the
1553 // corresponding unsigned type of the result type, then that value,
1554 // converted to the result type, is the resulting value; otherwise, the
1555 // behavior is undefined.
1556 //
1557 // Consequences:
1558 // With C++2a, signed and unsigned types have the same UB
1559 // characteristics:
1560 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1561 //
1562 // With C++17 there's little room for optimization because the standard
1563 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1564 // shifts of short and char elements must be implemented as if the
1565 // shifted bits could spill into neighboring elements.
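// Example: for an unsigned char element with value 0x80, << 1 is
// evaluated as int(128) << 1 == 256, and only the conversion back to
// the element type truncates the result to 0. Shift counts up to 31
// therefore stay well-defined even for char elements, which is why the
// constant-count path below returns a zero vector for 8 <= __y < 32
// instead of treating such counts as UB.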
1566 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1567 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1568 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1569 _S_bit_shift_left(_Tp __xx, int __y)
1570 {
1571 using _V = typename _TVT::type;
1572 using _Up = typename _TVT::value_type;
1573 _V __x = __xx;
1574 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1575 if (__builtin_is_constant_evaluated())
1576 return __x << __y;
1577#if __cplusplus > 201703
1578 // after C++17, signed shifts have no UB, and behave just like unsigned
1579 // shifts
1580 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1581 return __vector_bitcast<_Up>(
1582 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1583 __y));
1584#endif
1585 else if constexpr (sizeof(_Up) == 1)
1586 {
1587 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1588 if (__builtin_constant_p(__y))
1589 {
1590 if (__y == 0)
1591 return __x;
1592 else if (__y == 1)
1593 return __x + __x;
1594 else if (__y == 2)
1595 {
1596 __x = __x + __x;
1597 return __x + __x;
1598 }
1599 else if (__y > 2 && __y < 8)
1600 {
1601 if constexpr (sizeof(__x) > sizeof(unsigned))
1602 {
1603 const _UChar __mask = 0xff << __y; // precomputed vector
1604 return __vector_bitcast<_Up>(
1605 __vector_bitcast<_UChar>(
1606 __vector_bitcast<unsigned>(__x) << __y)
1607 & __mask);
1608 }
1609 else
1610 {
1611 const unsigned __mask
1612 = (0xff & (0xff << __y)) * 0x01010101u;
1613 return reinterpret_cast<_V>(
1614 static_cast<__int_for_sizeof_t<_V>>(
1615 unsigned(
1616 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1617 << __y)
1618 & __mask));
1619 }
1620 }
1621 else if (__y >= 8 && __y < 32)
1622 return _V();
1623 else
1624 __builtin_unreachable();
1625 }
1626 // general strategy in the following: use an sllv instead of an sll
1627 // instruction, because it's 2 to 4 times faster:
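// (An sllv instruction shifts each element by its own count; the count
// is broadcast into every element here to get the uniform shift.)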
1628 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1629 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1630 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1631 _mm256_set1_epi16(__y))));
1632 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1633 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1634 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1635 _mm512_set1_epi16(__y))));
1636 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1637 {
1638 const auto __shift = _mm512_set1_epi16(__y);
1639 return __vector_bitcast<_Up>(
1640 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1641 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1642 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1643 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1644 }
1645 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1646 {
1647#if 1
1648 const auto __shift = _mm_cvtsi32_si128(__y);
1649 auto __k
1650 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1651 __k |= _mm256_srli_epi16(__k, 8);
1652 return __vector_bitcast<_Up>(_mm256_sll_epi16(__ix, __shift)
1653 & __k);
1654#else
1655 const _Up __k = 0xff << __y;
1656 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1657 & __k;
1658#endif
1659 }
1660 else
1661 {
1662 const auto __shift = _mm_cvtsi32_si128(__y);
1663 auto __k
1664 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1665 __k |= _mm_srli_epi16(__k, 8);
1666 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1667 }
1668 }
1669 return __x << __y;
1670 }
1671
1672 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1673 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1674 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1675 {
1676 using _V = typename _TVT::type;
1677 using _Up = typename _TVT::value_type;
1678 _V __x = __xx;
1679 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1680 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1681 if (__builtin_is_constant_evaluated())
1682 return __x << __y;
1683#if __cplusplus > 201703
1684 // after C++17, signed shifts have no UB, and behave just like unsigned
1685 // shifts
1686 else if constexpr (is_signed_v<_Up>)
1687 return __vector_bitcast<_Up>(
1688 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1689 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1690#endif
1691 else if constexpr (sizeof(_Up) == 1)
1692 {
1693 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1694 return __vector_bitcast<_Up>(__concat(
1695 _mm512_cvtepi16_epi8(
1696 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1697 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1698 _mm512_cvtepi16_epi8(
1699 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1700 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1701 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1702 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1703 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1704 _mm512_cvtepu8_epi16(__iy))));
1705 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1706 return __intrin_bitcast<_V>(
1707 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1708 _mm_cvtepu8_epi16(__iy))));
1709 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1710 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1711 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1712 _mm256_cvtepu8_epi16(__iy))));
1713 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1714 return __intrin_bitcast<_V>(
1715 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1716 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1717 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1718 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1719 {
1720 auto __mask
1721 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1722 auto __x4
1723 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1724 __x4 &= char(0xf0);
1725 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1726 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1727 __mask += __mask;
1728 auto __x2
1729 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1730 __x2 &= char(0xfc);
1731 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1732 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1733 __mask += __mask;
1734 auto __x1 = __x + __x;
1735 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1736 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1737 return __x
1738 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1739 }
1740 else if constexpr (sizeof(__x) == 16)
1741 {
1742 auto __mask
1743 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1744 auto __x4
1745 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1746 __x4 &= char(0xf0);
1747 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1748 __mask += __mask;
1749 auto __x2
1750 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1751 __x2 &= char(0xfc);
1752 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1753 __mask += __mask;
1754 auto __x1 = __x + __x;
1755 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1756 return __x
1757 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1758 }
1759 else
1760 return __x << __y;
1761 }
1762 else if constexpr (sizeof(_Up) == 2)
1763 {
1764 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1765 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1766 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1767 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1768 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1769 return __vector_bitcast<_Up>(
1770 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1771 _mm512_castsi256_si512(__iy))));
1772 else if constexpr (sizeof __ix == 32 && __have_avx2)
1773 {
1774 const auto __ux = __vector_bitcast<unsigned>(__x);
1775 const auto __uy = __vector_bitcast<unsigned>(__y);
1776 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1777 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1778 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1779 }
1780 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1781 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1782 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1783 return __intrin_bitcast<_V>(
1784 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1785 _mm512_castsi128_si512(__iy))));
1786 else if constexpr (sizeof __ix == 16 && __have_avx2)
1787 {
1788 const auto __ux = __vector_bitcast<unsigned>(__ix);
1789 const auto __uy = __vector_bitcast<unsigned>(__iy);
1790 return __intrin_bitcast<_V>(_mm_blend_epi16(
1791 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1792 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1793 }
1794 else if constexpr (sizeof __ix == 16)
1795 {
1796 using _Float4 = __vector_type_t<float, 4>;
1797 using _Int4 = __vector_type_t<int, 4>;
1798 using _UInt4 = __vector_type_t<unsigned, 4>;
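// The per-element power of two is built via the float exponent field:
// __y + 127 (the bias, 0x3f8 >> 3) shifted left by 23 is the bit
// pattern of 2.0f to the power __y, so the two float->int conversions
// (for the low and high 16-bit halves of each 32-bit lane) yield
// 1 << __y as a 16-bit multiplier, turning the shift into a multiply.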
1799 const _UInt4 __yu
1800 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1801 return __x
1802 * __intrin_bitcast<_V>(
1803 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1804 reinterpret_cast<_Float4>(__yu << 23)))
1805 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1806 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1807 << 16));
1808 }
1809 else
1810 __assert_unreachable<_Tp>();
1811 }
1812 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1813 && !__have_avx2)
1814 // latency is suboptimal, but throughput is at full speed
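// Here, too, the shift becomes a multiplication: (__y << 23)
// + 0x3f80'0000 overlays __y onto the exponent field of 1.0f, so the
// float->int conversion yields exactly 1 << __y per 32-bit element.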
1815 return __intrin_bitcast<_V>(
1816 __vector_bitcast<unsigned>(__ix)
1817 * __vector_convert<__vector_type16_t<int>>(
1818 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1819 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1820 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1821 && !__have_avx2)
1822 {
1823 const auto __lo = _mm_sll_epi64(__ix, __iy);
1824 const auto __hi
1825 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1826 if constexpr (__have_sse4_1)
1827 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1828 else
1829 return __vector_bitcast<_Up>(
1830 _mm_move_sd(__vector_bitcast<double>(__hi),
1831 __vector_bitcast<double>(__lo)));
1832 }
1833 else
1834 return __x << __y;
1835 }
1836#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1837
1838 // }}}
1839 // _S_bit_shift_right {{{
1840#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1841 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1842 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1843 _S_bit_shift_right(_Tp __xx, int __y)
1844 {
1845 using _V = typename _TVT::type;
1846 using _Up = typename _TVT::value_type;
1847 _V __x = __xx;
1848 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1849 if (__builtin_is_constant_evaluated())
1850 return __x >> __y;
1851 else if (__builtin_constant_p(__y)
1852 && is_unsigned_v<_Up>
1853 && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1854 return _V();
1855 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1856 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1857 & _Up(0xff >> __y);
1858 //}}}
1859 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1860 return __intrin_bitcast<_V>(
1861 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1862 >> (__y + 8))
1863 << 8)
1864 | (__vector_bitcast<_UShort>(
1865 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1866 >> __y)
1867 >> 8));
1868 //}}}
1869 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1870 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1871 {
1872 if (__y > 32)
1873 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1874 & _Up(0xffff'ffff'0000'0000ull))
1875 | __vector_bitcast<_Up>(
1876 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1877 >> 32)
1878 >> (__y - 32));
1879 else
1880 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1881 >> __y)
1882 | __vector_bitcast<_Up>(
1883 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1884 >> __y);
1885 }
1886 //}}}
1887 else
1888 return __x >> __y;
1889 }
1890
1891 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1892 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1893 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1894 {
1895 using _V = typename _TVT::type;
1896 using _Up = typename _TVT::value_type;
1897 _V __x = __xx;
1898 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1899 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1900 if (__builtin_is_constant_evaluated()
1901 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1902 return __x >> __y;
1903 else if constexpr (sizeof(_Up) == 1) //{{{
1904 {
1905 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1906 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1907 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1908 _mm_cvtepi8_epi16(__iy))
1909 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1910 _mm_cvtepu8_epi16(__iy))));
1911 else if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1912 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1913 is_signed_v<_Up>
1914 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1915 _mm256_cvtepi8_epi16(__iy))
1916 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1917 _mm256_cvtepu8_epi16(__iy))));
1918 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1919 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1920 is_signed_v<_Up>
1921 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1922 _mm512_cvtepi8_epi16(__iy))
1923 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1924 _mm512_cvtepu8_epi16(__iy))));
1925 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1926 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1927 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1928 0x5555'5555'5555'5555ull,
1929 _mm512_srav_epi16(
1930 _mm512_slli_epi16(__ix, 8),
1931 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1932 _mm512_set1_epi16(8)))));
1933 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1934 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1935 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1936 0x5555'5555'5555'5555ull,
1937 _mm512_srlv_epi16(
1938 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1939 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1940 /* This has better throughput but higher latency than the impl below
1941 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1942 is_unsigned_v<_Up>)
1943 {
1944 const auto __shorts = __to_intrin(_S_bit_shift_right(
1945 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1946 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1947 return __vector_bitcast<_Up>(
1948 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1949 }
1950 */
1951 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1952 // the following uses vpsr[al]vd, which requires AVX2
1953 if constexpr (is_signed_v<_Up>)
1954 {
1955 const auto __r3 = __vector_bitcast<_UInt>(
1956 (__vector_bitcast<int>(__x)
1957 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1958 & 0xff000000u;
1959 const auto __r2
1960 = __vector_bitcast<_UInt>(
1961 ((__vector_bitcast<int>(__x) << 8)
1962 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1963 & 0xff000000u;
1964 const auto __r1
1965 = __vector_bitcast<_UInt>(
1966 ((__vector_bitcast<int>(__x) << 16)
1967 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1968 & 0xff000000u;
1969 const auto __r0 = __vector_bitcast<_UInt>(
1970 (__vector_bitcast<int>(__x) << 24)
1971 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1972 return __vector_bitcast<_Up>(__r3 | (__r2 >> 8) | (__r1 >> 16)
1973 | (__r0 >> 24));
1974 }
1975 else
1976 {
1977 const auto __r3 = (__vector_bitcast<_UInt>(__x)
1978 >> (__vector_bitcast<_UInt>(__y) >> 24))
1979 & 0xff000000u;
1980 const auto __r2
1981 = ((__vector_bitcast<_UInt>(__x) << 8)
1982 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1983 & 0xff000000u;
1984 const auto __r1
1985 = ((__vector_bitcast<_UInt>(__x) << 16)
1986 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1987 & 0xff000000u;
1988 const auto __r0
1989 = (__vector_bitcast<_UInt>(__x) << 24)
1990 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1991 return __vector_bitcast<_Up>(__r3 | (__r2 >> 8) | (__r1 >> 16)
1992 | (__r0 >> 24));
1993 }
1994 else if constexpr (__have_sse4_1
1995 && is_unsigned_v<_Up> && sizeof(__x) > 2)
1996 {
1997 auto __x128 = __vector_bitcast<_Up>(__ix);
1998 auto __mask
1999 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
2000 auto __x4 = __vector_bitcast<_Up>(
2001 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
2002 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2003 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
2004 __mask += __mask;
2005 auto __x2 = __vector_bitcast<_Up>(
2006 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
2007 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2008 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
2009 __mask += __mask;
2010 auto __x1 = __vector_bitcast<_Up>(
2011 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
2012 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2013 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
2014 return __intrin_bitcast<_V>(
2015 __x128
2016 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2017 == 0)); // y > 7 nulls the result
2018 }
2019 else if constexpr (__have_sse4_1
2020 && is_signed_v<_Up> && sizeof(__x) > 2)
2021 {
2022 auto __mask = __vector_bitcast<_UChar>(
2023 __vector_bitcast<_UShort>(__iy) << 5);
2024 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2025 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
2026 };
2027 auto __xh = __vector_bitcast<short>(__ix);
2028 auto __xl = __vector_bitcast<short>(__ix) << 8;
2029 auto __xh4 = __xh >> 4;
2030 auto __xl4 = __xl >> 4;
2031 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2032 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
2033 __xl = __vector_bitcast<short>(
2034 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2035 __to_intrin(__xl4)));
2036 __mask += __mask;
2037 auto __xh2 = __xh >> 2;
2038 auto __xl2 = __xl >> 2;
2039 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2040 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2041 __xl = __vector_bitcast<short>(
2042 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2043 __to_intrin(__xl2)));
2044 __mask += __mask;
2045 auto __xh1 = __xh >> 1;
2046 auto __xl1 = __xl >> 1;
2047 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2048 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2049 __xl = __vector_bitcast<short>(
2050 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2051 __to_intrin(__xl1)));
2052 return __intrin_bitcast<_V>(
2053 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2054 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2055 >> 8))
2056 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2057 == 0)); // y > 7 nulls the result
2058 }
2059 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2060 {
2061 auto __mask
2062 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2063 auto __x4 = __vector_bitcast<_Up>(
2064 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2065 __x = __mask > 0x7f ? __x4 : __x;
2066 __mask += __mask;
2067 auto __x2 = __vector_bitcast<_Up>(
2068 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2069 __x = __mask > 0x7f ? __x2 : __x;
2070 __mask += __mask;
2071 auto __x1 = __vector_bitcast<_Up>(
2072 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2073 __x = __mask > 0x7f ? __x1 : __x;
2074 return __x
2075 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2076 }
2077 else if constexpr (sizeof(__x) > 2) // signed SSE2
2078 {
2079 static_assert(is_signed_v<_Up>);
2080 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2081 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2082 auto __xh = __vector_bitcast<short>(__x);
2083 auto __xl = __vector_bitcast<short>(__x) << 8;
2084 auto __xh4 = __xh >> 4;
2085 auto __xl4 = __xl >> 4;
2086 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2087 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2088 __maskh += __maskh;
2089 __maskl += __maskl;
2090 auto __xh2 = __xh >> 2;
2091 auto __xl2 = __xl >> 2;
2092 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2093 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2094 __maskh += __maskh;
2095 __maskl += __maskl;
2096 auto __xh1 = __xh >> 1;
2097 auto __xl1 = __xl >> 1;
2098 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2099 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2100 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2101 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2102 >> 8);
2103 return __x
2104 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2105 }
2106 else
2107 return __x >> __y;
2108 } //}}}
2109 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2110 {
2111 [[maybe_unused]] auto __blend_0xaa
2112 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2113 if constexpr (sizeof(__a) == 16)
2114 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2115 0xaa);
2116 else if constexpr (sizeof(__a) == 32)
2117 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2118 0xaa);
2119 else if constexpr (sizeof(__a) == 64)
2120 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2121 __to_intrin(__b));
2122 else
2123 __assert_unreachable<decltype(__a)>();
2124 };
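// 0xaa == 0b1010'1010: the blend takes every odd 16-bit lane from its
// second argument, interleaving the results computed for the even and
// odd lanes.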
2125 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2126 return __intrin_bitcast<_V>(is_signed_v<_Up>
2127 ? _mm_srav_epi16(__ix, __iy)
2128 : _mm_srlv_epi16(__ix, __iy));
2129 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2130 return __vector_bitcast<_Up>(is_signed_v<_Up>
2131 ? _mm256_srav_epi16(__ix, __iy)
2132 : _mm256_srlv_epi16(__ix, __iy));
2133 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2134 return __vector_bitcast<_Up>(is_signed_v<_Up>
2135 ? _mm512_srav_epi16(__ix, __iy)
2136 : _mm512_srlv_epi16(__ix, __iy));
2137 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2138 return __intrin_bitcast<_V>(
2139 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2140 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2141 >> 16,
2142 __vector_bitcast<int>(__ix)
2143 >> (__vector_bitcast<int>(__iy) >> 16)));
2144 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2145 return __intrin_bitcast<_V>(
2146 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2147 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2148 __vector_bitcast<_UInt>(__ix)
2149 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2150 else if constexpr (__have_sse4_1)
2151 {
2152 auto __mask = __vector_bitcast<_UShort>(__iy);
2153 auto __x128 = __vector_bitcast<_Up>(__ix);
2154 //__mask *= 0x0808;
2155 __mask = (__mask << 3) | (__mask << 11);
2156 // do __x128 = 0 where __y[4] is set
2157 __x128 = __vector_bitcast<_Up>(
2158 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2159 __to_intrin(__mask)));
2160 // do __x128 >>= 8 where __y[3] is set
2161 __x128 = __vector_bitcast<_Up>(
2162 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2163 __to_intrin(__mask += __mask)));
2164 // do __x128 >>= 4 where __y[2] is set
2165 __x128 = __vector_bitcast<_Up>(
2166 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2167 __to_intrin(__mask += __mask)));
2168 // do __x128 >>= 2 where __y[1] is set
2169 __x128 = __vector_bitcast<_Up>(
2170 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2171 __to_intrin(__mask += __mask)));
2172 // do __x128 >>= 1 where __y[0] is set
2173 return __intrin_bitcast<_V>(
2174 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2175 __to_intrin(__mask + __mask)));
2176 }
2177 else
2178 {
2179 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2180 auto __x128 = __vector_bitcast<_Up>(__ix);
2181 auto __mask
2182 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2183 return __vector_bitcast<short>(__kk) < 0;
2184 };
2185 // do __x128 = 0 where __y[4] is set
2186 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2187 // do __x128 >>= 8 where __y[3] is set
2188 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2189 // do __x128 >>= 4 where __y[2] is set
2190 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2191 // do __x128 >>= 2 where __y[1] is set
2192 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2193 // do __x128 >>= 1 where __y[0] is set
2194 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2195 : __x128);
2196 }
2197 } //}}}
2198 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2199 {
2200 if constexpr (is_unsigned_v<_Up>)
2201 {
2202 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
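// E.g. __x = 100, __y = 3: the factor is 2^(31-3) == 0x1000'0000, and
// (100 * 0x1000'0000) >> 31 == 100 >> 3 == 12.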
2203 const __m128 __factor_f = reinterpret_cast<__m128>(
2204 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2205 const __m128i __factor
2206 = __builtin_constant_p(__factor_f)
2207 ? __to_intrin(
2208 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2209 __factor_f[2], __factor_f[3]))
2210 : _mm_cvttps_epi32(__factor_f);
2211 const auto __r02
2212 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2213 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2214 _mm_srli_si128(__factor, 4));
2215 if constexpr (__have_sse4_1)
2216 return __intrin_bitcast<_V>(
2217 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2218 else
2219 return __intrin_bitcast<_V>(
2220 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2221 }
2222 else
2223 {
2224 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2225 if constexpr (is_signed_v<_Up>)
2226 return _mm_sra_epi32(__a, __b);
2227 else
2228 return _mm_srl_epi32(__a, __b);
2229 };
2230 const auto __r0
2231 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2232 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2233 const auto __r2
2234 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2235 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2236 if constexpr (__have_sse4_1)
2237 return __intrin_bitcast<_V>(
2238 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2239 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2240 else
2241 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2242 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2243 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2244 }
2245 } //}}}
2246 else
2247 return __x >> __y;
2248 }
2249#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2250
2251 // }}}
2252 // compares {{{
2253 // _S_equal_to {{{
2254 template <typename _Tp, size_t _Np>
2255 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2256 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2257 {
2258 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2259 {
2260 if (__builtin_is_constant_evaluated()
2261 || (__x._M_is_constprop() && __y._M_is_constprop()))
2262 return _MaskImpl::_S_to_bits(
2263 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2264
2265 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2266 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2267 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2268 if constexpr (is_floating_point_v<_Tp>)
2269 {
2270 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2271 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2272 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2273 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2274 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2275 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2276 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2277 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2278 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2279 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2280 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2281 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2282 else
2283 __assert_unreachable<_Tp>();
2284 }
2285 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2286 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2287 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2288 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2289 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2290 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2291 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2292 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2293 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2294 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2295 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2296 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2297 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2298 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2299 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2300 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2301 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2302 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2303 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2304 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2305 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2306 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2307 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2308 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2309 else
2310 __assert_unreachable<_Tp>();
2311 } // }}}
2312 else if (__builtin_is_constant_evaluated())
2313 return _Base::_S_equal_to(__x, __y);
2314 else if constexpr (sizeof(__x) == 8)
2315 {
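// 8-byte vectors are handled by comparing as a full 16-byte vector
// and copying out the low half; the upper half is unspecified padding.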
2316 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2317 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2318 _MaskMember<_Tp> __r64{};
2319 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2320 return __r64;
2321 }
2322 else
2323 return _Base::_S_equal_to(__x, __y);
2324 }
2325
2326 // }}}
2327 // _S_not_equal_to {{{
2328 template <typename _Tp, size_t _Np>
2329 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2330 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2331 {
2332 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2333 {
2334 if (__builtin_is_constant_evaluated()
2335 || (__x._M_is_constprop() && __y._M_is_constprop()))
2336 return _MaskImpl::_S_to_bits(
2337 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2338
2339 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2340 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2341 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2342 if constexpr (is_floating_point_v<_Tp>)
2343 {
2344 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2345 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2346 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2347 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2348 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2349 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2350 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2351 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2352 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2353 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2354 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2355 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2356 else
2357 __assert_unreachable<_Tp>();
2358 }
2359 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2360 return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2361 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2362 return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2363 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2364 return _mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2365 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2366 return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2367 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2368 return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2369 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2370 return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2371 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2372 return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2373 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2374 return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2375 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2376 return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2377 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2378 return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2379 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2380 return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2381 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2382 return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2383 else
2384 __assert_unreachable<_Tp>();
2385 } // }}}
2386 else if (__builtin_is_constant_evaluated())
2387 return _Base::_S_not_equal_to(__x, __y);
2388 else if constexpr (sizeof(__x) == 8)
2389 {
2390 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2391 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2392 _MaskMember<_Tp> __r64{};
2393 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2394 return __r64;
2395 }
2396 else
2397 return _Base::_S_not_equal_to(__x, __y);
2398 }
2399
2400 // }}}
2401 // _S_less {{{
2402 template <typename _Tp, size_t _Np>
2403 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2404 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2405 {
2406 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2407 {
2408 if (__builtin_is_constant_evaluated()
2409 || (__x._M_is_constprop() && __y._M_is_constprop()))
2410 return _MaskImpl::_S_to_bits(
2411 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2412
2413 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2414 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2415 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2416 if constexpr (sizeof(__xi) == 64)
2417 {
2418 if constexpr (__is_x86_ps<_Tp>())
2419 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2420 else if constexpr (__is_x86_pd<_Tp>())
2421 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2422 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2423 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2424 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2425 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2426 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2427 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2428 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2429 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2430 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2431 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2432 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2433 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2434 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2435 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2436 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2437 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2438 else
2439 __assert_unreachable<_Tp>();
2440 }
2441 else if constexpr (sizeof(__xi) == 32)
2442 {
2443 if constexpr (__is_x86_ps<_Tp>())
2444 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2445 else if constexpr (__is_x86_pd<_Tp>())
2446 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2447 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2448 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2449 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2450 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2451 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2452 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2453 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2454 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2455 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2456 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2457 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2458 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2459 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2460 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2461 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2462 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2463 else
2464 __assert_unreachable<_Tp>();
2465 }
2466 else if constexpr (sizeof(__xi) == 16)
2467 {
2468 if constexpr (__is_x86_ps<_Tp>())
2469 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2470 else if constexpr (__is_x86_pd<_Tp>())
2471 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2472 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2473 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2474 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2475 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2476 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2477 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2478 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2479 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2480 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2481 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2482 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2483 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2484 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2485 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2486 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2487 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2488 else
2489 __assert_unreachable<_Tp>();
2490 }
2491 else
2492 __assert_unreachable<_Tp>();
2493 } // }}}
2494 else if (__builtin_is_constant_evaluated())
2495 return _Base::_S_less(__x, __y);
2496 else if constexpr (sizeof(__x) == 8)
2497 {
2498 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2499 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2500 _MaskMember<_Tp> __r64{};
2501 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2502 return __r64;
2503 }
2504 else
2505 return _Base::_S_less(__x, __y);
2506 }
2507
2508 // }}}
2509 // _S_less_equal {{{
2510 template <typename _Tp, size_t _Np>
2511 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2512 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2513 {
2514 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2515 {
2516 if (__builtin_is_constant_evaluated()
2517 || (__x._M_is_constprop() && __y._M_is_constprop()))
2518 return _MaskImpl::_S_to_bits(
2519 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2520
2521 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2522 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2523 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2524 if constexpr (sizeof(__xi) == 64)
2525 {
2526 if constexpr (__is_x86_ps<_Tp>())
2527 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2528 else if constexpr (__is_x86_pd<_Tp>())
2529 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2530 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2531 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2532 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2533 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2534 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2535 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2536 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2537 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2538 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2539 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2540 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2541 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2542 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2543 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2544 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2545 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2546 else
2547 __assert_unreachable<_Tp>();
2548 }
2549 else if constexpr (sizeof(__xi) == 32)
2550 {
2551 if constexpr (__is_x86_ps<_Tp>())
2552 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2553 else if constexpr (__is_x86_pd<_Tp>())
2554 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2555 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2556 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2557 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2558 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2559 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2560 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2561 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2562 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2563 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2564 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2565 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2566 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2567 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2568 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2569 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2570 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2571 else
2572 __assert_unreachable<_Tp>();
2573 }
2574 else if constexpr (sizeof(__xi) == 16)
2575 {
2576 if constexpr (__is_x86_ps<_Tp>())
2577 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2578 else if constexpr (__is_x86_pd<_Tp>())
2579 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2580 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2581 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2582 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2583 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2584 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2585 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2586 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2587 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2588 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2589 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2590 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2591 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2592 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2593 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2594 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2595 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2596 else
2597 __assert_unreachable<_Tp>();
2598 }
2599 else
2600 __assert_unreachable<_Tp>();
2601 } // }}}
2602 else if (__builtin_is_constant_evaluated())
2603 return _Base::_S_less_equal(__x, __y);
2604 else if constexpr (sizeof(__x) == 8)
2605 {
2606 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2607 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2608 _MaskMember<_Tp> __r64{};
2609 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2610 return __r64;
2611 }
2612 else
2613 return _Base::_S_less_equal(__x, __y);
2614 }
2615
2616 // }}} }}}
2617 // negation {{{
2618 template <typename _Tp, size_t _Np>
2619 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2620 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2621 {
2622 if constexpr (__is_avx512_abi<_Abi>())
2623 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2624 else
2625 return _Base::_S_negate(__x);
2626 }
2627
2628 // }}}
2629 // math {{{
2630 using _Base::_S_abs;
2631
2632 // _S_sqrt {{{
2633 template <typename _Tp, size_t _Np>
2634 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2635 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2636 {
2637 if constexpr (__is_sse_ps<_Tp, _Np>())
2638 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2639 else if constexpr (__is_sse_pd<_Tp, _Np>())
2640 return _mm_sqrt_pd(__x);
2641 else if constexpr (__is_avx_ps<_Tp, _Np>())
2642 return _mm256_sqrt_ps(__x);
2643 else if constexpr (__is_avx_pd<_Tp, _Np>())
2644 return _mm256_sqrt_pd(__x);
2645 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2646 return _mm512_sqrt_ps(__x);
2647 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2648 return _mm512_sqrt_pd(__x);
2649 else
2650 __assert_unreachable<_Tp>();
2651 }
2652
2653 // }}}
2654 // _S_ldexp {{{
2655 template <typename _Tp, size_t _Np>
2656 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2657 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2658 __fixed_size_storage_t<int, _Np> __exp)
2659 {
2660 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2661 {
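// vscalefp[sd] computes __x * 2^__exp elementwise, i.e. ldexp in a
// single instruction; the maskz form zeros padding elements beyond
// _Np.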
2662 const auto __xi = __to_intrin(__x);
2663 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2664 __cvt;
2665 const auto __expi = __to_intrin(__cvt(__exp));
2666 using _Up = __bool_storage_member_type_t<_Np>;
2667 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
2668 if constexpr (sizeof(__xi) == 16)
2669 {
2670 if constexpr (sizeof(_Tp) == 8)
2671 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2672 else
2673 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2674 }
2675 else if constexpr (sizeof(__xi) == 32)
2676 {
2677 if constexpr (sizeof(_Tp) == 8)
2678 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2679 else
2680 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2681 }
2682 else
2683 {
2684 static_assert(sizeof(__xi) == 64);
2685 if constexpr (sizeof(_Tp) == 8)
2686 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2687 else
2688 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2689 }
2690 }
2691 else
2692 return _Base::_S_ldexp(__x, __exp);
2693 }
2694
2695 // }}}
2696 // _S_trunc {{{
2697 template <typename _Tp, size_t _Np>
2698 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2699 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2700 {
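// Rounding imm8: bits 1:0 select the mode (3 = toward zero), bit 3
// (_MM_FROUND_NO_EXC) suppresses the inexact exception, and for the
// AVX-512 roundscale variants bits 7:4 hold the scale (0 here); hence
// 0x0b truncates without raising FE_INEXACT.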
2701 if constexpr (__is_avx512_ps<_Tp, _Np>())
2702 return _mm512_roundscale_ps(__x, 0x0b);
2703 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2704 return _mm512_roundscale_pd(__x, 0x0b);
2705 else if constexpr (__is_avx_ps<_Tp, _Np>())
2706 return _mm256_round_ps(__x, 0xb);
2707 else if constexpr (__is_avx_pd<_Tp, _Np>())
2708 return _mm256_round_pd(__x, 0xb);
2709 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2710 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2711 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2712 return _mm_round_pd(__x, 0xb);
2713 else if constexpr (__is_sse_ps<_Tp, _Np>())
2714 {
2715 auto __truncated
2716 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2717 const auto __no_fractional_values
2718 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2719 & 0x7f800000u)
2720 < 0x4b000000; // the exponent is so large that no mantissa bits
2721 // signify fractional values, i.e. |__x| >= 2^23
2722 // (0x3f8 + 23*8 = 0x4b0)
2723 return __no_fractional_values ? __truncated : __to_intrin(__x);
2724 }
2725 else
2726 return _Base::_S_trunc(__x);
2727 }
2728
2729 // }}}
2730 // _S_round {{{
2731 template <typename _Tp, size_t _Np>
2732 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2733 _S_round(_SimdWrapper<_Tp, _Np> __x)
2734 {
2735 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2736 // from zero as required by std::round. Therefore this function is more
2737 // complicated.
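// E.g. std::round(2.5) == 3.0, whereas rounding to nearest-even yields
// 2.0. The code below therefore truncates and adds copysign(1, __x)
// whenever the discarded fraction is at least 0.5.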
2738 using _V = __vector_type_t<_Tp, _Np>;
2739 _V __truncated;
2740 if constexpr (__is_avx512_ps<_Tp, _Np>())
2741 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2742 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2743 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2744 else if constexpr (__is_avx_ps<_Tp, _Np>())
2745 __truncated = _mm256_round_ps(__x._M_data,
2746 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2747 else if constexpr (__is_avx_pd<_Tp, _Np>())
2748 __truncated = _mm256_round_pd(__x._M_data,
2749 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2750 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2751 __truncated = __auto_bitcast(
2752 _mm_round_ps(__to_intrin(__x),
2753 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2754 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2755 __truncated
2756 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2757 else if constexpr (__is_sse_ps<_Tp, _Np>())
2758 __truncated = __auto_bitcast(
2759 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2760 else
2761 return _Base::_S_round(__x);
2762
2763 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2764 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
2765
2766 const _V __rounded
2767 = __truncated
2768 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2769 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2770 : _V());
2771 if constexpr (__have_sse4_1)
2772 return __rounded;
2773 else // adjust for missing range in cvttps_epi32
2774 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2775 : __x._M_data;
2776 }
2777
2778 // }}}
2779 // _S_nearbyint {{{
2780 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2781 _GLIBCXX_SIMD_INTRINSIC static _Tp
2782 _S_nearbyint(_Tp __x) noexcept
2783 {
2784 if constexpr (_TVT::template _S_is<float, 16>)
2785 return _mm512_roundscale_ps(__x, 0x0c);
2786 else if constexpr (_TVT::template _S_is<double, 8>)
2787 return _mm512_roundscale_pd(__x, 0x0c);
2788 else if constexpr (_TVT::template _S_is<float, 8>)
2789 return _mm256_round_ps(__x,
2790 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2791 else if constexpr (_TVT::template _S_is<double, 4>)
2792 return _mm256_round_pd(__x,
2793 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2794 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2795 return _mm_round_ps(__x,
2796 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2797 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2798 return _mm_round_pd(__x,
2799 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2800 else
2801 return _Base::_S_nearbyint(__x);
2802 }
2803
2804 // }}}
2805 // _S_rint {{{
2806 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2807 _GLIBCXX_SIMD_INTRINSIC static _Tp
2808 _S_rint(_Tp __x) noexcept
2809 {
2810 if constexpr (_TVT::template _S_is<float, 16>)
2811 return _mm512_roundscale_ps(__x, 0x04);
2812 else if constexpr (_TVT::template _S_is<double, 8>)
2813 return _mm512_roundscale_pd(__x, 0x04);
2814 else if constexpr (_TVT::template _S_is<float, 8>)
2815 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2816 else if constexpr (_TVT::template _S_is<double, 4>)
2817 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2818 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2819 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2820 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2821 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2822 else
2823 return _Base::_S_rint(__x);
2824 }
2825
2826 // }}}
2827 // _S_floor {{{
2828 template <typename _Tp, size_t _Np>
2829 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2830 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2831 {
2832 if constexpr (__is_avx512_ps<_Tp, _Np>())
2833 return _mm512_roundscale_ps(__x, 0x09);
2834 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2835 return _mm512_roundscale_pd(__x, 0x09);
2836 else if constexpr (__is_avx_ps<_Tp, _Np>())
2837 return _mm256_round_ps(__x, 0x9);
2838 else if constexpr (__is_avx_pd<_Tp, _Np>())
2839 return _mm256_round_pd(__x, 0x9);
2840 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2841 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2842 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2843 return _mm_round_pd(__x, 0x9);
2844 else
2845 return _Base::_S_floor(__x);
2846 }
2847
2848 // }}}
2849 // _S_ceil {{{
2850 template <typename _Tp, size_t _Np>
2851 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2852 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2853 {
2854 if constexpr (__is_avx512_ps<_Tp, _Np>())
2855 return _mm512_roundscale_ps(__x, 0x0a);
2856 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2857 return _mm512_roundscale_pd(__x, 0x0a);
2858 else if constexpr (__is_avx_ps<_Tp, _Np>())
2859 return _mm256_round_ps(__x, 0xa);
2860 else if constexpr (__is_avx_pd<_Tp, _Np>())
2861 return _mm256_round_pd(__x, 0xa);
2862 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2863 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2864 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2865 return _mm_round_pd(__x, 0xa);
2866 else
2867 return _Base::_S_ceil(__x);
2868 }
2869
2870 // }}}
2871 // _S_signbit {{{
2872 template <typename _Tp, size_t _Np>
2873 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2874 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2875 {
2876 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2877 {
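// vpmovd2m/vpmovq2m copy each element's most significant bit, i.e.
// the IEEE sign bit, straight into a mask register.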
2878 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2879 return _mm512_movepi32_mask(
2880 __intrin_bitcast<__m512i>(__x._M_data));
2881 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2882 return _mm512_movepi64_mask(
2883 __intrin_bitcast<__m512i>(__x._M_data));
2884 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2885 return _mm256_movepi32_mask(
2886 __intrin_bitcast<__m256i>(__x._M_data));
2887 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2888 return _mm256_movepi64_mask(
2889 __intrin_bitcast<__m256i>(__x._M_data));
2890 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2891 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2892 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2893 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2894 }
2895 else if constexpr (__is_avx512_abi<_Abi>())
2896 {
2897 const auto __xi = __to_intrin(__x);
2898 [[maybe_unused]] constexpr auto __k1
2899 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2900 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2901 return _mm_movemask_ps(__xi);
2902 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2903 return _mm_movemask_pd(__xi);
2904 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2905 return _mm256_movemask_ps(__xi);
2906 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2907 return _mm256_movemask_pd(__xi);
2908 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2909 return _mm512_mask_cmplt_epi32_mask(
2910 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2911 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2912 return _mm512_mask_cmplt_epi64_mask(
2913 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2914 else
2915 __assert_unreachable<_Tp>();
2916 }
2917 else
2918 return _Base::_S_signbit(__x);
2919 /*{
2920 using _I = __int_for_sizeof_t<_Tp>;
2921 if constexpr (sizeof(__x) == 64)
2922 return _S_less(__vector_bitcast<_I>(__x), _I());
2923 else
2924 {
2925 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2926 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2927 if constexpr ((sizeof(_Tp) == 4 &&
2928 (__have_avx2 || sizeof(__x) == 16)) ||
2929 __have_avx512vl)
2930 {
2931 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2932 }
2933 else if constexpr ((__have_avx2 ||
2934 (__have_ssse3 && sizeof(__x) == 16)))
2935 {
2936 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2937 __signmask);
2938 }
2939 else
2940 { // SSE2/3 or AVX (w/o AVX2)
2941 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2942 return __vector_bitcast<_Tp>(
2943 __vector_bitcast<_Tp>(
2944 (__xx & __signmask) |
2945 __vector_bitcast<_I>(__one)) // -1 or 1
2946 != __one);
2947 }
2948 }
2949 }*/
2950 }
2951
2952 // }}}
2953 // _S_isnonzerovalue_mask {{{
2954 // (isnormal | issubnormal) == (!isinf & !isnan & !iszero)
2955 template <typename _Tp>
2956 _GLIBCXX_SIMD_INTRINSIC static auto
2957 _S_isnonzerovalue_mask(_Tp __x)
2958 {
2959 using _Traits = _VectorTraits<_Tp>;
2960 if constexpr (__have_avx512dq_vl)
2961 {
2962 if constexpr (_Traits::template _S_is<float, 2>
2963 || _Traits::template _S_is<float, 4>)
2964 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2965 else if constexpr (_Traits::template _S_is<float, 8>)
2966 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2967 else if constexpr (_Traits::template _S_is<float, 16>)
2968 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2969 else if constexpr (_Traits::template _S_is<double, 2>)
2970 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2971 else if constexpr (_Traits::template _S_is<double, 4>)
2972 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2973 else if constexpr (_Traits::template _S_is<double, 8>)
2974 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2975 else
2976 __assert_unreachable<_Tp>();
2977 }
2978 else
2979 {
2980 using _Up = typename _Traits::value_type;
2981 constexpr size_t _Np = _Traits::_S_full_size;
2982 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2983 const auto __b = __x * _Up(); // NaN if __x == inf
2984 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2985 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2986 _CMP_ORD_Q);
2987 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2988 return __mmask8(0xf
2989 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2990 __auto_bitcast(__b),
2991 _CMP_ORD_Q));
2992 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2993 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2994 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
2995 return __mmask8(0x3
2996 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
2997 __auto_bitcast(__b),
2998 _CMP_ORD_Q));
2999 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
3000 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3001 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
3002 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
3003 __auto_bitcast(__b),
3004 _CMP_ORD_Q));
3005 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
3006 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3007 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
3008 return __mmask8(0xf
3009 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3010 __auto_bitcast(__b),
3011 _CMP_ORD_Q));
3012 else if constexpr (__is_avx512_ps<_Up, _Np>())
3013 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3014 else if constexpr (__is_avx512_pd<_Up, _Np>())
3015 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3016 else
3017 __assert_unreachable<_Tp>();
3018 }
3019 }
3020
3021 // }}}
3022 // _S_isfinite {{{
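// The vfpclass immediate used below, 0x99, is QNaN (0x01) | +inf (0x08)
// | -inf (0x10) | SNaN (0x80); XOR against the implicit mask __k1 inverts
// the classification, leaving the finite elements. Illustrative example
// (assumed values, for exposition only): for {1.f, inf, NaN, -2.f} the
// fpclass mask is 0b0110, __k1 is 0b1111, and the result 0b1001 marks the
// two finite lanes.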
3023 template <typename _Tp, size_t _Np>
3024 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3025 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
3026 {
3027 static_assert(is_floating_point_v<_Tp>);
3028#if !__FINITE_MATH_ONLY__
3029 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3030 {
3031 const auto __xi = __to_intrin(__x);
3032 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3033 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3034 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3035 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3036 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3037 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3038 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3039 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3040 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3041 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3042 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3043 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3044 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3045 }
3046 else if constexpr (__is_avx512_abi<_Abi>())
3047 {
3048 // if all exponent bits are set, __x is either inf or NaN
3049 using _I = __int_for_sizeof_t<_Tp>;
3050 const auto __inf = __vector_bitcast<_I>(
3051 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3052 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3053 }
3054 else
3055#endif
3056 return _Base::_S_isfinite(__x);
3057 }
3058
3059 // }}}
3060 // _S_isinf {{{
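// vfpclass immediate 0x18 = +inf (0x08) | -inf (0x10): the returned mask is
// set exactly for the infinite elements, e.g. {1.f, -inf, inf, 0.f} yields
// 0b0110 (illustrative values only).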
3061 template <typename _Tp, size_t _Np>
3062 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3063 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3064 {
3065#if !__FINITE_MATH_ONLY__
3066 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3067 {
3068 const auto __xi = __to_intrin(__x);
3069 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3070 return _mm512_fpclass_ps_mask(__xi, 0x18);
3071 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3072 return _mm512_fpclass_pd_mask(__xi, 0x18);
3073 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3074 return _mm256_fpclass_ps_mask(__xi, 0x18);
3075 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3076 return _mm256_fpclass_pd_mask(__xi, 0x18);
3077 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3078 return _mm_fpclass_ps_mask(__xi, 0x18);
3079 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3080 return _mm_fpclass_pd_mask(__xi, 0x18);
3081 else
3082 __assert_unreachable<_Tp>();
3083 }
3084 else if constexpr (__have_avx512dq_vl)
3085 {
3086 if constexpr (__is_sse_pd<_Tp, _Np>())
3087 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3088 else if constexpr (__is_avx_pd<_Tp, _Np>())
3089 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3090 else if constexpr (__is_sse_ps<_Tp, _Np>())
3091 return _mm_movm_epi32(
3092 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3093 else if constexpr (__is_avx_ps<_Tp, _Np>())
3094 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3095 else
3096 __assert_unreachable<_Tp>();
3097 }
3098 else
3099#endif
3100 return _Base::_S_isinf(__x);
3101 }
3102
3103 // }}}
3104 // _S_isnormal {{{
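// The vfpclass immediates below classify the complement of "normal": with
// -ffinite-math-only, 0x26 = +0 (0x02) | -0 (0x04) | denormal (0x20)
// suffices, since inf/NaN are assumed absent; otherwise 0xbf selects every
// class except "negative finite" (0x40), i.e. QNaN, SNaN, +/-0, +/-inf, and
// denormal. Negating that class mask yields isnormal.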
3105 template <typename _Tp, size_t _Np>
3106 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3107 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3108 {
3109#if __FINITE_MATH_ONLY__
3110 [[maybe_unused]] constexpr int __mode = 0x26;
3111#else
3112 [[maybe_unused]] constexpr int __mode = 0xbf;
3113#endif
3114 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3115 {
3116 const auto __xi = __to_intrin(__x);
3117 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3118 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3119 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3120 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3121 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3122 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3123 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3124 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3125 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3126 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3127 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3128 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3129 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3130 else
3131 __assert_unreachable<_Tp>();
3132 }
3133 else if constexpr (__have_avx512dq)
3134 {
3135 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3136 return _mm_movm_epi32(
3137 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3138 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3139 return _mm256_movm_epi32(
3140 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3141 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3142 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3143 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3144 return _mm_movm_epi64(
3145 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3146 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3147 return _mm256_movm_epi64(
3148 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3149 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3150 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3151 else
3152 __assert_unreachable<_Tp>();
3153 }
3154 else if constexpr (__is_avx512_abi<_Abi>())
3155 {
3156 using _I = __int_for_sizeof_t<_Tp>;
3157 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3158 const auto minn = __vector_bitcast<_I>(
3159 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3160#if __FINITE_MATH_ONLY__
3161 return _S_less_equal<_I, _Np>(minn, absn);
3162#else
3163 const auto infn
3164 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3165 return __and(_S_less_equal<_I, _Np>(minn, absn),
3166 _S_less<_I, _Np>(absn, infn));
3167#endif
3168 }
3169 else
3170 return _Base::_S_isnormal(__x);
3171 }
3172
3173 // }}}
3174 // _S_isnan {{{
3175 template <typename _Tp, size_t _Np>
3176 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3177 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3178 { return _S_isunordered(__x, __x); }
3179
3180 // }}}
3181 // _S_isunordered {{{
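// A comparison is unordered iff at least one operand is NaN; this is why
// _S_isnan above is simply _S_isunordered(__x, __x), NaN being the only
// value that compares unordered with itself.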
3182 template <typename _Tp, size_t _Np>
3183 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3184 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3185 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3186 {
3187#if __FINITE_MATH_ONLY__
3188 return {}; // false
3189#else
3190 const auto __xi = __to_intrin(__x);
3191 const auto __yi = __to_intrin(__y);
3192 if constexpr (__is_avx512_abi<_Abi>())
3193 {
3194 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3195 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3196 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3197 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3198 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3199 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3200 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3201 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3202 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3203 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3204 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3205 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3206 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3207 }
3208 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3209 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3210 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3211 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3212 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3213 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3214 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3215 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3216 else
3217 __assert_unreachable<_Tp>();
3218#endif
3219 }
3220
3221 // }}}
3222 // _S_isgreater {{{
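// The SSE2 fallback below compares float bit patterns as integers: IEEE 754
// floats are sign-magnitude, so non-negative values already order correctly
// as signed ints, and negative values are remapped to two's complement via
// -(bits & 0x7fff'ffff). NaN lanes are then discarded with cmpord. A worked
// example (illustrative only):
//   bits(-1.f) == 0xbf80'0000  ->  remapped to -0x3f80'0000
//   bits(+1.f) == 0x3f80'0000  ->  kept as      0x3f80'0000
//   -0x3f80'0000 < 0x3f80'0000, matching -1.f < 1.f.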
3223 template <typename _Tp, size_t _Np>
3224 static constexpr _MaskMember<_Tp>
3225 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3226 {
3227 const auto __xi = __to_intrin(__x);
3228 const auto __yi = __to_intrin(__y);
3229 if constexpr (__is_avx512_abi<_Abi>())
3230 {
3231 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3232 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3233 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3234 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3235 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3236 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3237 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3238 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3239 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3240 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3241 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3242 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3243 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3244 else
3245 __assert_unreachable<_Tp>();
3246 }
3247 else if constexpr (__have_avx)
3248 {
3249 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3250 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3251 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3252 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3253 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3254 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3255 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3256 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3257 else
3258 __assert_unreachable<_Tp>();
3259 }
3260 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3261 && sizeof(_Tp) == 4)
3262 {
3263 const auto __xn = __vector_bitcast<int>(__xi);
3264 const auto __yn = __vector_bitcast<int>(__yi);
3265 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3266 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3267 return __auto_bitcast(
3268 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3269 }
3270 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3271 && sizeof(_Tp) == 8)
3272 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3273 -_mm_ucomigt_sd(__xi, __yi),
3274 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3275 _mm_unpackhi_pd(__yi, __yi))};
3276 else
3277 return _Base::_S_isgreater(__x, __y);
3278 }
3279
3280 // }}}
3281 // _S_isgreaterequal {{{
3282 template <typename _Tp, size_t _Np>
3283 static constexpr _MaskMember<_Tp>
3284 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3285 {
3286 const auto __xi = __to_intrin(__x);
3287 const auto __yi = __to_intrin(__y);
3288 if constexpr (__is_avx512_abi<_Abi>())
3289 {
3290 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3291 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3292 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3293 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3294 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3295 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3296 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3297 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3298 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3299 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3300 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3301 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3302 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3303 else
3304 __assert_unreachable<_Tp>();
3305 }
3306 else if constexpr (__have_avx)
3307 {
3308 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3309 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3310 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3311 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3312 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3313 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3314 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3315 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3316 else
3317 __assert_unreachable<_Tp>();
3318 }
3319 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3320 && sizeof(_Tp) == 4)
3321 {
3322 const auto __xn = __vector_bitcast<int>(__xi);
3323 const auto __yn = __vector_bitcast<int>(__yi);
3324 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3325 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3326 return __auto_bitcast(
3327 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3328 }
3329 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3330 && sizeof(_Tp) == 8)
3331 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3332 -_mm_ucomige_sd(__xi, __yi),
3333 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3334 _mm_unpackhi_pd(__yi, __yi))};
3335 else
3336 return _Base::_S_isgreaterequal(__x, __y);
3337 }
3338
3339 // }}}
3340 // _S_isless {{{
3341 template <typename _Tp, size_t _Np>
3342 static constexpr _MaskMember<_Tp>
3343 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3344 {
3345 const auto __xi = __to_intrin(__x);
3346 const auto __yi = __to_intrin(__y);
3347 if constexpr (__is_avx512_abi<_Abi>())
3348 {
3349 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3350 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3351 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3352 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3353 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3354 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3355 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3356 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3357 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3358 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3359 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3360 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3361 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3362 else
3363 __assert_unreachable<_Tp>();
3364 }
3365 else if constexpr (__have_avx)
3366 {
3367 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3368 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3369 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3370 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3371 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3372 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3373 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3374 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3375 else
3376 __assert_unreachable<_Tp>();
3377 }
3378 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3379 && sizeof(_Tp) == 4)
3380 {
3381 const auto __xn = __vector_bitcast<int>(__xi);
3382 const auto __yn = __vector_bitcast<int>(__yi);
3383 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3384 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3385 return __auto_bitcast(
3386 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3387 }
3388 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3389 && sizeof(_Tp) == 8)
3390 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3391 -_mm_ucomigt_sd(__yi, __xi),
3392 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3393 _mm_unpackhi_pd(__xi, __xi))};
3394 else
3395 return _Base::_S_isless(__x, __y);
3396 }
3397
3398 // }}}
3399 // _S_islessequal {{{
3400 template <typename _Tp, size_t _Np>
3401 static constexpr _MaskMember<_Tp>
3402 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3403 {
3404 const auto __xi = __to_intrin(__x);
3405 const auto __yi = __to_intrin(__y);
3406 if constexpr (__is_avx512_abi<_Abi>())
3407 {
3408 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3409 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3410 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3411 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3412 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3413 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3414 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3415 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3416 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3417 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3418 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3419 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3420 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3421 else
3422 __assert_unreachable<_Tp>();
3423 }
3424 else if constexpr (__have_avx)
3425 {
3426 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3427 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3428 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3429 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3430 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3431 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3432 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3433 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3434 else
3435 __assert_unreachable<_Tp>();
3436 }
3437 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3438 && sizeof(_Tp) == 4)
3439 {
3440 const auto __xn = __vector_bitcast<int>(__xi);
3441 const auto __yn = __vector_bitcast<int>(__yi);
3442 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3443 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3444 return __auto_bitcast(
3445 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3446 }
3447 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3448 && sizeof(_Tp) == 8)
3449 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3450 -_mm_ucomige_sd(__yi, __xi),
3451 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3452 _mm_unpackhi_pd(__xi, __xi))};
3453 else
3454 return _Base::_S_islessequal(__x, __y);
3455 }
3456
3457 // }}}
3458 // _S_islessgreater {{{
3459 template <typename _Tp, size_t _Np>
3460 static constexpr _MaskMember<_Tp>
3461 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3462 {
3463 const auto __xi = __to_intrin(__x);
3464 const auto __yi = __to_intrin(__y);
3465 if constexpr (__is_avx512_abi<_Abi>())
3466 {
3467 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3468 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3469 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3470 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3471 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3472 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3473 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3474 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3475 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3476 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3477 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3478 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3479 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3480 else
3481 __assert_unreachable<_Tp>();
3482 }
3483 else if constexpr (__have_avx)
3484 {
3485 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3486 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3487 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3488 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3489 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3490 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3491 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3492 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3493 else
3494 __assert_unreachable<_Tp>();
3495 }
3496 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3497 return __auto_bitcast(
3498 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3499 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3500 return __to_masktype(
3501 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3502 else
3503 __assert_unreachable<_Tp>();
3504 }
3505
3506 //}}} }}}
3507 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np>
3508 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
3509 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v)
3510 {
3511 if (__k._M_is_constprop_none_of())
3512 return __v;
3513 else if (__k._M_is_constprop_all_of())
3514 {
3515 auto __vv = _Base::_M_make_simd(__v);
3516 _Op<decltype(__vv)> __op;
3517 return __data(__op(__vv));
3518 }
3519 else if constexpr (__is_bitmask_v<decltype(__k)>
3520 && (is_same_v<_Op<void>, __increment<void>>
3521 || is_same_v<_Op<void>, __decrement<void>>))
3522 {
3523 // optimize masked unary increment and decrement as masked sub +/-1
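// (increment uses __pm_one == -1, decrement +1, so the masked vpsub/vsubp
// below computes __v - __pm_one in the active lanes while the inactive
// lanes pass through __v unchanged via the merge operand)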
3524 constexpr int __pm_one
3525 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1;
3526#ifdef _GLIBCXX_CLANG
3527 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data;
3528#else // _GLIBCXX_CLANG
3529 using _TV = __vector_type_t<_Tp, _Np>;
3530 constexpr size_t __bytes = sizeof(__v) < 16 ? 16 : sizeof(__v);
3531 constexpr size_t __width = __bytes / sizeof(_Tp);
3532 if constexpr (is_integral_v<_Tp>)
3533 {
3534 constexpr bool __lp64 = sizeof(long) == sizeof(long long);
3535 using _Ip = std::make_signed_t<_Tp>;
3536 using _Up = std::conditional_t<
3537 std::is_same_v<_Ip, long>,
3538 std::conditional_t<__lp64, long long, int>,
3539 std::conditional_t<
3540 std::is_same_v<_Ip, signed char>, char, _Ip>>;
3541 const auto __value = __intrin_bitcast<__vector_type_t<_Up, __width>>(__v._M_data);
3542#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3543 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3544 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask(__value, \
3545 __vector_broadcast<__width>(_Up(__pm_one)), __value, __k._M_data))
3546 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512);
3547 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256);
3548 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128);
3549 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512);
3550 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256);
3551 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128);
3552 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512);
3553 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256);
3554 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128);
3555 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512);
3556 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256);
3557 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128);
3558#undef _GLIBCXX_SIMD_MASK_SUB
3559 }
3560 else
3561 {
3562 const auto __value = __intrin_bitcast<__vector_type_t<_Tp, __width>>(__v._M_data);
3563#define _GLIBCXX_SIMD_MASK_SUB_512(_Sizeof, _Width, _Instr) \
3564 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3565 return __builtin_ia32_##_Instr##_mask( \
3566 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \
3567 __k._M_data, _MM_FROUND_CUR_DIRECTION)
3568#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3569 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3570 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask( \
3571 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \
3572 __k._M_data))
3573 _GLIBCXX_SIMD_MASK_SUB_512(4, 64, subps512);
3574 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256);
3575 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128);
3576 _GLIBCXX_SIMD_MASK_SUB_512(8, 64, subpd512);
3577 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256);
3578 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128);
3579#undef _GLIBCXX_SIMD_MASK_SUB_512
3580#undef _GLIBCXX_SIMD_MASK_SUB
3581 }
3582#endif // _GLIBCXX_CLANG
3583 }
3584 else
3585 return _Base::template _S_masked_unary<_Op>(__k, __v);
3586 }
3587 };
3588
3589// }}}
3590// _MaskImplX86Mixin {{{
3591struct _MaskImplX86Mixin
3592{
3593 template <typename _Tp>
3594 using _TypeTag = _Tp*;
3595
3596 using _Base = _MaskImplBuiltinMixin;
3597
3598 // _S_to_maskvector(bool) {{{
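// Turns a single bool into a one-element vector mask: ~_Up() is the
// all-bits-set element value, so (illustratively, for _Up = int, _ToN = 4)
// true maps to {-1, 0, 0, 0} and false to {0, 0, 0, 0}; only element 0 is
// meaningful here.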
3599 template <typename _Up, size_t _ToN = 1, typename _Tp>
3600 _GLIBCXX_SIMD_INTRINSIC static constexpr
3601 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3602 _S_to_maskvector(_Tp __x)
3603 {
3604 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3605 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3606 : __vector_type_t<_Up, _ToN>();
3607 }
3608
3609 // }}}
3610 // _S_to_maskvector(_SanitizedBitMask) {{{
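// Expands a bitmask into a vector mask (-1/0 per element). The preferred
// paths use vpmovm2{b,w,d,q}; without AVX512 the bits are broadcast and
// tested per lane. The small-N byte fallback spreads one bit per byte with
// a multiply; a worked example (illustrative), for __x == 0b1011:
//   0b1011 * 0x00204081 == 0x0162c58b
//   & 0x01010101        == 0x01000101   (bit i of __x lands in byte i)
//   * 0xff              == 0xff00ffff   ->  bytes {ff, ff, 00, ff}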
3611 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3612 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3613 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3614 {
3615 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3616 using _UV = __vector_type_t<_Up, _ToN>;
3617 using _UI = __intrinsic_type_t<_Up, _ToN>;
3618 [[maybe_unused]] const auto __k = __x._M_to_bits();
3619 if constexpr (_Np == 1)
3620 return _S_to_maskvector<_Up, _ToN>(__k);
3621 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3622 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3623 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3624 else if constexpr (sizeof(_Up) == 1)
3625 {
3626 if constexpr (sizeof(_UI) == 16)
3627 {
3628 if constexpr (__have_avx512bw_vl)
3629 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3630 else if constexpr (__have_avx512bw)
3631 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3632 else if constexpr (__have_avx512f)
3633 {
3634 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3635 auto __as16bits
3636 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3637 __hi256(__as32bits)));
3638 return __intrin_bitcast<_UV>(
3639 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3640 }
3641 else if constexpr (__have_ssse3)
3642 {
3643 const auto __bitmask = __to_intrin(
3644 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3645 8, 16, 32, 64, 128));
3646 return __intrin_bitcast<_UV>(
3647 __vector_bitcast<_Up>(
3648 _mm_shuffle_epi8(__to_intrin(
3649 __vector_type_t<_ULLong, 2>{__k}),
3650 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3651 1, 1, 1, 1, 1, 1, 1))
3652 & __bitmask)
3653 != 0);
3654 }
3655 // else fall through
3656 }
3657 else if constexpr (sizeof(_UI) == 32)
3658 {
3659 if constexpr (__have_avx512bw_vl)
3660 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3661 else if constexpr (__have_avx512bw)
3662 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3663 else if constexpr (__have_avx512f)
3664 {
3665 auto __as16bits = // 0 16 1 17 ... 15 31
3666 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3667 16)
3668 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3669 ~__m512i()),
3670 16);
3671 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3672 __lo256(__as16bits),
3673 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3674 );
3675 // deinterleave:
3676 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3677 __0_16_1_17, // 0 16 1 17 2 ...
3678 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3679 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3680 3, 5, 7, 9, 11, 13,
3681 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3682 // 0-3 8-11 16-19 24-27
3683 // 4-7 12-15 20-23 28-31
3684 }
3685 else if constexpr (__have_avx2)
3686 {
3687 const auto __bitmask
3688 = _mm256_broadcastsi128_si256(__to_intrin(
3689 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3690 4, 8, 16, 32, 64, 128)));
3691 return __vector_bitcast<_Up>(
3692 __vector_bitcast<_Up>(
3693 _mm256_shuffle_epi8(
3694 _mm256_broadcastsi128_si256(
3695 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3696 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3697 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3698 3, 3, 3, 3, 3, 3))
3699 & __bitmask)
3700 != 0);
3701 }
3702 // else fall through
3703 }
3704 else if constexpr (sizeof(_UI) == 64)
3705 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3706 if constexpr (std::min(_ToN, _Np) <= 4)
3707 {
3708 if constexpr (_Np > 7) // avoid overflow
3709 __x &= _SanitizedBitMask<_Np>(0x0f);
3710 const _UInt __char_mask
3711 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3712 * 0xff;
3713 _UV __r = {};
3714 __builtin_memcpy(&__r, &__char_mask,
3715 std::min(sizeof(__r), sizeof(__char_mask)));
3716 return __r;
3717 }
3718 else if constexpr (std::min(_ToN, _Np) <= 7)
3719 {
3720 if constexpr (_Np > 7) // avoid overflow
3721 __x &= _SanitizedBitMask<_Np>(0x7f);
3722 const _ULLong __char_mask
3723 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3724 * 0xff;
3725 _UV __r = {};
3726 __builtin_memcpy(&__r, &__char_mask,
3727 std::min(sizeof(__r), sizeof(__char_mask)));
3728 return __r;
3729 }
3730 }
3731 else if constexpr (sizeof(_Up) == 2)
3732 {
3733 if constexpr (sizeof(_UI) == 16)
3734 {
3735 if constexpr (__have_avx512bw_vl)
3736 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3737 else if constexpr (__have_avx512bw)
3738 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3739 else if constexpr (__have_avx512f)
3740 {
3741 __m256i __as32bits = {};
3742 if constexpr (__have_avx512vl)
3743 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3744 else
3745 __as32bits
3746 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3747 return __intrin_bitcast<_UV>(
3748 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3749 }
3750 // else fall through
3751 }
3752 else if constexpr (sizeof(_UI) == 32)
3753 {
3754 if constexpr (__have_avx512bw_vl)
3755 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3756 else if constexpr (__have_avx512bw)
3757 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3758 else if constexpr (__have_avx512f)
3759 {
3760 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3761 return __vector_bitcast<_Up>(
3762 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3763 __hi256(__as32bits))));
3764 }
3765 // else fall through
3766 }
3767 else if constexpr (sizeof(_UI) == 64)
3768 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3769 }
3770 else if constexpr (sizeof(_Up) == 4)
3771 {
3772 if constexpr (sizeof(_UI) == 16)
3773 {
3774 if constexpr (__have_avx512dq_vl)
3775 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3776 else if constexpr (__have_avx512dq)
3777 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3778 else if constexpr (__have_avx512vl)
3779 return __intrin_bitcast<_UV>(
3780 _mm_maskz_mov_epi32(__k, ~__m128i()));
3781 else if constexpr (__have_avx512f)
3782 return __intrin_bitcast<_UV>(
3783 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3784 // else fall through
3785 }
3786 else if constexpr (sizeof(_UI) == 32)
3787 {
3788 if constexpr (__have_avx512dq_vl)
3789 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3790 else if constexpr (__have_avx512dq)
3791 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3792 else if constexpr (__have_avx512vl)
3793 return __vector_bitcast<_Up>(
3794 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3795 else if constexpr (__have_avx512f)
3796 return __vector_bitcast<_Up>(
3797 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3798 // else fall through
3799 }
3800 else if constexpr (sizeof(_UI) == 64)
3801 return __vector_bitcast<_Up>(
3802 __have_avx512dq ? _mm512_movm_epi32(__k)
3803 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3804 }
3805 else if constexpr (sizeof(_Up) == 8)
3806 {
3807 if constexpr (sizeof(_UI) == 16)
3808 {
3809 if constexpr (__have_avx512dq_vl)
3810 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3811 else if constexpr (__have_avx512dq)
3812 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3813 else if constexpr (__have_avx512vl)
3814 return __vector_bitcast<_Up>(
3815 _mm_maskz_mov_epi64(__k, ~__m128i()));
3816 else if constexpr (__have_avx512f)
3817 return __vector_bitcast<_Up>(
3818 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3819 // else fall through
3820 }
3821 else if constexpr (sizeof(_UI) == 32)
3822 {
3823 if constexpr (__have_avx512dq_vl)
3824 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3825 else if constexpr (__have_avx512dq)
3826 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3827 else if constexpr (__have_avx512vl)
3828 return __vector_bitcast<_Up>(
3829 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3830 else if constexpr (__have_avx512f)
3831 return __vector_bitcast<_Up>(
3832 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3833 // else fall through
3834 }
3835 else if constexpr (sizeof(_UI) == 64)
3836 return __vector_bitcast<_Up>(
3837 __have_avx512dq ? _mm512_movm_epi64(__k)
3838 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3839 }
3840
3841 using _UpUInt = make_unsigned_t<_Up>;
3842 using _V = __vector_type_t<_UpUInt, _ToN>;
3843 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3844 if constexpr (_ToN == 2)
3845 {
3846 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3847 }
3848 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3849 {
3850 if constexpr (sizeof(_Up) == 4)
3851 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3852 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3853 _mm256_castsi256_ps(_mm256_setr_epi32(
3854 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3855 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3856 else if constexpr (sizeof(_Up) == 8)
3857 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3858 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3859 _mm256_castsi256_pd(
3860 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3861 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3862 else
3863 __assert_unreachable<_Up>();
3864 }
3865 else if constexpr (__bits_per_element >= _ToN)
3866 {
3867 constexpr auto __bitmask
3868 = __generate_vector<_V>([](auto __i)
3869 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3870 { return __i < _ToN ? 1ull << __i : 0; });
3871 const auto __bits
3872 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3873 if constexpr (__bits_per_element > _ToN)
3874 return __vector_bitcast<_Up>(__bits) > 0;
3875 else
3876 return __vector_bitcast<_Up>(__bits != 0);
3877 }
3878 else
3879 {
3880 const _V __tmp
3881 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3882 return static_cast<_UpUInt>(
3883 __k >> (__bits_per_element * (__i / __bits_per_element)));
3884 })
3885 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3886 return static_cast<_UpUInt>(1ull
3887 << (__i % __bits_per_element));
3888 }); // mask bit index
3889 return __intrin_bitcast<_UV>(__tmp != _V());
3890 }
3891 }
3892
3893 // }}}
3894 // _S_to_maskvector(_SimdWrapper) {{{
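// Converts between vector masks of different element widths. Because every
// mask element is all-ones or all-zero, narrowing can use the saturating
// pack instructions (0xffff packs to 0xff, 0 stays 0) and widening can
// interleave an element with itself (0xff duplicated gives 0xffff), which
// is what the SSE/AVX branches below are built from.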
3895 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3896 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3897 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3898 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3899 {
3900 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3901 using _TW = _SimdWrapper<_Tp, _Np>;
3902 using _UW = _SimdWrapper<_Up, _ToN>;
3903 using _UI = __intrinsic_type_t<_Up, _ToN>;
3904 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3905 return _S_to_maskvector<_Up, _ToN>(
3906 _BitMask<_Np>(__x._M_data)._M_sanitized());
3907 // vector -> vector bitcast
3908 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3909 && sizeof(_TW) == sizeof(_UW))
3910 return __wrapper_bitcast<_Up, _ToN>(
3911 _ToN <= _Np
3912 ? __x
3913 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3914 else // vector -> vector {{{
3915 {
3916 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3917 {
3918 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3919 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3920 __vector_type_t<_Up, _ToN>>(
3921 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3922 }
3923 using _To = __vector_type_t<_Up, _ToN>;
3924 [[maybe_unused]] constexpr size_t _FromN = _Np;
3925 constexpr int _FromBytes = sizeof(_Tp);
3926 constexpr int _ToBytes = sizeof(_Up);
3927 const auto __k = __x._M_data;
3928
3929 if constexpr (_FromBytes == _ToBytes)
3930 return __intrin_bitcast<_To>(__k);
3931 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3932 { // SSE -> SSE {{{
3933 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3934 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3935 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3936 {
3937 const auto __y
3938 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3939 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3940 }
3941 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3942 {
3943 auto __y
3944 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3945 auto __z
3946 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3947 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3948 }
3949 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3950 && __have_sse2)
3951 return __intrin_bitcast<_To>(
3952 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3953 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3954 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3955 _UI());
3956 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3957 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3958 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3959 {
3960 const auto __y
3961 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3962 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3963 }
3964 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3965 {
3966 if constexpr (__have_sse2 && !__have_ssse3)
3967 return __intrin_bitcast<_To>(_mm_packs_epi32(
3968 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3969 __m128i()));
3970 else
3971 return __intrin_bitcast<_To>(
3972 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3973 __vector_bitcast<_Up>(__k)));
3974 }
3975 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3976 return __intrin_bitcast<_To>(
3977 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3978 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3979 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3980 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3981 && __have_ssse3)
3982 return __intrin_bitcast<_To>(
3983 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3984 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3985 -1, -1, -1, -1, -1, -1, -1,
3986 -1)));
3987 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3988 {
3989 auto __y
3990 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3991 __y = _mm_packs_epi32(__y, __m128i());
3992 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3993 }
3994 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3995 && __have_ssse3)
3996 return __intrin_bitcast<_To>(
3997 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3998 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3999 -1, -1, -1, -1, -1, -1, -1,
4000 -1)));
4001 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4002 {
4003 const auto __y
4004 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
4005 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
4006 }
4007 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
4008 return __intrin_bitcast<_To>(
4009 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
4010 else
4011 __assert_unreachable<_Tp>();
4012 } // }}}
4013 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
4014 { // AVX -> AVX {{{
4015 if constexpr (_FromBytes == _ToBytes)
4016 __assert_unreachable<_Tp>();
4017 else if constexpr (_FromBytes == _ToBytes * 2)
4018 {
4019 const auto __y = __vector_bitcast<_LLong>(__k);
4020 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4021 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
4022 }
4023 else if constexpr (_FromBytes == _ToBytes * 4)
4024 {
4025 const auto __y = __vector_bitcast<_LLong>(__k);
4026 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4027 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4028 __m128i())));
4029 }
4030 else if constexpr (_FromBytes == _ToBytes * 8)
4031 {
4032 const auto __y = __vector_bitcast<_LLong>(__k);
4033 return __intrin_bitcast<_To>(
4034 _mm256_castsi128_si256(_mm_shuffle_epi8(
4035 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4036 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
4037 -1, -1, -1, -1, -1))));
4038 }
4039 else if constexpr (_FromBytes * 2 == _ToBytes)
4040 {
4041 auto __y = __xzyw(__to_intrin(__k));
4042 if constexpr (is_floating_point_v<_Tp>
4043 || (!__have_avx2 && _FromBytes == 4))
4044 {
4045 const auto __yy = __vector_bitcast<float>(__y);
4046 return __intrin_bitcast<_To>(
4047 _mm256_unpacklo_ps(__yy, __yy));
4048 }
4049 else
4050 return __intrin_bitcast<_To>(
4051 _mm256_unpacklo_epi8(__y, __y));
4052 }
4053 else if constexpr (_FromBytes * 4 == _ToBytes)
4054 {
4055 auto __y
4056 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4057 __lo128(__vector_bitcast<_LLong>(
4058 __k))); // drops 3/4 of input
4059 return __intrin_bitcast<_To>(
4060 __concat(_mm_unpacklo_epi16(__y, __y),
4061 _mm_unpackhi_epi16(__y, __y)));
4062 }
4063 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
4064 {
4065 auto __y
4066 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4067 __lo128(__vector_bitcast<_LLong>(
4068 __k))); // drops 3/4 of input
4069 __y
4070 = _mm_unpacklo_epi16(__y,
4071 __y); // drops another 1/2 => 7/8 total
4072 return __intrin_bitcast<_To>(
4073 __concat(_mm_unpacklo_epi32(__y, __y),
4074 _mm_unpackhi_epi32(__y, __y)));
4075 }
4076 else
4077 __assert_unreachable<_Tp>();
4078 } // }}}
4079 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
4080 { // SSE -> AVX {{{
4081 if constexpr (_FromBytes == _ToBytes)
4082 return __intrin_bitcast<_To>(
4083 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
4084 __zero_extend(__to_intrin(__k))));
4085 else if constexpr (_FromBytes * 2 == _ToBytes)
4086 { // keep all
4087 return __intrin_bitcast<_To>(
4088 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
4089 __vector_bitcast<_LLong>(__k)),
4090 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
4091 __vector_bitcast<_LLong>(__k))));
4092 }
4093 else if constexpr (_FromBytes * 4 == _ToBytes)
4094 {
4095 if constexpr (__have_avx2)
4096 {
4097 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4098 __concat(__vector_bitcast<_LLong>(__k),
4099 __vector_bitcast<_LLong>(__k)),
4100 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
4101 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
4102 6, 6, 7, 7, 7, 7)));
4103 }
4104 else
4105 {
4106 return __intrin_bitcast<_To>(__concat(
4107 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4108 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
4109 2, 2, 2, 2, 3, 3, 3, 3)),
4110 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4111 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
4112 6, 6, 6, 6, 7, 7, 7,
4113 7))));
4114 }
4115 }
4116 else if constexpr (_FromBytes * 8 == _ToBytes)
4117 {
4118 if constexpr (__have_avx2)
4119 {
4120 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4121 __concat(__vector_bitcast<_LLong>(__k),
4122 __vector_bitcast<_LLong>(__k)),
4123 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4124 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4125 3, 3, 3, 3, 3, 3)));
4126 }
4127 else
4128 {
4129 return __intrin_bitcast<_To>(__concat(
4130 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4131 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4132 1, 1, 1, 1, 1, 1, 1, 1)),
4133 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4134 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4135 3, 3, 3, 3, 3, 3, 3,
4136 3))));
4137 }
4138 }
4139 else if constexpr (_FromBytes == _ToBytes * 2)
4140 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4141 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4142 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4143 {
4144 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4145 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4146 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4147 -1, -1, -1, -1, -1, -1, -1,
4148 -1)))));
4149 }
4150 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4151 {
4152 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4153 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4154 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4155 -1, -1, -1, -1, -1, -1, -1,
4156 -1)))));
4157 }
4158 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4159 {
4160 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4161 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4162 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4163 -1, -1, -1, -1, -1, -1, -1,
4164 -1, -1)))));
4165 }
4166 else
4167 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4168 } // }}}
4169 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4170 { // AVX -> SSE {{{
4171 if constexpr (_FromBytes == _ToBytes)
4172 { // keep low 1/2
4173 return __intrin_bitcast<_To>(__lo128(__k));
4174 }
4175 else if constexpr (_FromBytes == _ToBytes * 2)
4176 { // keep all
4177 auto __y = __vector_bitcast<_LLong>(__k);
4178 return __intrin_bitcast<_To>(
4179 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4180 }
4181 else if constexpr (_FromBytes == _ToBytes * 4)
4182 { // add 1/2 undef
4183 auto __y = __vector_bitcast<_LLong>(__k);
4184 return __intrin_bitcast<_To>(
4185 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4186 __m128i()));
4187 }
4188 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4189 { // add 3/4 undef
4190 auto __y = __vector_bitcast<_LLong>(__k);
4191 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4192 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4193 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4194 -1, -1, -1, -1)));
4195 }
4196 else if constexpr (_FromBytes * 2 == _ToBytes)
4197 { // keep low 1/4
4198 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4199 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4200 }
4201 else if constexpr (_FromBytes * 4 == _ToBytes)
4202 { // keep low 1/8
4203 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4204 __y = _mm_unpacklo_epi8(__y, __y);
4205 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4206 }
4207 else if constexpr (_FromBytes * 8 == _ToBytes)
4208 { // keep low 1/16
4209 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4210 __y = _mm_unpacklo_epi8(__y, __y);
4211 __y = _mm_unpacklo_epi8(__y, __y);
4212 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4213 }
4214 else
4215 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4216 } // }}}
4217 else
4218 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4219 /*
4220 if constexpr (_FromBytes > _ToBytes) {
4221 const _To __y = __vector_bitcast<_Up>(__k);
4222 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4223 constexpr int _Stride = _FromBytes / _ToBytes;
4224 return _To{__y[(_Is + 1) * _Stride - 1]...};
4225 }(make_index_sequence<std::min(_ToN, _FromN)>());
4226 } else {
4227 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4228 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4229 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4230 // ...
4231 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4232 constexpr int __dup = _ToBytes / _FromBytes;
4233 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4234 }(make_index_sequence<_FromN>());
4235 }
4236 */
4237 } // }}}
4238 }
4239
4240 // }}}
4241 // _S_to_bits {{{
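// Compresses a vector mask to one bit per element. AVX512's
// vpmov{b,w,d,q}2m reads the sign bits directly; otherwise
// movmskps/movmskpd/pmovmskb do, with 16-bit elements packed down first
// since no 16-bit movemask exists. Illustrative, for _Tp = int, _Np = 4:
//   {-1, 0, -1, -1}  ->  0b1101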
4242 template <typename _Tp, size_t _Np>
4243 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4244 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4245 {
4246 if constexpr (is_same_v<_Tp, bool>)
4247 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4248 else
4249 {
4250 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4251 if (__builtin_is_constant_evaluated()
4252 || __builtin_constant_p(__x._M_data))
4253 {
4254 const auto __bools = -__x._M_data;
4255 const _ULLong __k = __call_with_n_evaluations<_Np>(
4256 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4257 return (__bits | ...);
4258 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4259 return _ULLong(__bools[+__i]) << __i;
4260 });
4261 if (__builtin_is_constant_evaluated()
4262 || __builtin_constant_p(__k))
4263 return __k;
4264 }
4265 const auto __xi = __to_intrin(__x);
4266 if constexpr (sizeof(_Tp) == 1)
4267 if constexpr (sizeof(__xi) == 16)
4268 if constexpr (__have_avx512bw_vl)
4269 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4270 else // implies SSE2
4271 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4272 else if constexpr (sizeof(__xi) == 32)
4273 if constexpr (__have_avx512bw_vl)
4274 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4275 else // implies AVX2
4276 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4277 else // implies AVX512BW
4278 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4279
4280 else if constexpr (sizeof(_Tp) == 2)
4281 if constexpr (sizeof(__xi) == 16)
4282 if constexpr (__have_avx512bw_vl)
4283 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4284 else if constexpr (__have_avx512bw)
4285 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4286 else // implies SSE2
4287 return _BitMask<_Np>(
4288 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4289 else if constexpr (sizeof(__xi) == 32)
4290 if constexpr (__have_avx512bw_vl)
4291 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4292 else if constexpr (__have_avx512bw)
4293 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4294 else // implies SSE2
4295 return _BitMask<_Np>(_mm_movemask_epi8(
4296 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4297 else // implies AVX512BW
4298 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4299
4300 else if constexpr (sizeof(_Tp) == 4)
4301 if constexpr (sizeof(__xi) == 16)
4302 if constexpr (__have_avx512dq_vl)
4303 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4304 else if constexpr (__have_avx512vl)
4305 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4306 else if constexpr (__have_avx512dq)
4307 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4308 else if constexpr (__have_avx512f)
4309 return _BitMask<_Np>(
4310 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4311 else // implies SSE
4312 return _BitMask<_Np>(
4313 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4314 else if constexpr (sizeof(__xi) == 32)
4315 if constexpr (__have_avx512dq_vl)
4316 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4317 else if constexpr (__have_avx512dq)
4318 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4319 else if constexpr (__have_avx512vl)
4320 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4321 else if constexpr (__have_avx512f)
4322 return _BitMask<_Np>(
4323 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4324 else // implies AVX
4325 return _BitMask<_Np>(
4326 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
4327 else // 64-byte __xi implies at least AVX512F
4328 if constexpr (__have_avx512dq)
4329 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4330 else // implies AVX512F
4331 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4332
4333 else if constexpr (sizeof(_Tp) == 8)
4334 if constexpr (sizeof(__xi) == 16)
4335 if constexpr (__have_avx512dq_vl)
4336 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4337 else if constexpr (__have_avx512dq)
4338 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4339 else if constexpr (__have_avx512vl)
4340 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4341 else if constexpr (__have_avx512f)
4342 return _BitMask<_Np>(
4343 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4344 else // implies SSE2
4345 return _BitMask<_Np>(
4346 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4347 else if constexpr (sizeof(__xi) == 32)
4348 if constexpr (__have_avx512dq_vl)
4349 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4350 else if constexpr (__have_avx512dq)
4351 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4352 else if constexpr (__have_avx512vl)
4353 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4354 else if constexpr (__have_avx512f)
4355 return _BitMask<_Np>(
4356 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4357 else // implies AVX
4358 return _BitMask<_Np>(
4359 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
4360 else // 64-byte __xi implies at least AVX512F
4361 if constexpr (__have_avx512dq)
4362 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4363 else // implies AVX512F
4364 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4365
4366 else
4367 __assert_unreachable<_Tp>();
4368 }
4369 }
4370 // }}}
4371};
4372
4373// }}}
4374// _MaskImplX86 {{{
4375template <typename _Abi, typename>
4376 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4377 {
4378 using _MaskImplX86Mixin::_S_to_bits;
4379 using _MaskImplX86Mixin::_S_to_maskvector;
4380 using _MaskImplBuiltin<_Abi>::_S_convert;
4381
4382 // member types {{{
4383 template <typename _Tp>
4384 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4385
4386 template <typename _Tp>
4387 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4388
4389 template <typename _Tp>
4390 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4391
4392 using _Base = _MaskImplBuiltin<_Abi>;
4393
4394 // }}}
4395 // _S_broadcast {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_broadcast(bool __x)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
                     : _MaskMember<_Tp>();
        else
          return _Base::template _S_broadcast<_Tp>(__x);
      }

    // }}}
    // _S_load {{{
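    // Loads _Np contiguous bool bytes (each 0 or 1) and converts them to the
    // mask representation: a bitmask with bit __i == __mem[__i] on AVX-512
    // ABIs, otherwise a vector mask with all-ones elements. E.g.
    // bool __mem[4] = {true, false, true, true} yields the bitmask 0b1101 on
    // an AVX-512 ABI. The *_test_epi8_mask(__a, __a) idiom sets a mask bit
    // for every nonzero byte.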
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_load(const bool* __mem)
      {
        static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
        if (__builtin_is_constant_evaluated())
          {
            if constexpr (__is_avx512_abi<_Abi>())
              {
                _MaskMember<_Tp> __r{};
                for (size_t __i = 0; __i < _S_size<_Tp>; ++__i)
                  __r._M_data |= _ULLong(__mem[__i]) << __i;
                return __r;
              }
            else
              return _Base::template _S_load<_Tp>(__mem);
          }
        else if constexpr (__have_avx512bw)
          {
            const auto __to_vec_or_bits
              = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
                  if constexpr (__is_avx512_abi<_Abi>())
                    return __bits;
                  else
                    return _S_to_maskvector<_Tp>(
                      _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
                };

            if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
              }
            else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
              {
                __m256i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
              }
            else if constexpr (_S_size<_Tp> <= 64)
              {
                __m512i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (_S_size<_Tp> <= 8)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                const auto __b = _mm512_cvtepi8_epi64(__a);
                return _mm512_test_epi64_mask(__b, __b);
              }
            else if constexpr (_S_size<_Tp> <= 16)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                return _mm512_test_epi32_mask(__b, __b);
              }
            else if constexpr (_S_size<_Tp> <= 32)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, 16);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
                const auto __c = _mm512_cvtepi8_epi32(__a);
                return _mm512_test_epi32_mask(__b, __b)
                       | (_mm512_test_epi32_mask(__c, __c) << 16);
              }
            else if constexpr (_S_size<_Tp> <= 64)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, 16);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                __builtin_memcpy(&__a, __mem + 16, 16);
                const auto __c = _mm512_cvtepi8_epi32(__a);
                if constexpr (_S_size<_Tp> <= 48)
                  {
                    __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
                    const auto __d = _mm512_cvtepi8_epi32(__a);
                    return _mm512_test_epi32_mask(__b, __b)
                           | (_mm512_test_epi32_mask(__c, __c) << 16)
                           | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
                  }
                else
                  {
                    // __d covers bytes [32, 48), __e the rest up to _S_size
                    __builtin_memcpy(&__a, __mem + 32, 16);
                    const auto __d = _mm512_cvtepi8_epi32(__a);
                    __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
                    const auto __e = _mm512_cvtepi8_epi32(__a);
                    return _mm512_test_epi32_mask(__b, __b)
                           | (_mm512_test_epi32_mask(__c, __c) << 16)
                           | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
                           | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
                  }
              }
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
          return __vector_bitcast<_Tp>(
            __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
                                   -int(__mem[1]), -int(__mem[1])});
        else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
          {
            int __bool4 = 0;
            __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
            const auto __k = __to_intrin(
              (__vector_broadcast<4>(__bool4)
                 & __make_vector<int>(0x1, 0x100, 0x10000,
                                      _S_size<_Tp> == 4 ? 0x1000000 : 0))
                != 0);
            return __vector_bitcast<_Tp>(
              __concat(_mm_unpacklo_epi32(__k, __k),
                       _mm_unpackhi_epi32(__k, __k)));
          }
        else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
          {
            int __bools = 0;
            __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
            if constexpr (__have_sse2)
              {
                __m128i __k = _mm_cvtsi32_si128(__bools);
                __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
                return __vector_bitcast<_Tp, _S_size<_Tp>>(
                  _mm_unpacklo_epi16(__k, __k));
              }
            else
              {
                __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
                _mm_empty();
                return __vector_bitcast<_Tp, _S_size<_Tp>>(
                  _mm_cmpgt_ps(__k, __m128()));
              }
          }
        else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
          {
            __m128i __k = {};
            __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
            __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
            return __vector_bitcast<_Tp>(
              __concat(_mm_unpacklo_epi16(__k, __k),
                       _mm_unpackhi_epi16(__k, __k)));
          }
        else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
          {
            __m128i __k = {};
            __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
            __k = _mm_cmpgt_epi8(__k, __m128i());
            if constexpr (_S_size<_Tp> <= 8)
              return __vector_bitcast<_Tp, _S_size<_Tp>>(
                _mm_unpacklo_epi8(__k, __k));
            else
              return __concat(_mm_unpacklo_epi8(__k, __k),
                              _mm_unpackhi_epi8(__k, __k));
          }
        else
          return _Base::template _S_load<_Tp>(__mem);
      }

    // }}}
    // _S_from_bitmask {{{
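    // Expands a sanitized bitmask to the ABI's mask representation: AVX-512
    // ABIs store the bits unchanged, everything else widens each bit to an
    // all-ones element via _S_to_maskvector.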
    template <size_t _Np, typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
      {
        static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
        if constexpr (__is_avx512_abi<_Abi>())
          return __bits._M_to_bits();
        else
          return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
      }

    // }}}
    // _S_masked_load {{{2
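    // Replaces the elements of __merge selected by __mask with mask values
    // loaded from the bool array __mem; unselected elements are left
    // untouched. Each loaded byte is 0 or 1 and has to be widened to the
    // canonical mask value (all-zeros or all-ones) of the element type.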
    template <typename _Tp, size_t _Np>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
                     _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              {
                if constexpr (_Np <= 16)
                  {
                    const auto __a
                      = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
                    return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 32)
                  {
                    const auto __a
                      = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
                    return (__merge & ~__mask)
                           | _mm256_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 64)
                  {
                    const auto __a
                      = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
                    return (__merge & ~__mask)
                           | _mm512_test_epi8_mask(__a, __a);
                  }
                else
                  __assert_unreachable<_Tp>();
              }
            else
              {
                _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  __merge._M_set(__i, __mem[__i]);
                });
                return __merge;
              }
          }
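        // The AVX512BW/VL cases below all follow the same pattern: load the
        // selected bool bytes with a masked load, widen them to the element
        // width where needed, and compute 0 - __b per selected lane via
        // *_mask_sub_*. Since __b is 0 or 1, the subtraction yields exactly
        // the canonical mask values 0 and -1, merged into __merge under __k.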
        else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
                                           _mm256_mask_loadu_epi8(__m256i(),
                                                                  __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge
              = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
                                  __m128i(),
                                  _mm_mask_loadu_epi8(__m128i(), __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi32(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi64(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else
          return _Base::_S_masked_load(__merge, __mask, __mem);
        return __merge;
      }

    // _S_store {{{2
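    // Stores the mask as an array of bool, i.e. one byte per element with
    // value 0 or 1. The AVX-512 branches materialize the 0/1 bytes directly
    // from the bitmask; the SSE/AVX branches narrow the vector mask to bytes
    // and mask it down to the low bit.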
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr void
      _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
      {
        if (__builtin_is_constant_evaluated())
          _Base::_S_store(__v, __mem);
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  if constexpr (_Np <= 16)
                    return _mm_maskz_set1_epi8(__data, 1);
                  else if constexpr (_Np <= 32)
                    return _mm256_maskz_set1_epi8(__data, 1);
                  else
                    return _mm512_maskz_set1_epi8(__data, 1);
                }(__v._M_data)),
                __mem);
            else if constexpr (_Np <= 8)
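              // _pdep deposits bit __i of the mask into the low bit of byte
              // __i, producing the bool bytes in one step; e.g.
              // _pdep_u64(0b1011, 0x0101010101010101ULL) == 0x01000101.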
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>(
#if defined __x86_64__
                  __make_wrapper<_ULLong>(
                    _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
#else
                  __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
                                        _pdep_u32(__v._M_data >> 4,
                                                  0x01010101U))
#endif
                  ),
                __mem);
            else if constexpr (_Np <= 16)
              _mm512_mask_cvtepi32_storeu_epi8(
                __mem, 0xffffu >> (16 - _Np),
                _mm512_maskz_set1_epi32(__v._M_data, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (__is_sse_abi<_Abi>()) //{{{
          {
            if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                const auto __k = __vector_bitcast<int>(__v);
                __mem[0] = -__k[1];
                __mem[1] = -__k[3];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    const unsigned __bool4
                      = __vector_bitcast<_UInt>(_mm_packs_epi16(
                          _mm_packs_epi32(__intrin_bitcast<__m128i>(
                                            __to_intrin(__v)),
                                          __m128i()),
                          __m128i()))[0]
                        & 0x01010101u;
                    __builtin_memcpy(__mem, &__bool4, _Np);
                  }
                else if constexpr (__have_mmx)
                  {
                    const __m64 __k = _mm_cvtps_pi8(
                      __and(__to_intrin(__v), _mm_set1_ps(1.f)));
                    __builtin_memcpy(__mem, &__k, _Np);
                    _mm_empty();
                  }
                else
                  return _Base::_S_store(__v, __mem);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
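                // Each 16-bit mask element is 0 or 0xffff; shifting right by
                // 15 leaves 0 or 1, and the saturating pack narrows those
                // words to the bool bytes.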
                _CommonImplX86::_S_store<_Np>(
                  __vector_bitcast<char>(_mm_packs_epi16(
                    __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
                    __m128i())),
                  __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else if constexpr (__is_avx_abi<_Abi>()) // {{{
          {
            if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                int __bool4{};
                if constexpr (__have_avx2)
                  __bool4 = _mm256_movemask_epi8(__k);
                else
                  __bool4 = (_mm_movemask_epi8(__lo128(__k))
                             | (_mm_movemask_epi8(__hi128(__k)) << 16));
                __bool4 &= 0x01010101;
                __builtin_memcpy(__mem, &__bool4, _Np);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
              {
                const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                const auto __k2
                  = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
                                   15);
                const auto __k3
                  = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
                _CommonImplX86::_S_store<_Np>(__k3, __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
              {
                if constexpr (__have_avx2)
                  {
                    const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
                    const auto __bools = __vector_bitcast<char>(
                      _mm_packs_epi16(__lo128(__x), __hi128(__x)));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
                else
                  {
                    const auto __bools
                      = 1
                        & __vector_bitcast<_UChar>(
                            _mm_packs_epi16(__lo128(__to_intrin(__v)),
                                            __hi128(__to_intrin(__v))));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
              }
            else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else
          __assert_unreachable<_Tp>();
      }

    // _S_masked_store {{{2
    template <typename _Tp, size_t _Np>
      static inline void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
                      const _SimdWrapper<_Tp, _Np> __k) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            static_assert(is_same_v<_Tp, bool>);
            if constexpr (_Np <= 16 && __have_avx512bw_vl)
              _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 16)
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
                                               _mm512_maskz_set1_epi32(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw_vl)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      _mm256_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      __lo256(_mm512_maskz_set1_epi8(__v, 1)));
            else if constexpr (_Np <= 64 && __have_avx512bw)
              _mm512_mask_storeu_epi8(__mem, __k,
                                      _mm512_maskz_set1_epi8(__v, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }

    // logical and bitwise operators {{{2
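    // For AVX-512 ABIs (_Tp == bool) these operators map directly to the
    // k-register intrinsics. Note the capability split: the 16-bit forms
    // are plain AVX512F, the 8-bit forms need AVX512DQ, and the 32/64-bit
    // forms need AVX512BW; vector masks fall back to the generic base class.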
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data & __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data | __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>();
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kandn_mask8(__x._M_data,
                                  _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (_Np <= 16)
              return _kandn_mask16(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kandn_mask32(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kandn_mask64(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_not(__x);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data & __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data | __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data ^ __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kxor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kxor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kxor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kxor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_xor(__x, __y);
      }

    //}}}2
    // _S_masked_assign {{{
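    // With bitmask representation the blend is plain bit arithmetic: for
    // every bit, take __rhs where __k is set and keep __lhs elsewhere.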
    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs)
      {
        __lhs._M_data
          = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
      }

    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
      {
        if (__rhs)
          __lhs._M_data = __k._M_data | __lhs._M_data;
        else
          __lhs._M_data = ~__k._M_data & __lhs._M_data;
      }

    using _MaskImplBuiltin<_Abi>::_S_masked_assign;

    //}}}
    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testc(__a, __b);
              }
            else if constexpr (__is_x86_ps<_Tp>())
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else if constexpr (__is_x86_pd<_Tp>())
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                     == (1 << (_Np * sizeof(_Tp))) - 1;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
            const auto __kk = __k._M_data._M_data;
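            // kortest* returns the carry flag, which is set iff the OR of
            // the two mask operands is all ones. OR-ing __kk with the
            // complement of the implicit mask forces the padding bits to 1,
            // so the result is true iff every element inside the ABI's
            // extent is true.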
            if constexpr (sizeof(__kk) == 1)
              {
                if constexpr (__have_avx512dq)
                  return _kortestc_mask8_u8(__kk, _Mask == 0xff
                                                    ? __kk
                                                    : __mmask8(~_Mask));
                else
                  return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
              }
            else if constexpr (sizeof(__kk) == 2)
              return _kortestc_mask16_u8(__kk, _Mask == 0xffff
                                                 ? __kk
                                                 : __mmask16(~_Mask));
            else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
              return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
                                                 ? __kk
                                                 : __mmask32(~_Mask));
            else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
              return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
                                                 ? __kk
                                                 : __mmask64(~_Mask));
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<_Tp>
                                || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 == __testz(__a, __b);
                  }
                else
                  return 0 == __testz(__a, __a);
              }
            else if constexpr (__is_x86_ps<_Tp>())
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
            else if constexpr (__is_x86_pd<_Tp>())
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                     != 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 != 0;
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<_Tp>
                                || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 != __testz(__a, __b);
                  }
                else
                  return 0 != __testz(__a, __a);
              }
            else if constexpr (__is_x86_ps<_Tp>())
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else if constexpr (__is_x86_pd<_Tp>())
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else
              return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
                     == 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 == 0;
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testnzc(__a, __b);
              }
            else if constexpr (__is_x86_ps<_Tp>())
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_ps(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else if constexpr (__is_x86_pd<_Tp>())
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_pd(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else
              {
                constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
                const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return _S_any_of(__k) && !_S_all_of(__k);
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
        const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (_Np > 32)
              return __builtin_popcountll(__kk);
            else
              return __builtin_popcount(__kk);
          }
        else
          {
            if constexpr (__have_popcnt)
              {
                int __bits
                  = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
                const int __count = __builtin_popcount(__bits);
                return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
              }
            else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
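                // _mm_movemask_pd returns the two sign bits, i.e. a value
                // in [0, 3]. The popcount of a 2-bit value m is
                // (m & 1) + (m >> 1), which equals m - (m >> 1).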
                const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
                return __mask - (__mask >> 1);
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __x = -(__lo128(__kk) + __hi128(__kk));
                return __x[0] + __x[1];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
                    __x = _mm_add_epi32(
                      __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                    __x = _mm_add_epi32(
                      __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
                    return -_mm_cvtsi128_si32(__x);
                  }
                else
                  return __builtin_popcount(
                    _mm_movemask_ps(__auto_bitcast(__kk)));
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi16(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                return -short(_mm_extract_epi16(__x, 0));
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi8(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                auto __y = -__vector_bitcast<_UChar>(__x);
                if constexpr (__have_sse4_1)
                  return __y[0] + __y[1];
                else
                  {
                    unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
                    return (__z & 0xff) + (__z >> 8);
                  }
              }
            else if constexpr (sizeof(__kk) == 32)
              {
                // The following works only as long as the implementations
                // above use a summation.
                using _I = __int_for_sizeof_t<_Tp>;
                const auto __as_int = __vector_bitcast<_I>(__kk);
                return _MaskImplX86<simd_abi::__sse>::_S_popcount(
                  simd_mask<_I, simd_abi::__sse>(__private_init,
                                                 __lo128(__as_int)
                                                   + __hi128(__as_int)));
              }
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_find_first_set {{{
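    // For AVX-512 ABIs the index of the first true element is the number of
    // trailing zeros in the bitmask. No masking of padding bits is needed
    // here: set bits above the valid elements cannot change the position of
    // the lowest set bit (and the caller must guarantee at least one element
    // is true).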
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__countr_zero(__k._M_data._M_data);
        else
          return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
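    // The index of the last true element is __bit_width(bits) - 1. Unlike
    // _S_find_first_set, the bitmask must be masked to the ABI's extent
    // first, because set padding bits above the valid elements would yield
    // a too-large index.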
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__bit_width(_Abi::_S_masked(__k._M_data)._M_data) - 1;
        else
          return _Base::_S_find_last_set(__k);
      }

    // }}}
  };

// }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80