// libstdc++: experimental/bits/simd_x86.h
// Simd x86 specific implementations -*- C++ -*-

// Copyright (C) 2020-2026 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// __to_masktype {{{
// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
// __vector_type_t.
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }

template <typename _TV,
          typename _TVT
            = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }

// }}}
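// Illustrative sketch (not part of the library): __to_masktype only retypes;
// the bit pattern is unchanged. E.g. the result of a float comparison turns
// into an int vector of equal width:
//   __vector_type_t<float, 4> __m = ...;          // e.g. a cmpeq result
//   auto __im = __to_masktype(__m);
//   static_assert(is_same_v<decltype(__im), __vector_type_t<int, 4>>);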
// __interleave128_lo {{{
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }

// }}}
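// Illustrative sketch (not part of the library): the interleave happens per
// 128-bit lane, zipping the low half of each lane. For 4-element vectors in
// a 16-byte register:
//   __vector_type_t<float, 4> __a = {0, 1, 2, 3};
//   __vector_type_t<float, 4> __b = {4, 5, 6, 7};
//   // __interleave128_lo(__a, __b) == {0, 4, 1, 5}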
// __is_zero{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (__have_avx)
          {
            if constexpr (_TVT::template _S_is<float, 8>)
              return _mm256_testz_ps(__a, __a);
            else if constexpr (_TVT::template _S_is<double, 4>)
              return _mm256_testz_pd(__a, __a);
            else if constexpr (sizeof(_Tp) == 32)
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<float>)
              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<double, 2>)
              return _mm_testz_pd(__a, __a);
            else
              return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
          }
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
                                 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;

    // Fallback (also reached at runtime without SSE4.1): OR the halves
    // together until the value fits into a pair of 64-bit lanes.
    const auto __b = __vector_bitcast<_LLong>(__a);
    if constexpr (sizeof(__b) == 16)
      return (__b[0] | __b[1]) == 0;
    else if constexpr (sizeof(__b) == 32)
      return __is_zero(__lo128(__b) | __hi128(__b));
    else if constexpr (sizeof(__b) == 64)
      return __is_zero(__lo256(__b) | __hi256(__b));
    else
      __assert_unreachable<_Tp>();
  }

// }}}
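// Illustrative sketch (not part of the library):
//   __vector_type_t<int, 4> __v = {};
//   __is_zero(__v); // true; any single set bit anywhere yields false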
// __movemask{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }

// }}}
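// Illustrative sketch (not part of the library): __movemask gathers the sign
// bit of every element (of every byte in the integer case) into an int:
//   __vector_type_t<float, 4> __m = {-1.f, 0.f, -1.f, 0.f};
//   // __movemask(__m) == 0b0101 (bits 0 and 2 set)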
// __testz{{{
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
            else
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }

// }}}
// __testc{{{
// requires SSE4.1 or above
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_testc_ps(__a, __b);
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_testc_pd(__a, __b);
        else
          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }

// }}}
// __testnzc{{{
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testnzc_ps(__a, __b);
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testnzc_pd(__a, __b);
            else
              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) == 0
                   && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }

// }}}
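// Illustrative summary (not part of the library) of the three predicates,
// matching the x86 ptest/vtestp[sd] semantics:
//   __testz(a, b)   != 0  iff  (a & b)  is all zeros
//   __testc(a, b)   != 0  iff  (~a & b) is all zeros
//   __testnzc(a, b) != 0  iff  neither (a & b) nor (~a & b) is all zeros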
// __xzyw{{{
// Shuffles the complete vector, swapping the inner two quarters; often
// useful on AVX for fixing up a shuffle result.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 16)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
        return reinterpret_cast<_Tp>(
                 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(
                 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
                                                   __x[5], __x[2], __x[3],
                                                   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }

// }}}
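// Illustrative sketch (not part of the library): for a 32-byte vector of 4
// doubles the two middle quarters are swapped:
//   __vector_type_t<double, 4> __v = {0, 1, 2, 3};
//   // __xzyw(__v) == {0, 2, 1, 3}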
// __maskload_epi32{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi32(const int* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi32(__ptr, __k);
    else
      return _mm256_maskload_epi32(__ptr, __k);
  }

// }}}
// __maskload_epi64{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi64(const _LLong* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi64(__ptr, __k);
    else
      return _mm256_maskload_epi64(__ptr, __k);
  }

// }}}
// __maskload_ps{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_ps(const float* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_ps(__ptr, __k);
    else
      return _mm256_maskload_ps(__ptr, __k);
  }

// }}}
// __maskload_pd{{{
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_pd(const double* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_pd(__ptr, __k);
    else
      return _mm256_maskload_pd(__ptr, __k);
  }

// }}}
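// Illustrative sketch (not part of the library): the __maskload_* helpers
// only dispatch on the mask width; lanes whose mask element has its most
// significant bit set are loaded, the rest become zero:
//   alignas(16) const float __data[4] = {1, 2, 3, 4};
//   const __m128i __k = _mm_setr_epi32(-1, 0, -1, 0);
//   // __maskload_ps(__data, __k) == {1, 0, 3, 0}  (requires AVX)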

#ifdef _GLIBCXX_CLANG
template <size_t _Np, typename _Tp, typename _Kp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __movm(_Kp __k) noexcept
  {
    static_assert(is_unsigned_v<_Kp>);
    if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
      {
        if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b128(__k);
        else if constexpr (_Np <= 32 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b256(__k);
        else
          return __builtin_ia32_cvtmask2b512(__k);
      }
    else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
      {
        if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w128(__k);
        else if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w256(__k);
        else
          return __builtin_ia32_cvtmask2w512(__k);
      }
    else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
      {
        if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d128(__k);
        else if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d256(__k);
        else
          return __builtin_ia32_cvtmask2d512(__k);
      }
    else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
      {
        if constexpr (_Np <= 2 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q128(__k);
        else if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q256(__k);
        else
          return __builtin_ia32_cvtmask2q512(__k);
      }
    else
      __assert_unreachable<_Tp>();
  }
#endif // _GLIBCXX_CLANG

#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
#include "simd_x86_conversions.h"
#endif
// ISA & type detection {{{
template <typename _Tp>
  constexpr bool
  __is_x86_ps()
  {
    return is_same_v<_Tp, float>;
  }

template <typename _Tp>
  constexpr bool
  __is_x86_pd()
  {
    if constexpr (is_same_v<_Tp, double>)
      return true;
#if __LDBL_MANT_DIG__ == __DBL_MANT_DIG__
    else if constexpr (is_same_v<_Tp, long double>)
      return true;
#endif
    else
      return false;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_ps()
  {
    return __have_sse
             && __is_x86_ps<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_pd()
  {
    return __have_sse2
             && __is_x86_pd<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_ps()
  {
    return __have_avx
             && __is_x86_ps<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_pd()
  {
    return __have_avx
             && __is_x86_pd<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_ps()
  {
    return __have_avx512f
             && __is_x86_ps<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_pd()
  {
    return __have_avx512f
             && __is_x86_pd<_Tp>()
             && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

// }}}
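// Illustrative sketch (not part of the library): these predicates combine
// the element type with the register width, e.g.
//   __is_sse_ps<float, 4>()     // float in a 16-byte register
//   __is_avx_ps<float, 8>()     // float in a 32-byte register
//   __is_avx512_pd<double, 8>() // double in a 64-byte register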
struct _MaskImplX86Mixin;

// _CommonImplX86 {{{
struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  // _S_converts_via_decomposition {{{
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool
    _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
                      && sizeof(_From) == 8 && _ToSize == 16)
        return (sizeof(_To) == 2 && !__have_ssse3)
                 || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
        return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
                  && !__have_avx512dq)
                 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
                       && _ToSize == 16);
      else if constexpr (is_integral_v<_From> && is_floating_point_v<_To>
                           && sizeof(_From) == 8 && !__have_avx512dq)
        return (sizeof(_To) == 4 && _ToSize == 16)
                 || (sizeof(_To) == 8 && _ToSize < 64);
      else
        return false;
    }

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();

  // }}}
#endif
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store(__x, __addr);
      else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
        {
          const auto __v = __to_intrin(__x);

          if constexpr (_Bytes & 1)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
                                     __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
                                        __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi8(__addr,
                                        0xffffffffffffffffull >> (64 - _Bytes),
                                        __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 2)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi16(__addr,
                                         0xffffffffull >> (32 - _Bytes / 2),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 4)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else
            {
              static_assert(
                _Bytes > 16,
                "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
                "- 1)) != 0 is impossible");
              if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
                                         __intrin_bitcast<__m512i>(__v));
            }
        }
      else
        _CommonImplBuiltin::_S_store(__x, __addr);
    }

  // }}}
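  // Illustrative sketch (not part of the library): a 12-byte (3 x int) store
  // takes the "_Bytes & 4" branch above and issues
  //   _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - 3), __v); // mask 0b0111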
  // _S_store_bool_array(_BitMask) {{{
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
      else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
        _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
                            [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                              if constexpr (_Np <= 16)
                                return _mm_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 32)
                                return _mm256_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 64)
                                return _mm512_movm_epi8(__x._M_to_bits());
                              else
                                __assert_unreachable<_SizeConstant<_Np>>();
                            }()),
                      __mem);
      else if constexpr (__have_bmi2)
        {
          if constexpr (_Np <= 4)
            _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
          else
            __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
              [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                constexpr size_t __offset = __i * sizeof(size_t);
                constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                if constexpr (__todo == 1)
                  __mem[__offset] = __x[__offset];
                else
                  {
                    const auto __bools =
#ifdef __x86_64__
                      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
                                0x0101010101010101ULL);
#else  // __x86_64__
                      _pdep_u32(
                        __x.template _M_extract<__offset>()._M_to_bits(),
                        0x01010101U);
#endif // __x86_64__
                    _S_store<__todo>(__bools, __mem + __offset);
                  }
              });
        }
      else if constexpr (__have_sse2 && _Np > 7)
        __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          constexpr int __offset = __i * 16;
          constexpr int __todo = std::min(16, int(_Np) - __offset);
          const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
          __vector_type16_t<_UChar> __bools;
          if constexpr (__have_avx512f)
            {
              auto __as32bits
                = _mm512_maskz_mov_epi32(__bits, __to_intrin(
                                                   __vector_broadcast<16>(1)));
              auto __as16bits
                = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
                                            __todo > 8 ? __hi256(__as32bits)
                                                       : __m256i()));
              __bools = __vector_bitcast<_UChar>(
                          _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
            }
          else
            {
              using _V = __vector_type_t<_UChar, 16>;
              auto __tmp = _mm_cvtsi32_si128(__bits);
              __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
              _V __tmp2 = reinterpret_cast<_V>(__tmp);
              __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
                           1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
              __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
            }
          _S_store<__todo>(__bools, __mem + __offset);
        });
      else
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }

  // }}}
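  // Illustrative sketch (not part of the library): with BMI2, _pdep_u32
  // spreads one mask bit into each byte; e.g. the bits 0b1011 become the
  // bytes 01 01 00 01 (from low to high address), i.e. four bools.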
  // _S_blend_avx512 {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
  // __k
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
#ifdef _GLIBCXX_CLANG
      return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
#else
      using _IntT
        = conditional_t<(sizeof(_Tp) > 2),
                        conditional_t<sizeof(_Tp) == 4, int, long long>,
                        conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
                     __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
        }
#endif
    }

  // }}}
  // _S_blend_intrin {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
  // Bytes wide
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      constexpr struct
      {
        _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
                                                  __m128 __k) const noexcept
        {
          return __builtin_ia32_blendvps(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
                                                   __m128d __k) const noexcept
        {
          return __builtin_ia32_blendvpd(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
                                                   __m128i __k) const noexcept
        {
          return reinterpret_cast<__m128i>(
                   __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
                                              reinterpret_cast<__v16qi>(__b),
                                              reinterpret_cast<__v16qi>(__k)));
        }
        _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
                                                  __m256 __k) const noexcept
        {
          return __builtin_ia32_blendvps256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
                                                   __m256d __k) const noexcept
        {
          return __builtin_ia32_blendvpd256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
                                                   __m256i __k) const noexcept
        {
          if constexpr (__have_avx2)
            return reinterpret_cast<__m256i>(
                     __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
                                                reinterpret_cast<__v32qi>(__b),
                                                reinterpret_cast<__v32qi>(__k)));
          else
            return reinterpret_cast<__m256i>(
                     __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
                                                reinterpret_cast<__v8sf>(__b),
                                                reinterpret_cast<__v8sf>(__k)));
        }
      } __eval;
      return __eval(__a, __b, __k);
    }

  // }}}
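  // Illustrative sketch (not part of the library): the blendv instructions
  // used above select on the most significant bit of each mask element (of
  // each byte for pblendvb), so lane __i of the result is
  //   (__k[__i] MSB set) ? __b[__i] : __a[__i]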
  // _S_blend {{{
  // Returns: __k ? __at1 : __at0
  // TODO: reverse __at0 and __at1 to match COND_EXPR
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
             _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
            && __at1._M_is_constprop())
        return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
                 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return __k[__i] ? __at1[__i] : __at0[__i];
                 });
      else if constexpr (sizeof(__at0) == 64
                           || (__have_avx512vl && sizeof(__at0) >= 16))
        return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
        {
          static_assert((__have_avx512vl && sizeof(__at0) < 16)
                          || !__have_avx512vl);
          constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
          return __vector_bitcast<_Tp, _Np>(
                   _S_blend_avx512(__k._M_data,
                                   __vector_bitcast<_Tp, __size>(__at0),
                                   __vector_bitcast<_Tp, __size>(__at1)));
        }
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
            || (__kk._M_is_constprop() && __at0._M_is_constprop()
                  && __at1._M_is_constprop()))
        {
          auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
          if (__r._M_is_constprop())
            return __r;
        }
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
                      && (sizeof(_Tp) >= 4 || __have_avx512bw))
        // convert to bitmask and call overload above
        return _S_blend(
                 _SimdWrapper<bool, _Np>(
                   __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
                     ._M_to_bits()),
                 __at0, __at1);
      else
        {
          // Since GCC does not know that __k is a mask (all-zeros or all-ones
          // per element), the builtin conditional operator would introduce an
          // extra compare against 0 before blending; therefore we call the
          // blend intrinsic directly.
          if constexpr (__have_sse4_1)
            return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
                                   __to_intrin(__at1));
          else
            return __or(__andnot(__kk, __at0), __and(__kk, __at1));
        }
    }

  // }}}
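  // Illustrative summary (not part of the library): for both _S_blend
  // overloads a set mask bit (or true mask lane) selects from the second
  // value argument, i.e. _S_blend(__k, __at0, __at1) == (__k ? __at1 : __at0)
  // element-wise.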
};

// }}}
// _SimdImplX86 {{{
template <typename _Abi, typename>
  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size
        = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
            : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
            : 16;

    using _MaskImpl = typename _Abi::_MaskImpl;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        static_assert(_Np == _S_size<_Tp>);
        // Either no conversion is needed, or the conversion is a plain bit
        // reinterpretation (same size and same integral-ness).
        if constexpr (is_same_v<_Tp, _Up>
                        || (sizeof(_Tp) == sizeof(_Up)
                              && is_integral_v<_Tp> == is_integral_v<_Up>))
          {
            [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
            if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                            && sizeof(_Tp) == 1)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                                 && sizeof(_Tp) == 2)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                                 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                                 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_ps(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                                 && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_epi32(reinterpret_cast<const int*>(__mem),
                                            __to_intrin(__k))));
              }
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_ps(reinterpret_cast<const float*>(__mem),
                                         __to_intrin(__k))));
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                                 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                                 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm256_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                              _mm512_mask_loadu_pd(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                                 && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(__maskload_epi64(
                           reinterpret_cast<const _LLong*>(__mem),
                           __to_intrin(__k))));
              }
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_pd(reinterpret_cast<const double*>(__mem),
                                         __to_intrin(__k))));
              }
            else
              _BitOps::_S_bit_iteration(
                _MaskImpl::_S_to_bits(__k),
                [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
                });
          }
        /* Very uncertain that the following improves anything. Needs
         * benchmarking before it's activated.
        else if constexpr (sizeof(_Up) <= 8 && // no long double
                           !__converts_via_decomposition_v<
                             _Up, _Tp,
                             sizeof(__merge)> // conversion via decomposition
                                              // is better handled via the
                                              // bit_iteration fallback below
        )
          {
            // TODO: copy pattern from _S_masked_store, which doesn't resort to
            // fixed_size
            using _Ap = simd_abi::deduce_t<_Up, _Np>;
            using _ATraits = _SimdTraits<_Up, _Ap>;
            using _AImpl = typename _ATraits::_SimdImpl;
            typename _ATraits::_SimdMember __uncvted{};
            typename _ATraits::_MaskMember __kk
              = _Ap::_MaskImpl::template _S_convert<_Up>(__k);
            __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
            _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
            _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
          }
        */
        else
          __merge = _Base::_S_masked_load(__merge, __k, __mem);
        return __merge;
      }

    // }}}
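    // Illustrative sketch (not part of the library): lanes with a set mask
    // bit are loaded from __mem; all other lanes keep their value in __merge.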
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<bool, _Np> __k)
      {
        [[maybe_unused]] const auto __vi = __to_intrin(__v);
        if constexpr (sizeof(__vi) == 64)
          {
            static_assert(sizeof(__v) == 64 && __have_avx512f);
            if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
              _mm512_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
              _mm512_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_pd(__mem, __k, __vi);
              }
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (sizeof(__vi) == 32)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                                 && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                // use a 512-bit maskstore, using zero-extension of the bitmask
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else if constexpr (sizeof(__vi) == 16)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                                 && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                // use a 512-bit maskstore, using zero-extension of the bitmask
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else
          __assert_unreachable<_Tp>();
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
      {
        if constexpr (sizeof(__v) <= 16)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m128i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m128i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                                 && is_integral_v<_Tp>)
              _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                               __vector_bitcast<float>(__vi));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                                 && is_integral_v<_Tp>)
              _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                               __vector_bitcast<double>(__vi));
            else
              _Base::_S_masked_store_nocvt(__v, __mem, __k);
          }
        else if constexpr (sizeof(__v) == 32)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m256i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m256i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                                 && is_integral_v<_Tp>)
              _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (sizeof(_Tp) == 4)
              _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                                  __vector_bitcast<float>(__v));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                                 && is_integral_v<_Tp>)
              _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
                                     __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                                  __vector_bitcast<double>(__v));
            else
              _Base::_S_masked_store_nocvt(__v, __mem, __k);
          }
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_masked_store {{{
    template <typename _Tp, size_t _Np, typename _Up>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
                      const _MaskMember<_Tp> __k) noexcept
      {
        if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
                        && sizeof(_Tp) > sizeof(_Up) && __have_avx512f
                        && (sizeof(_Tp) >= 4 || __have_avx512bw)
                        && (sizeof(__v) == 64 || __have_avx512vl))
          { // truncating store
            const auto __vi = __to_intrin(__v);
            const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
            if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                            && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 64)
              _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 32)
              _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                                 && sizeof(__vi) == 16)
              _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }

    // }}}
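    // Illustrative sketch (not part of the library): one of the truncating
    // stores above, long long -> int with AVX-512VL:
    //   _mm256_mask_cvtepi64_storeu_epi32(__mem, 0b0101, __vi);
    //   // writes the low 32 bits of lanes 0 and 2 to __mem[0] and __mem[2]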
    // _S_multiplies {{{
    template <typename _V, typename _VVT = _VectorTraits<_V>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _V
      _S_multiplies(_V __x, _V __y)
      {
        using _Tp = typename _VVT::value_type;
        if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
              || __y._M_is_constprop())
          return __as_vector(__x) * __as_vector(__y);
        else if constexpr (sizeof(_Tp) == 1)
          {
            if constexpr (sizeof(_V) == 2)
              {
                const auto __xs = reinterpret_cast<short>(__x._M_data);
                const auto __ys = reinterpret_cast<short>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
                         ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
              }
            else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 3>>(
                         ((__xi * __yi) & 0xff)
                           | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                           | ((__xi >> 16) * (__yi & 0xff0000)));
              }
            else if constexpr (sizeof(_V) == 4)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 4>>(
                         ((__xi * __yi) & 0xff)
                           | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                           | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
                           | ((__xi >> 24) * (__yi & 0xff000000u)));
              }
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                                 && is_signed_v<_Tp>)
              return __convert<typename _VVT::type>(
                       __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
                         * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                                 && is_unsigned_v<_Tp>)
              return __convert<typename _VVT::type>(
                       __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
                         * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
            else
              {
                // codegen of `x*y` is suboptimal (as of GCC 9.0.1)
                constexpr size_t __full_size = _VVT::_S_full_size;
                constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
                using _ShortW = _SimdWrapper<short, _Np>;
                const _ShortW __even = __vector_bitcast<short, _Np>(__x)
                                         * __vector_bitcast<short, _Np>(__y);
                _ShortW __high_byte = _ShortW()._M_data - 256;
                //[&]() { asm("" : "+x"(__high_byte._M_data)); }();
                const _ShortW __odd
                  = (__vector_bitcast<short, _Np>(__x) >> 8)
                      * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
                if constexpr (__have_avx512bw && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_avx512(
                           0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
                           __vector_bitcast<_Tp>(__odd));
                else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_intrin(__to_intrin(__high_byte),
                                                         __to_intrin(__even),
                                                         __to_intrin(__odd));
                else
                  return __to_intrin(
                           __or(__andnot(__high_byte, __even), __odd));
              }
          }
        else
          return _Base::_S_multiplies(__x, __y);
      }

    // }}}
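    // Illustrative sketch (not part of the library) of the even/odd trick
    // above: the 16-bit products of even-indexed bytes keep their low byte,
    // the products of odd-indexed bytes are masked to the high byte, and the
    // two halves are blended back together byte-wise.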
    // _S_divides {{{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if (!__builtin_is_constant_evaluated()
              && !__builtin_constant_p(__y._M_data))
          if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
            { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
              // Note that using floating-point division is likely to raise the
              // *Inexact* exception flag and thus appears like an invalid
              // "as-if" transformation. However, C++ doesn't specify how the
              // fpenv can be observed and points to C. C says that function
              // calls are assumed to potentially raise fp exceptions, unless
              // documented otherwise. Consequently, operator/, which is a
              // function call, may raise fp exceptions.
              /*const struct _CsrGuard
              {
                const unsigned _M_data = _mm_getcsr();
                _CsrGuard()
                { _mm_setcsr(0x9f80); } // turn off FP exceptions and flush-to-zero
                ~_CsrGuard() { _mm_setcsr(_M_data); }
              } __csr;*/
              using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
              constexpr size_t __n_intermediate
                = std::min(_Np, (__have_avx512f ? 64
                                   : __have_avx ? 32
                                                : 16)
                                  / sizeof(_Float));
              using _FloatV = __vector_type_t<_Float, __n_intermediate>;
              constexpr size_t __n_floatv
                = __div_roundup(_Np, __n_intermediate);
              using _R = __vector_type_t<_Tp, _Np>;
              const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
              const auto __yf = __convert_all<_FloatV, __n_floatv>(
                                  _Abi::__make_padding_nonzero(__as_vector(__y)));
              return __call_with_n_evaluations<__n_floatv>(
                       [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                         return __vector_convert<_R>(__quotients...);
                       },
                       [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
                         -> _SimdWrapper<_Float, __n_intermediate>
                       {
#if __RECIPROCAL_MATH__
                         // If -freciprocal-math is active, using the `/` operator is
                         // incorrect because it may be translated to an imprecise
                         // multiplication with reciprocal. We need to use inline
                         // assembly to force a real division.
                         _FloatV __r;
                         if constexpr (__have_avx) // -mno-sse2avx is irrelevant
                                                   // because once -mavx is given, GCC
                                                   // emits VEX encoded vdivp[sd]
                           {
                             if constexpr (sizeof(_Tp) == 4)
                               asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
                                   : "=x"(__r)
                                   : "x"(__xf[__i]), "x"(__yf[__i]));
                             else
                               asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
                                   : "=x"(__r)
                                   : "x"(__xf[__i]), "x"(__yf[__i]));
                           }
                         else
                           {
                             __r = __xf[__i];
                             if constexpr (sizeof(_Tp) == 4)
                               asm("divpd\t{%1, %0|%0, %1}"
                                   : "=x"(__r)
                                   : "x"(__yf[__i]));
                             else
                               asm("divps\t{%1, %0|%0, %1}"
                                   : "=x"(__r)
                                   : "x"(__yf[__i]));
                           }
                         return __r;
#else
                         return __xf[__i] / __yf[__i];
#endif
                       });
            }
        /* 64-bit int division is potentially optimizable via double division if
         * the value in __x is small enough and the conversion between
         * int<->double is efficient enough:
        else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
                           sizeof(_Tp) == 8)
          {
            if constexpr (__have_sse4_1 && sizeof(__x) == 16)
              {
                if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
                                                    0xffe0'0000'0000'0000ull}))
                  {
                    __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
                  }
              }
          }
        */
        return _Base::_S_divides(__x, __y);
      }
#else
    using _Base::_S_divides;
#endif // _GLIBCXX_SIMD_WORKAROUND_PR90993

    // }}}
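    // Illustrative note (an assumption behind the fast path above): double
    // has a 53-bit mantissa, so every 32-bit integer converts exactly and the
    // rounded quotient truncates to the correct integer result; likewise
    // float suffices for the 8- and 16-bit element types.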
    // _S_modulus {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if (__builtin_is_constant_evaluated()
              || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
          return _Base::_S_modulus(__x, __y);
        else
          return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
      }

    // }}}
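    // Illustrative sketch (not part of the library): the identity used above,
    //   __x % __y == __x - __y * (__x / __y),
    // lets _S_modulus reuse the optimized _S_divides instead of scalar '%'.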
1540 // _S_bit_shift_left {{{
1541 // Notes on UB. C++2a [expr.shift] says:
1542 // -1- [...] The operands shall be of integral or unscoped enumeration type
1543 // and integral promotions are performed. The type of the result is that
1544 // of the promoted left operand. The behavior is undefined if the right
1545 // operand is negative, or greater than or equal to the width of the
1546 // promoted left operand.
1547 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1548 // 2^N, where N is the width of the type of the result.
1549 //
1550 // C++17 [expr.shift] says:
1551 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1552 // bits are zero-filled. If E1 has an unsigned type, the value of the
1553 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1554 // representable in the result type. Otherwise, if E1 has a signed type
1555 // and non-negative value, and E1 × 2^E2 is representable in the
1556 // corresponding unsigned type of the result type, then that value,
1557 // converted to the result type, is the resulting value; otherwise, the
1558 // behavior is undefined.
1559 //
1560  // Consequences:
1561  // With C++2a, signed and unsigned types have the same UB
1562  // characteristics:
1563  // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1564  //
1565  // With C++17 there's little room for optimization because the standard
1566  // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1567  // a shift of a char or short element must behave as if performed on
1568  // int, and a vector implementation that shifts wider lanes has to mask
1569  // off the bits that would otherwise leak into neighboring elements.
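  // A scalar example of the consequences (illustrative only):
  //   signed char __c = 1;
  //   auto __r = static_cast<signed char>(__c << 10);
  // The shift is performed on the promoted int, is well-defined, and yields
  // 1024; converting back to signed char keeps only the low 8 bits, i.e. 0.
  // This is why _S_bit_shift_left below can return _V() for constant shift
  // counts in [8, 32) on 8-bit elements.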
1569 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1570 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1571 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1572 _S_bit_shift_left(_Tp __xx, int __y)
1573 {
1574 using _V = typename _TVT::type;
1575 using _Up = typename _TVT::value_type;
1576 _V __x = __xx;
1577 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1578 if (__builtin_is_constant_evaluated())
1579 return __x << __y;
1580#if __cplusplus > 201703
1581      // since C++20, signed left shifts have no UB and behave just like
1582      // unsigned left shifts
1583 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
1584 return __vector_bitcast<_Up>(
1585 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1586 __y));
1587#endif
1588 else if constexpr (sizeof(_Up) == 1)
1589 {
1590 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
1591 if (__builtin_constant_p(__y))
1592 {
1593 if (__y == 0)
1594 return __x;
1595 else if (__y == 1)
1596 return __x + __x;
1597 else if (__y == 2)
1598 {
1599 __x = __x + __x;
1600 return __x + __x;
1601 }
1602 else if (__y > 2 && __y < 8)
1603 {
1604 if constexpr (sizeof(__x) > sizeof(unsigned))
1605 {
1606                      const _UChar __mask = 0xff << __y; // broadcast to a precomputed vector constant
1607 return __vector_bitcast<_Up>(
1608 __vector_bitcast<_UChar>(
1609 __vector_bitcast<unsigned>(__x) << __y)
1610 & __mask);
1611 }
1612 else
1613 {
1614 const unsigned __mask
1615 = (0xff & (0xff << __y)) * 0x01010101u;
1616 return reinterpret_cast<_V>(
1617 static_cast<__int_for_sizeof_t<_V>>(
1618 unsigned(
1619 reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
1620 << __y)
1621 & __mask));
1622 }
1623 }
1624 else if (__y >= 8 && __y < 32)
1625 return _V();
1626 else
1627 __builtin_unreachable();
1628 }
1629          // general strategy in the following: use an sllv instead of an sll
1630          // instruction, because it's 2 to 4 times faster:
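          // (Illustrative: the sllv form takes its shift counts in a vector
          // register, e.g.
          //   _mm256_sllv_epi16(__v, _mm256_set1_epi16(__y)),
          // whereas the legacy sll form, e.g.
          //   _mm256_sll_epi16(__v, _mm_cvtsi32_si128(__y)),
          // only accepts a single count in an XMM register and decodes to
          // more uops on current CPUs.)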
1631 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
1632 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
1633 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
1634 _mm256_set1_epi16(__y))));
1635 else if constexpr (__have_avx512bw && sizeof(__x) == 32)
1636 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1637 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
1638 _mm512_set1_epi16(__y))));
1639 else if constexpr (__have_avx512bw && sizeof(__x) == 64)
1640 {
1641 const auto __shift = _mm512_set1_epi16(__y);
1642 return __vector_bitcast<_Up>(
1643 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1644 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
1645 _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1646 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
1647 }
1648 else if constexpr (__have_avx2 && sizeof(__x) == 32)
1649 {
1650#if 1
1651 const auto __shift = _mm_cvtsi32_si128(__y);
1652 auto __k
1653 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
1654 __k |= _mm256_srli_epi16(__k, 8);
1655 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
1656 & __k);
1657#else
1658 const _Up __k = 0xff << __y;
1659 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
1660 & __k;
1661#endif
1662 }
1663 else
1664 {
1665 const auto __shift = _mm_cvtsi32_si128(__y);
1666 auto __k
1667 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
1668 __k |= _mm_srli_epi16(__k, 8);
1669 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
1670 }
1671 }
1672 return __x << __y;
1673 }
1674
1675 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1676 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1677 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1678 {
1679 using _V = typename _TVT::type;
1680 using _Up = typename _TVT::value_type;
1681 _V __x = __xx;
1682 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1683 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1684 if (__builtin_is_constant_evaluated())
1685 return __x << __y;
1686#if __cplusplus > 201703
1687      // since C++20, signed left shifts have no UB and behave just like
1688      // unsigned left shifts
1689 else if constexpr (is_signed_v<_Up>)
1690 return __vector_bitcast<_Up>(
1691 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1692 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1693#endif
1694 else if constexpr (sizeof(_Up) == 1)
1695 {
1696 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1697 return __vector_bitcast<_Up>(__concat(
1698 _mm512_cvtepi16_epi8(
1699 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1700 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1701 _mm512_cvtepi16_epi8(
1702 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1703 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1704 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1705 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1706 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1707 _mm512_cvtepu8_epi16(__iy))));
1708 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1709 return __intrin_bitcast<_V>(
1710 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1711 _mm_cvtepu8_epi16(__iy))));
1712 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1713 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1714 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1715 _mm256_cvtepu8_epi16(__iy))));
1716 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1717 return __intrin_bitcast<_V>(
1718 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1719 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1720 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1721 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1722 {
1723 auto __mask
1724 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1725 auto __x4
1726 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1727 __x4 &= char(0xf0);
1728 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1729 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1730 __mask += __mask;
1731 auto __x2
1732 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1733 __x2 &= char(0xfc);
1734 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1735 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1736 __mask += __mask;
1737 auto __x1 = __x + __x;
1738 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1739 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1740 return __x
1741 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1742 }
1743 else if constexpr (sizeof(__x) == 16)
1744 {
1745 auto __mask
1746 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1747 auto __x4
1748 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1749 __x4 &= char(0xf0);
1750 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1751 __mask += __mask;
1752 auto __x2
1753 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1754 __x2 &= char(0xfc);
1755 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1756 __mask += __mask;
1757 auto __x1 = __x + __x;
1758 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1759 return __x
1760 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1761 }
1762 else
1763 return __x << __y;
1764 }
1765 else if constexpr (sizeof(_Up) == 2)
1766 {
1767 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1768 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1769 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1770 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1771 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1772 return __vector_bitcast<_Up>(
1773 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1774 _mm512_castsi256_si512(__iy))));
1775 else if constexpr (sizeof __ix == 32 && __have_avx2)
1776 {
1777 const auto __ux = __vector_bitcast<unsigned>(__x);
1778 const auto __uy = __vector_bitcast<unsigned>(__y);
1779 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1780 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1781 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1782 }
1783 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1784 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1785 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1786 return __intrin_bitcast<_V>(
1787 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1788 _mm512_castsi128_si512(__iy))));
1789 else if constexpr (sizeof __ix == 16 && __have_avx2)
1790 {
1791 const auto __ux = __vector_bitcast<unsigned>(__ix);
1792 const auto __uy = __vector_bitcast<unsigned>(__iy);
1793 return __intrin_bitcast<_V>(_mm_blend_epi16(
1794 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1795 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1796 }
1797 else if constexpr (sizeof __ix == 16)
1798 {
1799 using _Float4 = __vector_type_t<float, 4>;
1800 using _Int4 = __vector_type_t<int, 4>;
1801 using _UInt4 = __vector_type_t<unsigned, 4>;
1802 const _UInt4 __yu
1803 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1804 return __x
1805 * __intrin_bitcast<_V>(
1806 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1807 reinterpret_cast<_Float4>(__yu << 23)))
1808 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1809 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1810 << 16));
1811 }
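              // How the magic constants above work (illustrative): 0x3f8 >> 3
              // is 127, the IEEE 754 binary32 exponent bias. For a shift count
              // 0 <= __y < 16, ((__y + 127) << 23) is the bit pattern of the
              // float 2^__y; truncating that float back to int and multiplying
              // therefore implements the left shift, e.g. __y == 3 yields the
              // factor 8.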
1812 else
1813 __assert_unreachable<_Tp>();
1814 }
1815 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1816 && !__have_avx2)
1817        // latency is suboptimal, but throughput is at the full vector speedup
1818 return __intrin_bitcast<_V>(
1819 __vector_bitcast<unsigned>(__ix)
1820 * __vector_convert<__vector_type16_t<int>>(
1821 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1822 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1823 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1824 && !__have_avx2)
1825 {
1826 const auto __lo = _mm_sll_epi64(__ix, __iy);
1827 const auto __hi
1828 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1829 if constexpr (__have_sse4_1)
1830 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1831 else
1832 return __vector_bitcast<_Up>(
1833 _mm_move_sd(__vector_bitcast<double>(__hi),
1834 __vector_bitcast<double>(__lo)));
1835 }
1836 else
1837 return __x << __y;
1838 }
1839#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1840
1841 // }}}
1842 // _S_bit_shift_right {{{
1843#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
1844 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1845 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1846 _S_bit_shift_right(_Tp __xx, int __y)
1847 {
1848 using _V = typename _TVT::type;
1849 using _Up = typename _TVT::value_type;
1850 _V __x = __xx;
1851 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1852 if (__builtin_is_constant_evaluated())
1853 return __x >> __y;
1854        else if (__builtin_constant_p(__y)
1855                 && is_unsigned_v<_Up>
1856                 && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1857 return _V();
1858 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1859 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1860 & _Up(0xff >> __y);
1861 //}}}
1862 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1863 return __intrin_bitcast<_V>(
1864 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1865 >> (__y + 8))
1866 << 8)
1867 | (__vector_bitcast<_UShort>(
1868 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1869 >> __y)
1870 >> 8));
1871 //}}}
1872 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
1873 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1874 {
1875 if (__y > 32)
1876 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1877 & _Up(0xffff'ffff'0000'0000ull))
1878 | __vector_bitcast<_Up>(
1879 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1880 >> 32)
1881 >> (__y - 32));
1882 else
1883 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1884 >> __y)
1885 | __vector_bitcast<_Up>(
1886 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1887 >> __y);
1888 }
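          // (Sketch of the idea above: x86 has no arithmetic 64-bit right
          // shift before AVX512. The element is therefore split: a 32-bit
          // arithmetic shift reproduces the sign extension of the high half
          // and a 64-bit logical shift supplies the value bits. E.g.
          // int64_t(-2) >> 1 must yield -1, which a logical shift alone
          // would turn into 0x7fff'ffff'ffff'ffff.)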
1889 //}}}
1890 else
1891 return __x >> __y;
1892 }
1893
1894 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1895 constexpr inline _GLIBCXX_CONST static typename _TVT::type
1896 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1897 {
1898 using _V = typename _TVT::type;
1899 using _Up = typename _TVT::value_type;
1900 _V __x = __xx;
1901 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1902 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1903 if (__builtin_is_constant_evaluated()
1904 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1905 return __x >> __y;
1906 else if constexpr (sizeof(_Up) == 1) //{{{
1907 {
1908 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1909 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1910 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1911 _mm_cvtepi8_epi16(__iy))
1912 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1913 _mm_cvtepu8_epi16(__iy))));
1914            else if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1915 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1916 is_signed_v<_Up>
1917 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1918 _mm256_cvtepi8_epi16(__iy))
1919 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1920 _mm256_cvtepu8_epi16(__iy))));
1921 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1922 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1923 is_signed_v<_Up>
1924 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1925 _mm512_cvtepi8_epi16(__iy))
1926 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1927 _mm512_cvtepu8_epi16(__iy))));
1928 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1929 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1930 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1931 0x5555'5555'5555'5555ull,
1932 _mm512_srav_epi16(
1933 _mm512_slli_epi16(__ix, 8),
1934 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1935 _mm512_set1_epi16(8)))));
1936 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1937 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1938 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1939 0x5555'5555'5555'5555ull,
1940 _mm512_srlv_epi16(
1941 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1942 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1943 /* This has better throughput but higher latency than the impl below
1944 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1945 is_unsigned_v<_Up>)
1946 {
1947 const auto __shorts = __to_intrin(_S_bit_shift_right(
1948 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1949 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1950 return __vector_bitcast<_Up>(
1951 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1952 }
1953 */
1954 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1955 // the following uses vpsr[al]vd, which requires AVX2
1956 if constexpr (is_signed_v<_Up>)
1957 {
1958 const auto r3 = __vector_bitcast<_UInt>(
1959 (__vector_bitcast<int>(__x)
1960 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1961 & 0xff000000u;
1962 const auto r2
1963 = __vector_bitcast<_UInt>(
1964 ((__vector_bitcast<int>(__x) << 8)
1965 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1966 & 0xff000000u;
1967 const auto r1
1968 = __vector_bitcast<_UInt>(
1969 ((__vector_bitcast<int>(__x) << 16)
1970 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1971 & 0xff000000u;
1972 const auto r0 = __vector_bitcast<_UInt>(
1973 (__vector_bitcast<int>(__x) << 24)
1974 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1975 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1976 | (r0 >> 24));
1977 }
1978 else
1979 {
1980 const auto r3 = (__vector_bitcast<_UInt>(__x)
1981 >> (__vector_bitcast<_UInt>(__y) >> 24))
1982 & 0xff000000u;
1983 const auto r2
1984 = ((__vector_bitcast<_UInt>(__x) << 8)
1985 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1986 & 0xff000000u;
1987 const auto r1
1988 = ((__vector_bitcast<_UInt>(__x) << 16)
1989 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1990 & 0xff000000u;
1991 const auto r0
1992 = (__vector_bitcast<_UInt>(__x) << 24)
1993 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1994 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1995 | (r0 >> 24));
1996 }
1997 else if constexpr (__have_sse4_1
1998 && is_unsigned_v<_Up> && sizeof(__x) > 2)
1999 {
2000 auto __x128 = __vector_bitcast<_Up>(__ix);
2001 auto __mask
2002 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
2003 auto __x4 = __vector_bitcast<_Up>(
2004 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
2005 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2006 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
2007 __mask += __mask;
2008 auto __x2 = __vector_bitcast<_Up>(
2009 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
2010 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2011 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
2012 __mask += __mask;
2013 auto __x1 = __vector_bitcast<_Up>(
2014 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
2015 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
2016 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
2017 return __intrin_bitcast<_V>(
2018 __x128
2019 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2020 == 0)); // y > 7 nulls the result
2021 }
2022 else if constexpr (__have_sse4_1
2023 && is_signed_v<_Up> && sizeof(__x) > 2)
2024 {
2025 auto __mask = __vector_bitcast<_UChar>(
2026 __vector_bitcast<_UShort>(__iy) << 5);
2027 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2028 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
2029 };
2030 auto __xh = __vector_bitcast<short>(__ix);
2031 auto __xl = __vector_bitcast<short>(__ix) << 8;
2032 auto __xh4 = __xh >> 4;
2033 auto __xl4 = __xl >> 4;
2034 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2035 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
2036 __xl = __vector_bitcast<short>(
2037 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2038 __to_intrin(__xl4)));
2039 __mask += __mask;
2040 auto __xh2 = __xh >> 2;
2041 auto __xl2 = __xl >> 2;
2042 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2043 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2044 __xl = __vector_bitcast<short>(
2045 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2046 __to_intrin(__xl2)));
2047 __mask += __mask;
2048 auto __xh1 = __xh >> 1;
2049 auto __xl1 = __xl >> 1;
2050 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2051 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2052 __xl = __vector_bitcast<short>(
2053 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2054 __to_intrin(__xl1)));
2055 return __intrin_bitcast<_V>(
2056 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2057 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2058 >> 8))
2059 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2060 == 0)); // y > 7 nulls the result
2061 }
2062 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2063 {
2064 auto __mask
2065 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2066 auto __x4 = __vector_bitcast<_Up>(
2067 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2068 __x = __mask > 0x7f ? __x4 : __x;
2069 __mask += __mask;
2070 auto __x2 = __vector_bitcast<_Up>(
2071 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2072 __x = __mask > 0x7f ? __x2 : __x;
2073 __mask += __mask;
2074 auto __x1 = __vector_bitcast<_Up>(
2075 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2076 __x = __mask > 0x7f ? __x1 : __x;
2077 return __x
2078 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2079 }
2080 else if constexpr (sizeof(__x) > 2) // signed SSE2
2081 {
2082 static_assert(is_signed_v<_Up>);
2083 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2084 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2085 auto __xh = __vector_bitcast<short>(__x);
2086 auto __xl = __vector_bitcast<short>(__x) << 8;
2087 auto __xh4 = __xh >> 4;
2088 auto __xl4 = __xl >> 4;
2089 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2090 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2091 __maskh += __maskh;
2092 __maskl += __maskl;
2093 auto __xh2 = __xh >> 2;
2094 auto __xl2 = __xl >> 2;
2095 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2096 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2097 __maskh += __maskh;
2098 __maskl += __maskl;
2099 auto __xh1 = __xh >> 1;
2100 auto __xl1 = __xl >> 1;
2101 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2102 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2103 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2104 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2105 >> 8);
2106 return __x
2107 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2108 }
2109 else
2110 return __x >> __y;
2111 } //}}}
2112 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2113 {
2114 [[maybe_unused]] auto __blend_0xaa
2115 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2116 if constexpr (sizeof(__a) == 16)
2117 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2118 0xaa);
2119 else if constexpr (sizeof(__a) == 32)
2120 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2121 0xaa);
2122 else if constexpr (sizeof(__a) == 64)
2123 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2124 __to_intrin(__b));
2125 else
2126 __assert_unreachable<decltype(__a)>();
2127 };
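              // The 0xaa blend mask is 0b1010'1010: set bits select the odd
              // 16-bit lanes from the second argument, clear bits keep the
              // even lanes of the first, so e.g. (names illustrative)
              //   __blend_0xaa(__even_lane_results, __odd_lane_results)
              // merges two half-computed shift results into one vector.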
2128 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2129 return __intrin_bitcast<_V>(is_signed_v<_Up>
2130 ? _mm_srav_epi16(__ix, __iy)
2131 : _mm_srlv_epi16(__ix, __iy));
2132 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2133 return __vector_bitcast<_Up>(is_signed_v<_Up>
2134 ? _mm256_srav_epi16(__ix, __iy)
2135 : _mm256_srlv_epi16(__ix, __iy));
2136 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2137 return __vector_bitcast<_Up>(is_signed_v<_Up>
2138 ? _mm512_srav_epi16(__ix, __iy)
2139 : _mm512_srlv_epi16(__ix, __iy));
2140 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2141 return __intrin_bitcast<_V>(
2142 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2143 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2144 >> 16,
2145 __vector_bitcast<int>(__ix)
2146 >> (__vector_bitcast<int>(__iy) >> 16)));
2147 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2148 return __intrin_bitcast<_V>(
2149 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2150 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2151 __vector_bitcast<_UInt>(__ix)
2152 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
2153 else if constexpr (__have_sse4_1)
2154 {
2155 auto __mask = __vector_bitcast<_UShort>(__iy);
2156 auto __x128 = __vector_bitcast<_Up>(__ix);
2157              // equivalent to: __mask *= 0x0808;
2158 __mask = (__mask << 3) | (__mask << 11);
2159 // do __x128 = 0 where __y[4] is set
2160 __x128 = __vector_bitcast<_Up>(
2161 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2162 __to_intrin(__mask)));
2163              // do __x128 >>= 8 where __y[3] is set
2164 __x128 = __vector_bitcast<_Up>(
2165 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2166 __to_intrin(__mask += __mask)));
2167              // do __x128 >>= 4 where __y[2] is set
2168 __x128 = __vector_bitcast<_Up>(
2169 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2170 __to_intrin(__mask += __mask)));
2171              // do __x128 >>= 2 where __y[1] is set
2172 __x128 = __vector_bitcast<_Up>(
2173 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2174 __to_intrin(__mask += __mask)));
2175              // do __x128 >>= 1 where __y[0] is set
2176 return __intrin_bitcast<_V>(
2177 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2178 __to_intrin(__mask + __mask)));
2179 }
2180 else
2181 {
2182 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2183 auto __x128 = __vector_bitcast<_Up>(__ix);
2184 auto __mask
2185 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2186 return __vector_bitcast<short>(__kk) < 0;
2187 };
2188 // do __x128 = 0 where __y[4] is set
2189 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2190              // do __x128 >>= 8 where __y[3] is set
2191 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2192              // do __x128 >>= 4 where __y[2] is set
2193 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2194              // do __x128 >>= 2 where __y[1] is set
2195 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2196              // do __x128 >>= 1 where __y[0] is set
2197 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2198 : __x128);
2199 }
2200 } //}}}
2201 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2202 {
2203 if constexpr (is_unsigned_v<_Up>)
2204 {
2205 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
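                  // Worked example (illustrative): for __y == 3 every lane of
                  // __factor holds 2^(31-3) = 0x1000'0000. _mm_mul_epu32 forms
                  // the widening 64-bit product x * 2^28, and shifting that
                  // product right by 31 leaves exactly x >> 3, without any
                  // per-lane variable-shift instruction.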
2206 const __m128 __factor_f = reinterpret_cast<__m128>(
2207 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2208 const __m128i __factor
2209 = __builtin_constant_p(__factor_f)
2210 ? __to_intrin(
2211 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2212 __factor_f[2], __factor_f[3]))
2213 : _mm_cvttps_epi32(__factor_f);
2214 const auto __r02
2215 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2216 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2217 _mm_srli_si128(__factor, 4));
2218 if constexpr (__have_sse4_1)
2219 return __intrin_bitcast<_V>(
2220 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2221 else
2222 return __intrin_bitcast<_V>(
2223 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2224 }
2225 else
2226 {
2227 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2228 if constexpr (is_signed_v<_Up>)
2229 return _mm_sra_epi32(__a, __b);
2230 else
2231 return _mm_srl_epi32(__a, __b);
2232 };
2233 const auto __r0
2234 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2235 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2236 const auto __r2
2237 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2238 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2239 if constexpr (__have_sse4_1)
2240 return __intrin_bitcast<_V>(
2241 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2242 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2243 else
2244 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2245 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2246 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2247 }
2248 } //}}}
2249 else
2250 return __x >> __y;
2251 }
2252#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2253
2254 // }}}
2255 // compares {{{
2256 // _S_equal_to {{{
2257 template <typename _Tp, size_t _Np>
2258 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2259 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2260 {
2261 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2262 {
2263 if (__builtin_is_constant_evaluated()
2264 || (__x._M_is_constprop() && __y._M_is_constprop()))
2265 return _MaskImpl::_S_to_bits(
2266 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2267
2268 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2269 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2270 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2271 if constexpr (is_floating_point_v<_Tp>)
2272 {
2273 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2274 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2275 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2276 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2278 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2280 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2281 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2282 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2284 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2285 else
2286 __assert_unreachable<_Tp>();
2287 }
2288 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2289 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2290 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2291 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2292 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2293 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2294 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2295 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2296 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2297 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2298 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2299 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2300 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2301 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2302 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2303 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2304 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2305 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2306 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2307 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2308 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2309 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2310 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2311 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2312 else
2313 __assert_unreachable<_Tp>();
2314 } // }}}
2315 else if (__builtin_is_constant_evaluated())
2316 return _Base::_S_equal_to(__x, __y);
2317 else if constexpr (sizeof(__x) == 8)
2318 {
2319 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2320 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2321 _MaskMember<_Tp> __r64{};
2322 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2323 return __r64;
2324 }
2325 else
2326 return _Base::_S_equal_to(__x, __y);
2327 }
2328
2329 // }}}
2330 // _S_not_equal_to {{{
2331 template <typename _Tp, size_t _Np>
2332 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2333 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2334 {
2335 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2336 {
2337 if (__builtin_is_constant_evaluated()
2338 || (__x._M_is_constprop() && __y._M_is_constprop()))
2339 return _MaskImpl::_S_to_bits(
2340 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2341
2342 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2343 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2344 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2345 if constexpr (is_floating_point_v<_Tp>)
2346 {
2347 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2348 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2349 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2350 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2352 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2353 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2354 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2355 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2356 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2357 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2358 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2359 else
2360 __assert_unreachable<_Tp>();
2361 }
2362 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2363 return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2364 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2365 return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2366 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2367 return _mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2368 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2369 return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2370 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2371 return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2372 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2373 return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2374 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2375 return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2376 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2377 return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2378 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2379 return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi);
2380 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2381 return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi);
2382 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2383 return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi);
2384 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2385 return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi);
2386 else
2387 __assert_unreachable<_Tp>();
2388 } // }}}
2389 else if (__builtin_is_constant_evaluated())
2390 return _Base::_S_not_equal_to(__x, __y);
2391 else if constexpr (sizeof(__x) == 8)
2392 {
2393 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2394 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2395 _MaskMember<_Tp> __r64{};
2396 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2397 return __r64;
2398 }
2399 else
2400 return _Base::_S_not_equal_to(__x, __y);
2401 }
2402
2403 // }}}
2404 // _S_less {{{
2405 template <typename _Tp, size_t _Np>
2406 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2407 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2408 {
2409 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2410 {
2411 if (__builtin_is_constant_evaluated()
2412 || (__x._M_is_constprop() && __y._M_is_constprop()))
2413 return _MaskImpl::_S_to_bits(
2414 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2415
2416 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2417 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2418 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2419 if constexpr (sizeof(__xi) == 64)
2420 {
2421              if constexpr (__is_x86_ps<_Tp>())
2422                return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2423              else if constexpr (__is_x86_pd<_Tp>())
2424 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2425 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2426 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2427 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2428 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2429 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2430 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2431 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2432 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2433 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2434 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2435 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2436 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2437 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2438 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2439 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2440 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2441 else
2442 __assert_unreachable<_Tp>();
2443 }
2444 else if constexpr (sizeof(__xi) == 32)
2445 {
2446              if constexpr (__is_x86_ps<_Tp>())
2447                return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2448              else if constexpr (__is_x86_pd<_Tp>())
2449 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2450 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2451 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2452 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2453 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2454 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2455 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2456 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2457 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2458 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2459 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2460 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2461 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2462 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2463 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2464 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2465 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2466 else
2467 __assert_unreachable<_Tp>();
2468 }
2469 else if constexpr (sizeof(__xi) == 16)
2470 {
2471              if constexpr (__is_x86_ps<_Tp>())
2472                return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2473              else if constexpr (__is_x86_pd<_Tp>())
2474 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2475 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2476 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2477 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2478 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2479 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2480 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2481 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2482 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2483 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2484 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2485 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2486 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2487 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2488 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2489 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2490 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2491 else
2492 __assert_unreachable<_Tp>();
2493 }
2494 else
2495 __assert_unreachable<_Tp>();
2496 } // }}}
2497 else if (__builtin_is_constant_evaluated())
2498 return _Base::_S_less(__x, __y);
2499 else if constexpr (sizeof(__x) == 8)
2500 {
2501 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2502 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2503 _MaskMember<_Tp> __r64{};
2504 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2505 return __r64;
2506 }
2507 else
2508 return _Base::_S_less(__x, __y);
2509 }
2510
2511 // }}}
2512 // _S_less_equal {{{
2513 template <typename _Tp, size_t _Np>
2514 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2515 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2516 {
2517 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2518 {
2519 if (__builtin_is_constant_evaluated()
2520 || (__x._M_is_constprop() && __y._M_is_constprop()))
2521 return _MaskImpl::_S_to_bits(
2522 __as_wrapper<_Np>(__x._M_data <= __y._M_data));
2523
2524 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2525 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2526 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2527 if constexpr (sizeof(__xi) == 64)
2528 {
2529              if constexpr (__is_x86_ps<_Tp>())
2530                return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2531              else if constexpr (__is_x86_pd<_Tp>())
2532 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2533 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2534 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
2535 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2536 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2538 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2540 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
2541 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2542 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
2543 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2544 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
2545 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2546 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2548 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
2549 else
2550 __assert_unreachable<_Tp>();
2551 }
2552 else if constexpr (sizeof(__xi) == 32)
2553 {
2554              if constexpr (__is_x86_ps<_Tp>())
2555                return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2556              else if constexpr (__is_x86_pd<_Tp>())
2557 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2558 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2559 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
2560 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2561 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
2562 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2563 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
2564 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2565 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
2566 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2567 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
2568 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2569 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
2570 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2571 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
2572 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2573 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
2574 else
2575 __assert_unreachable<_Tp>();
2576 }
2577 else if constexpr (sizeof(__xi) == 16)
2578 {
2579              if constexpr (__is_x86_ps<_Tp>())
2580                return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
2581              else if constexpr (__is_x86_pd<_Tp>())
2582 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
2583 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2584 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
2585 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2586 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
2587 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2588 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
2589 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2590 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
2591 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2592 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
2593 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2594 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
2595 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2596 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
2597 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2598 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
2599 else
2600 __assert_unreachable<_Tp>();
2601 }
2602 else
2603 __assert_unreachable<_Tp>();
2604 } // }}}
2605 else if (__builtin_is_constant_evaluated())
2606 return _Base::_S_less_equal(__x, __y);
2607 else if constexpr (sizeof(__x) == 8)
2608 {
2609 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2610 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2611 _MaskMember<_Tp> __r64{};
2612 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2613 return __r64;
2614 }
2615 else
2616 return _Base::_S_less_equal(__x, __y);
2617 }
2618
2619 // }}} }}}
2620 // negation {{{
2621 template <typename _Tp, size_t _Np>
2622 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2623 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
2624 {
2625 if constexpr (__is_avx512_abi<_Abi>())
2626 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
2627 else
2628 return _Base::_S_negate(__x);
2629 }
2630
2631 // }}}
2632 // math {{{
2633 using _Base::_S_abs;
2634
2635 // _S_sqrt {{{
2636 template <typename _Tp, size_t _Np>
2637 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2638 _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
2639 {
2640 if constexpr (__is_sse_ps<_Tp, _Np>())
2641 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
2642 else if constexpr (__is_sse_pd<_Tp, _Np>())
2643 return _mm_sqrt_pd(__x);
2644 else if constexpr (__is_avx_ps<_Tp, _Np>())
2645 return _mm256_sqrt_ps(__x);
2646 else if constexpr (__is_avx_pd<_Tp, _Np>())
2647 return _mm256_sqrt_pd(__x);
2648 else if constexpr (__is_avx512_ps<_Tp, _Np>())
2649 return _mm512_sqrt_ps(__x);
2650 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2651 return _mm512_sqrt_pd(__x);
2652 else
2653 __assert_unreachable<_Tp>();
2654 }
2655
2656 // }}}
2657 // _S_ldexp {{{
2658 template <typename _Tp, size_t _Np>
2659 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2660 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2661 __fixed_size_storage_t<int, _Np> __exp)
2662 {
2663 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2664 {
2665 const auto __xi = __to_intrin(__x);
2666 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2667 __cvt;
2668 const auto __expi = __to_intrin(__cvt(__exp));
2669 using _Up = __bool_storage_member_type_t<_Np>;
2670 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
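          // vscalefp[sd] computes __x * 2^floor(__exp) per element, which is
          // exactly ldexp for the integral exponents used here; e.g.
          // scalef(3.0, 4.0) == 48.0. The zero-masking with __k1 merely
          // clears the padding elements of partial simds.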
2671 if constexpr (sizeof(__xi) == 16)
2672 {
2673 if constexpr (sizeof(_Tp) == 8)
2674 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2675 else
2676 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2677 }
2678 else if constexpr (sizeof(__xi) == 32)
2679 {
2680 if constexpr (sizeof(_Tp) == 8)
2681 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2682 else
2683 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2684 }
2685 else
2686 {
2687 static_assert(sizeof(__xi) == 64);
2688 if constexpr (sizeof(_Tp) == 8)
2689 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2690 else
2691 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2692 }
2693 }
2694 else
2695 return _Base::_S_ldexp(__x, __exp);
2696 }
2697
2698 // }}}
2699 // _S_trunc {{{
2700 template <typename _Tp, size_t _Np>
2701 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2702 _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2703 {
2704 if constexpr (__is_avx512_ps<_Tp, _Np>())
2705 return _mm512_roundscale_ps(__x, 0x0b);
2706 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2707 return _mm512_roundscale_pd(__x, 0x0b);
2708 else if constexpr (__is_avx_ps<_Tp, _Np>())
2709 return _mm256_round_ps(__x, 0xb);
2710 else if constexpr (__is_avx_pd<_Tp, _Np>())
2711 return _mm256_round_pd(__x, 0xb);
2712 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2713 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
2714 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2715 return _mm_round_pd(__x, 0xb);
2716 else if constexpr (__is_sse_ps<_Tp, _Np>())
2717 {
2718 auto __truncated
2719 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
2720 const auto __no_fractional_values
2721 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
2722 & 0x7f800000u)
2723 < 0x4b000000; // the exponent is so large that no mantissa bits
2724 // signify fractional values (0x3f8 + 23*8 =
2725 // 0x4b0)
2726 return __no_fractional_values ? __truncated : __to_intrin(__x);
2727 }
2728 else
2729 return _Base::_S_trunc(__x);
2730 }
2731
2732 // }}}
2733 // _S_round {{{
2734 template <typename _Tp, size_t _Np>
2735 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2736 _S_round(_SimdWrapper<_Tp, _Np> __x)
2737 {
2738 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
2739 // from zero as required by std::round. Therefore this function is more
2740 // complicated.
2741 using _V = __vector_type_t<_Tp, _Np>;
2742 _V __truncated;
2743 if constexpr (__is_avx512_ps<_Tp, _Np>())
2744 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
2745 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2746 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
2747 else if constexpr (__is_avx_ps<_Tp, _Np>())
2748 __truncated = _mm256_round_ps(__x._M_data,
2749 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2750 else if constexpr (__is_avx_pd<_Tp, _Np>())
2751 __truncated = _mm256_round_pd(__x._M_data,
2752 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2753 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2754 __truncated = __auto_bitcast(
2755 _mm_round_ps(__to_intrin(__x),
2756 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
2757 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2758 __truncated
2759 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2760 else if constexpr (__is_sse_ps<_Tp, _Np>())
2761 __truncated = __auto_bitcast(
2762 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
2763 else
2764 return _Base::_S_round(__x);
2765
2766 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
2767 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
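      // Example of the adjustment (illustrative): for __x == 2.5f the
      // truncation gives 2.0f and the remainder magnitude is 0.5f, so
      // copysign(1, __x) is added and the result is 3.0f, whereas
      // _MM_FROUND_TO_NEAREST_INT would have produced the even 2.0f.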
2768
2769 const _V __rounded
2770 = __truncated
2771 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
2772 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
2773 : _V());
2774 if constexpr (__have_sse4_1)
2775 return __rounded;
2776 else // adjust for missing range in cvttps_epi32
2777 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
2778 : __x._M_data;
2779 }
2780
2781 // }}}
2782 // _S_nearbyint {{{
2783 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2784 _GLIBCXX_SIMD_INTRINSIC static _Tp
2785 _S_nearbyint(_Tp __x) noexcept
2786 {
2787 if constexpr (_TVT::template _S_is<float, 16>)
2788 return _mm512_roundscale_ps(__x, 0x0c);
2789 else if constexpr (_TVT::template _S_is<double, 8>)
2790 return _mm512_roundscale_pd(__x, 0x0c);
2791 else if constexpr (_TVT::template _S_is<float, 8>)
2792 return _mm256_round_ps(__x,
2793 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2794 else if constexpr (_TVT::template _S_is<double, 4>)
2795 return _mm256_round_pd(__x,
2796 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2797 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2798 return _mm_round_ps(__x,
2799 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2800 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2801 return _mm_round_pd(__x,
2802 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
2803 else
2804 return _Base::_S_nearbyint(__x);
2805 }
2806
2807 // }}}
2808 // _S_rint {{{
2809 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2810 _GLIBCXX_SIMD_INTRINSIC static _Tp
2811 _S_rint(_Tp __x) noexcept
2812 {
2813 if constexpr (_TVT::template _S_is<float, 16>)
2814 return _mm512_roundscale_ps(__x, 0x04);
2815 else if constexpr (_TVT::template _S_is<double, 8>)
2816 return _mm512_roundscale_pd(__x, 0x04);
2817 else if constexpr (_TVT::template _S_is<float, 8>)
2818 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2819 else if constexpr (_TVT::template _S_is<double, 4>)
2820 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2821 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
2822 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
2823 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
2824 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
2825 else
2826 return _Base::_S_rint(__x);
2827 }
2828
2829 // }}}
2830 // _S_floor {{{
2831 template <typename _Tp, size_t _Np>
2832 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2833 _S_floor(_SimdWrapper<_Tp, _Np> __x)
2834 {
2835 if constexpr (__is_avx512_ps<_Tp, _Np>())
2836 return _mm512_roundscale_ps(__x, 0x09);
2837 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2838 return _mm512_roundscale_pd(__x, 0x09);
2839 else if constexpr (__is_avx_ps<_Tp, _Np>())
2840 return _mm256_round_ps(__x, 0x9);
2841 else if constexpr (__is_avx_pd<_Tp, _Np>())
2842 return _mm256_round_pd(__x, 0x9);
2843 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2844 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
2845 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2846 return _mm_round_pd(__x, 0x9);
2847 else
2848 return _Base::_S_floor(__x);
2849 }
2850
2851 // }}}
2852 // _S_ceil {{{
2853 template <typename _Tp, size_t _Np>
2854 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2855 _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2856 {
2857 if constexpr (__is_avx512_ps<_Tp, _Np>())
2858 return _mm512_roundscale_ps(__x, 0x0a);
2859 else if constexpr (__is_avx512_pd<_Tp, _Np>())
2860 return _mm512_roundscale_pd(__x, 0x0a);
2861 else if constexpr (__is_avx_ps<_Tp, _Np>())
2862 return _mm256_round_ps(__x, 0xa);
2863 else if constexpr (__is_avx_pd<_Tp, _Np>())
2864 return _mm256_round_pd(__x, 0xa);
2865 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
2866 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
2867 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
2868 return _mm_round_pd(__x, 0xa);
2869 else
2870 return _Base::_S_ceil(__x);
2871 }
2872
2873 // }}}
2874 // _S_signbit {{{
2875 template <typename _Tp, size_t _Np>
2876 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2877 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2878 {
2879 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
2880 {
2881 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
2882 return _mm512_movepi32_mask(
2883 __intrin_bitcast<__m512i>(__x._M_data));
2884 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
2885 return _mm512_movepi64_mask(
2886 __intrin_bitcast<__m512i>(__x._M_data));
2887 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
2888 return _mm256_movepi32_mask(
2889 __intrin_bitcast<__m256i>(__x._M_data));
2890 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
2891 return _mm256_movepi64_mask(
2892 __intrin_bitcast<__m256i>(__x._M_data));
2893 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
2894 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
2895 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
2896 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
2897 }
2898 else if constexpr (__is_avx512_abi<_Abi>())
2899 {
2900 const auto __xi = __to_intrin(__x);
2901 [[maybe_unused]] constexpr auto __k1
2902 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2903 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2904 return _mm_movemask_ps(__xi);
2905 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2906 return _mm_movemask_pd(__xi);
2907 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2908 return _mm256_movemask_ps(__xi);
2909 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2910 return _mm256_movemask_pd(__xi);
2911 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2912 return _mm512_mask_cmplt_epi32_mask(
2913 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2914 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2915 return _mm512_mask_cmplt_epi64_mask(
2916 __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
2917 else
2918 __assert_unreachable<_Tp>();
2919 }
2920 else
2921 return _Base::_S_signbit(__x);
2922 /*{
2923 using _I = __int_for_sizeof_t<_Tp>;
2924 if constexpr (sizeof(__x) == 64)
2925 return _S_less(__vector_bitcast<_I>(__x), _I());
2926 else
2927 {
2928 const auto __xx = __vector_bitcast<_I>(__x._M_data);
2929 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
2930 if constexpr ((sizeof(_Tp) == 4 &&
2931 (__have_avx2 || sizeof(__x) == 16)) ||
2932 __have_avx512vl)
2933 {
2934 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
2935 }
2936 else if constexpr ((__have_avx2 ||
2937 (__have_ssse3 && sizeof(__x) == 16)))
2938 {
2939 return __vector_bitcast<_Tp>((__xx & __signmask) ==
2940 __signmask);
2941 }
2942 else
2943 { // SSE2/3 or AVX (w/o AVX2)
2944 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
2945 return __vector_bitcast<_Tp>(
2946 __vector_bitcast<_Tp>(
2947 (__xx & __signmask) |
2948 __vector_bitcast<_I>(__one)) // -1 or 1
2949 != __one);
2950 }
2951 }
2952 }*/
2953 }
2954
2955 // }}}
2956 // _S_isnonzerovalue_mask {{{
2957  // (isnormal || issubnormal) == (!isinf && !isnan && !iszero)
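  // The non-AVX512DQ fallback below relies on IEEE 754 special-case
  // arithmetic (a sketch of the reasoning, not extra checks):
  //   __x * inf is NaN iff __x is zero (or NaN),
  //   __x * 0   is NaN iff __x is infinite (or NaN),
  // so the ordered compare _CMP_ORD_Q ("neither operand is NaN") is true
  // exactly for finite, nonzero __x.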
2958 template <typename _Tp>
2959 _GLIBCXX_SIMD_INTRINSIC static auto
2960 _S_isnonzerovalue_mask(_Tp __x)
2961 {
2962 using _Traits = _VectorTraits<_Tp>;
2963 if constexpr (__have_avx512dq_vl)
2964 {
2965 if constexpr (_Traits::template _S_is<
2966 float, 2> || _Traits::template _S_is<float, 4>)
2967 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
2968 else if constexpr (_Traits::template _S_is<float, 8>)
2969 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
2970 else if constexpr (_Traits::template _S_is<float, 16>)
2971 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
2972 else if constexpr (_Traits::template _S_is<double, 2>)
2973 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
2974 else if constexpr (_Traits::template _S_is<double, 4>)
2975 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
2976 else if constexpr (_Traits::template _S_is<double, 8>)
2977 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
2978 else
2979 __assert_unreachable<_Tp>();
2980 }
2981 else
2982 {
2983 using _Up = typename _Traits::value_type;
2984 constexpr size_t _Np = _Traits::_S_full_size;
2985 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
2986 const auto __b = __x * _Up(); // NaN if __x == inf
2987 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
2988 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
2989 _CMP_ORD_Q);
2990 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
2991 return __mmask8(0xf
2992 & _mm512_cmp_ps_mask(__auto_bitcast(__a),
2993 __auto_bitcast(__b),
2994 _CMP_ORD_Q));
2995 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
2996 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
2997 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
2998 return __mmask8(0x3
2999 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3000 __auto_bitcast(__b),
3001 _CMP_ORD_Q));
3002 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
3003 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3004 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
3005 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
3006 __auto_bitcast(__b),
3007 _CMP_ORD_Q));
3008 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
3009 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3010 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
3011 return __mmask8(0xf
3012 & _mm512_cmp_pd_mask(__auto_bitcast(__a),
3013 __auto_bitcast(__b),
3014 _CMP_ORD_Q));
3015 else if constexpr (__is_avx512_ps<_Up, _Np>())
3016 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
3017 else if constexpr (__is_avx512_pd<_Up, _Np>())
3018 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
3019 else
3020 __assert_unreachable<_Tp>();
3021 }
3022 }
3023
3024 // }}}
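    // Rationale for the fallback above: __a = __x * inf is NaN iff __x is
    // zero or NaN, and __b = __x * 0 is NaN iff __x is +/-inf or NaN, so
    // "__a ordered __b" (_CMP_ORD_Q) holds exactly for finite, nonzero __x.
    // Scalar sketch of the same trick:
    //
    //   #include <cmath>
    //   #include <limits>
    //   bool isnonzerovalue(float __x)
    //   {
    //     const float __inf = std::numeric_limits<float>::infinity();
    //     return !std::isnan(__x * __inf) && !std::isnan(__x * 0.0f);
    //   }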
3025 // _S_isfinite {{{
3026 template <typename _Tp, size_t _Np>
3027 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3028 _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
3029 {
3030 static_assert(is_floating_point_v<_Tp>);
3031#if !__FINITE_MATH_ONLY__
3032 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3033 {
3034 const auto __xi = __to_intrin(__x);
3035 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3036 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3037 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3038 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3039 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3040 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3041 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3042 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3043 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3044 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3045 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
3046 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3047 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
3048 }
3049 else if constexpr (__is_avx512_abi<_Abi>())
3050 {
3051 // if all exponent bits are set, __x is either inf or NaN
3052 using _I = __int_for_sizeof_t<_Tp>;
3053 const auto __inf = __vector_bitcast<_I>(
3054 __vector_broadcast<_Np>(__infinity_v<_Tp>));
3055 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
3056 }
3057 else
3058#endif
3059 return _Base::_S_isfinite(__x);
3060 }
3061
3062 // }}}
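    // The _mm*_fpclass_* immediates select IEEE categories; each set bit adds
    // one category to the match: 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf,
    // 0x10 -inf, 0x20 subnormal, 0x40 finite negative, 0x80 SNaN. Thus the
    // 0x99 above (QNaN | +inf | -inf | SNaN) matches exactly the non-finite
    // values, and XOR with the implicit mask __k1 turns that into isfinite on
    // the active lanes. Hedged one-liner using the same intrinsic (requires
    // AVX512DQ+VL):
    //
    //   #include <immintrin.h>
    //   __mmask8 nonfinite4(__m128 __v)  // 1-bits mark NaN or +/-inf lanes
    //   { return _mm_fpclass_ps_mask(__v, 0x99); }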
3063 // _S_isinf {{{
3064 template <typename _Tp, size_t _Np>
3065 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3066 _S_isinf(_SimdWrapper<_Tp, _Np> __x)
3067 {
3068#if !__FINITE_MATH_ONLY__
3069 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3070 {
3071 const auto __xi = __to_intrin(__x);
3072 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3073 return _mm512_fpclass_ps_mask(__xi, 0x18);
3074 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3075 return _mm512_fpclass_pd_mask(__xi, 0x18);
3076 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3077 return _mm256_fpclass_ps_mask(__xi, 0x18);
3078 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3079 return _mm256_fpclass_pd_mask(__xi, 0x18);
3080 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3081 return _mm_fpclass_ps_mask(__xi, 0x18);
3082 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3083 return _mm_fpclass_pd_mask(__xi, 0x18);
3084 else
3085 __assert_unreachable<_Tp>();
3086 }
3087 else if constexpr (__have_avx512dq_vl)
3088 {
3089 if constexpr (__is_sse_pd<_Tp, _Np>())
3090 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
3091 else if constexpr (__is_avx_pd<_Tp, _Np>())
3092 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
3093 else if constexpr (__is_sse_ps<_Tp, _Np>())
3094 return _mm_movm_epi32(
3095 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
3096 else if constexpr (__is_avx_ps<_Tp, _Np>())
3097 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
3098 else
3099 __assert_unreachable<_Tp>();
3100 }
3101 else
3102#endif
3103 return _Base::_S_isinf(__x);
3104 }
3105
3106 // }}}
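    // For isinf the immediate is 0x18 = 0x08 (+inf) | 0x10 (-inf) in the
    // fpclass encoding listed above, so the fpclass result is already the
    // wanted mask and, unlike isfinite/isnormal, needs no inversion.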
3107 // _S_isnormal {{{
3108 template <typename _Tp, size_t _Np>
3109 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3110 _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
3111 {
3112#if __FINITE_MATH_ONLY__
3113 [[maybe_unused]] constexpr int __mode = 0x26;
3114#else
3115 [[maybe_unused]] constexpr int __mode = 0xbf;
3116#endif
3117 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
3118 {
3119 const auto __xi = __to_intrin(__x);
3120 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3121 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3122 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
3123 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3124 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
3125 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3126 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
3127 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3128 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
3129 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3130 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
3131 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3132 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
3133 else
3134 __assert_unreachable<_Tp>();
3135 }
3136 else if constexpr (__have_avx512dq)
3137 {
3138 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
3139 return _mm_movm_epi32(
3140 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
3141 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
3142 return _mm256_movm_epi32(
3143 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
3144 else if constexpr (__is_avx512_ps<_Tp, _Np>())
3145 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
3146 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
3147 return _mm_movm_epi64(
3148 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
3149 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
3150 return _mm256_movm_epi64(
3151 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
3152 else if constexpr (__is_avx512_pd<_Tp, _Np>())
3153 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
3154 else
3155 __assert_unreachable<_Tp>();
3156 }
3157 else if constexpr (__is_avx512_abi<_Abi>())
3158 {
3159 using _I = __int_for_sizeof_t<_Tp>;
3160 const auto absn = __vector_bitcast<_I>(_S_abs(__x));
3161 const auto minn = __vector_bitcast<_I>(
3162 __vector_broadcast<_Np>(__norm_min_v<_Tp>));
3163#if __FINITE_MATH_ONLY__
3164 return _S_less_equal<_I, _Np>(minn, absn);
3165#else
3166 const auto infn
3167 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
3168 return __and(_S_less_equal<_I, _Np>(minn, absn),
3169 _S_less<_I, _Np>(absn, infn));
3170#endif
3171 }
3172 else
3173 return _Base::_S_isnormal(__x);
3174 }
3175
3176 // }}}
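    // isnormal inverts an fpclass query: 0xbf matches every category except
    // "finite negative" (0x40), i.e. all zero, subnormal, inf, and NaN
    // inputs, so the complement is the normal values. Under
    // __FINITE_MATH_ONLY__ the immediate shrinks to 0x26 (+0 | -0 |
    // subnormal) because inf/NaN cannot occur. The AVX512F fallback checks
    // __norm_min <= |x| < inf with integer compares, valid because the bit
    // patterns of non-negative IEEE values order like unsigned integers.
    // Scalar sketch of that fallback (C++20 for std::bit_cast):
    //
    //   #include <bit>
    //   #include <cstdint>
    //   bool isnormal32(float __x)
    //   {
    //     const auto __abs = std::bit_cast<std::uint32_t>(__x) & 0x7fffffffu;
    //     return __abs >= 0x00800000u   // bit pattern of FLT_MIN
    //         && __abs <  0x7f800000u;  // bit pattern of +inf
    //   }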
3177 // _S_isnan {{{
3178 template <typename _Tp, size_t _Np>
3179 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3180 _S_isnan(_SimdWrapper<_Tp, _Np> __x)
3181 { return _S_isunordered(__x, __x); }
3182
3183 // }}}
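    // Standard identity: isnan(__x) == isunordered(__x, __x), since a
    // floating-point compare is unordered exactly when an operand is NaN.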
3184 // _S_isunordered {{{
3185 template <typename _Tp, size_t _Np>
3186 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
3187 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
3188 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
3189 {
3190#if __FINITE_MATH_ONLY__
3191 return {}; // false
3192#else
3193 const auto __xi = __to_intrin(__x);
3194 const auto __yi = __to_intrin(__y);
3195 if constexpr (__is_avx512_abi<_Abi>())
3196 {
3197 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3198 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3199 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3200 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3201 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3202 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3203 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3204 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3205 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3206 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3207 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3208 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3209 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
3210 }
3211 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3212 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
3213 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3214 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
3215 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3216 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
3217 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3218 return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
3219 else
3220 __assert_unreachable<_Tp>();
3221#endif
3222 }
3223
3224 // }}}
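    // _CMP_UNORD_Q is the quiet unordered predicate: true where at least one
    // operand is NaN, without raising FE_INVALID on quiet NaNs, as required
    // for std::isunordered. On AVX512 ABIs the compare is masked with __k1 so
    // padding lanes of partial simd types stay false.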
3225 // _S_isgreater {{{
3226 template <typename _Tp, size_t _Np>
3227 static constexpr _MaskMember<_Tp>
3228 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3229 {
3230 const auto __xi = __to_intrin(__x);
3231 const auto __yi = __to_intrin(__y);
3232 if constexpr (__is_avx512_abi<_Abi>())
3233 {
3234 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3235 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3236 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3237 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3238 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3239 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3240 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3241 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3242 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3243 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3244 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3245 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3246 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
3247 else
3248 __assert_unreachable<_Tp>();
3249 }
3250 else if constexpr (__have_avx)
3251 {
3252 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3253 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3254 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3255 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3256 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3257 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
3258 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3259 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
3260 else
3261 __assert_unreachable<_Tp>();
3262 }
3263 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3264 && sizeof(_Tp) == 4)
3265 {
3266 const auto __xn = __vector_bitcast<int>(__xi);
3267 const auto __yn = __vector_bitcast<int>(__yi);
3268 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3269 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3270 return __auto_bitcast(
3271 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
3272 }
3273 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3274 && sizeof(_Tp) == 8)
3275 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3276 -_mm_ucomigt_sd(__xi, __yi),
3277 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
3278 _mm_unpackhi_pd(__yi, __yi))};
3279 else
3280 return _Base::_S_isgreater(__x, __y);
3281 }
3282
3283 // }}}
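    // The pre-AVX SSE2 float path above (repeated in isgreaterequal, isless,
    // and islessequal below) compares float bit patterns as integers: for
    // negative inputs the sign-magnitude pattern is remapped so that it
    // orders like two's complement (negate the magnitude bits), after which
    // integer >, >=, <, <= agree with the IEEE ordering; ANDing with
    // cmpord(x, y) discards unordered (NaN) lanes without raising
    // exceptions. Scalar sketch of the mapping (C++20 for std::bit_cast):
    //
    //   #include <bit>
    //   #include <cstdint>
    //   std::int32_t ordered_key(float __x)
    //   {
    //     const auto __i = std::bit_cast<std::int32_t>(__x);
    //     return __i < 0 ? -(__i & 0x7fffffff) : __i;
    //   }
    //   // isgreater(a, b) == !isunordered(a, b)
    //   //                    && ordered_key(a) > ordered_key(b)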
3284 // _S_isgreaterequal {{{
3285 template <typename _Tp, size_t _Np>
3286 static constexpr _MaskMember<_Tp>
3287 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3288 {
3289 const auto __xi = __to_intrin(__x);
3290 const auto __yi = __to_intrin(__y);
3291 if constexpr (__is_avx512_abi<_Abi>())
3292 {
3293 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3294 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3295 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3296 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3297 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3298 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3299 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3300 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3301 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3302 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3303 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3304 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3305 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
3306 else
3307 __assert_unreachable<_Tp>();
3308 }
3309 else if constexpr (__have_avx)
3310 {
3311 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3312 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3313 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3314 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3316 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
3317 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3318 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
3319 else
3320 __assert_unreachable<_Tp>();
3321 }
3322 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3323 && sizeof(_Tp) == 4)
3324 {
3325 const auto __xn = __vector_bitcast<int>(__xi);
3326 const auto __yn = __vector_bitcast<int>(__yi);
3327 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3328 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3329 return __auto_bitcast(
3330 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
3331 }
3332 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3333 && sizeof(_Tp) == 8)
3334 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3335 -_mm_ucomige_sd(__xi, __yi),
3336 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
3337 _mm_unpackhi_pd(__yi, __yi))};
3338 else
3339 return _Base::_S_isgreaterequal(__x, __y);
3340 }
3341
3342 // }}}
3343 // _S_isless {{{
3344 template <typename _Tp, size_t _Np>
3345 static constexpr _MaskMember<_Tp>
3346 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3347 {
3348 const auto __xi = __to_intrin(__x);
3349 const auto __yi = __to_intrin(__y);
3350 if constexpr (__is_avx512_abi<_Abi>())
3351 {
3352 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3353 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3354 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3355 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3356 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3357 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3358 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3359 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3360 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3361 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3362 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3363 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3364 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3365 else
3366 __assert_unreachable<_Tp>();
3367 }
3368 else if constexpr (__have_avx)
3369 {
3370 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3371 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3372 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3373 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3374 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3375 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3376 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3377 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3378 else
3379 __assert_unreachable<_Tp>();
3380 }
3381 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3382 && sizeof(_Tp) == 4)
3383 {
3384 const auto __xn = __vector_bitcast<int>(__xi);
3385 const auto __yn = __vector_bitcast<int>(__yi);
3386 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3387 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3388 return __auto_bitcast(
3389 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3390 }
3391 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3392 && sizeof(_Tp) == 8)
3393 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3394 -_mm_ucomigt_sd(__yi, __xi),
3395 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3396 _mm_unpackhi_pd(__xi, __xi))};
3397 else
3398 return _Base::_S_isless(__x, __y);
3399 }
3400
3401 // }}}
3402 // _S_islessequal {{{
3403 template <typename _Tp, size_t _Np>
3404 static constexpr _MaskMember<_Tp>
3405 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3406 {
3407 const auto __xi = __to_intrin(__x);
3408 const auto __yi = __to_intrin(__y);
3409 if constexpr (__is_avx512_abi<_Abi>())
3410 {
3411 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3412 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3413 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3414 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3415 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3416 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3417 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3418 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3419 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3420 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3421 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3422 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3423 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3424 else
3425 __assert_unreachable<_Tp>();
3426 }
3427 else if constexpr (__have_avx)
3428 {
3429 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3430 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3431 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3432 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3433 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3434 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3435 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3436 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3437 else
3438 __assert_unreachable<_Tp>();
3439 }
3440 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3441 && sizeof(_Tp) == 4)
3442 {
3443 const auto __xn = __vector_bitcast<int>(__xi);
3444 const auto __yn = __vector_bitcast<int>(__yi);
3445 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3446 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3447 return __auto_bitcast(
3448 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3449 }
3450 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3451 && sizeof(_Tp) == 8)
3452 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3453 -_mm_ucomige_sd(__yi, __xi),
3454 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3455 _mm_unpackhi_pd(__xi, __xi))};
3456 else
3457 return _Base::_S_islessequal(__x, __y);
3458 }
3459
3460 // }}}
3461 // _S_islessgreater {{{
3462 template <typename _Tp, size_t _Np>
3463 static constexpr _MaskMember<_Tp>
3464 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3465 {
3466 const auto __xi = __to_intrin(__x);
3467 const auto __yi = __to_intrin(__y);
3468 if constexpr (__is_avx512_abi<_Abi>())
3469 {
3470 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3471 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3472 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3473 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3474 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3475 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3476 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3477 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3478 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3479 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3480 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3481 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3482 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3483 else
3484 __assert_unreachable<_Tp>();
3485 }
3486 else if constexpr (__have_avx)
3487 {
3488 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3489 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3490 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3491 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3492 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3493 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3494 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3495 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3496 else
3497 __assert_unreachable<_Tp>();
3498 }
3499 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3500 return __auto_bitcast(
3501 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3502 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3503 return __to_masktype(
3504 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3505 else
3506 __assert_unreachable<_Tp>();
3507 }
3508
 3509 // }}} }}}
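    // _S_masked_unary (below) applies _Op element-wise, but only on the
    // lanes selected by __k. For increment/decrement with an AVX512 bitmask
    // the generic blend is avoided by issuing a single merge-masked
    // subtraction of -1 resp. +1: the masked psub/sub instructions leave
    // inactive lanes untouched. Hedged sketch of the idea with a public
    // intrinsic, assuming AVX512F and 32-bit integer lanes:
    //
    //   #include <immintrin.h>
    //   __m512i masked_increment(__mmask16 __k, __m512i __v)
    //   { return _mm512_mask_sub_epi32(__v, __k, __v, _mm512_set1_epi32(-1)); }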
3510 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np>
3511 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
3512 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v)
3513 {
3514 if (__k._M_is_constprop_none_of())
3515 return __v;
3516 else if (__k._M_is_constprop_all_of())
3517 {
3518 auto __vv = _Base::_M_make_simd(__v);
3519 _Op<decltype(__vv)> __op;
3520 return __data(__op(__vv));
3521 }
3522 else if constexpr (__is_bitmask_v<decltype(__k)>
3523 && (is_same_v<_Op<void>, __increment<void>>
3524 || is_same_v<_Op<void>, __decrement<void>>))
3525 {
3526 // optimize masked unary increment and decrement as masked sub +/-1
3527 constexpr int __pm_one
3528 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1;
3529#ifdef _GLIBCXX_CLANG
3530 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data;
3531#else // _GLIBCXX_CLANG
3532 using _TV = __vector_type_t<_Tp, _Np>;
3533 constexpr size_t __bytes = sizeof(__v) < 16 ? 16 : sizeof(__v);
3534 constexpr size_t __width = __bytes / sizeof(_Tp);
3535 if constexpr (is_integral_v<_Tp>)
3536 {
3537 constexpr bool __lp64 = sizeof(long) == sizeof(long long);
3538 using _Ip = std::make_signed_t<_Tp>;
3539 using _Up = std::conditional_t<
3540 std::is_same_v<_Ip, long>,
 3541              std::conditional_t<__lp64, long long, int>,
 3542              std::conditional_t<
 3543                std::is_same_v<_Ip, signed char>, char, _Ip>>;
3544 const auto __value = __intrin_bitcast<__vector_type_t<_Up, __width>>(__v._M_data);
3545#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3546 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3547 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask(__value, \
3548 __vector_broadcast<__width>(_Up(__pm_one)), __value, __k._M_data))
3549 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512);
3550 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256);
3551 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128);
3552 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512);
3553 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256);
3554 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128);
3555 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512);
3556 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256);
3557 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128);
3558 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512);
3559 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256);
3560 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128);
3561#undef _GLIBCXX_SIMD_MASK_SUB
3562 }
3563 else
3564 {
3565 const auto __value = __intrin_bitcast<__vector_type_t<_Tp, __width>>(__v._M_data);
3566#define _GLIBCXX_SIMD_MASK_SUB_512(_Sizeof, _Width, _Instr) \
3567 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3568 return __builtin_ia32_##_Instr##_mask( \
3569 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \
3570 __k._M_data, _MM_FROUND_CUR_DIRECTION)
3571#define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \
3572 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \
3573 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask( \
3574 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \
3575 __k._M_data))
3576 _GLIBCXX_SIMD_MASK_SUB_512(4, 64, subps512);
3577 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256);
3578 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128);
3579 _GLIBCXX_SIMD_MASK_SUB_512(8, 64, subpd512);
3580 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256);
3581 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128);
3582#undef _GLIBCXX_SIMD_MASK_SUB_512
3583#undef _GLIBCXX_SIMD_MASK_SUB
3584 }
3585#endif // _GLIBCXX_CLANG
3586 }
3587 else
3588 return _Base::template _S_masked_unary<_Op>(__k, __v);
3589 }
3590 };
3591
3592// }}}
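// x86 knows two mask representations: pre-AVX512 ABIs use vector masks,
// where each element is all-zeros (false) or all-ones (true) at the width of
// the value type, whereas AVX512 ABIs use bitmasks (__mmask8/16/32/64) with
// one bit per element. The mixin below provides the conversions between
// bool, bitmask, and vector-mask forms that both mask implementations share.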
3593// _MaskImplX86Mixin {{{
3594struct _MaskImplX86Mixin
3595{
3596 template <typename _Tp>
3597 using _TypeTag = _Tp*;
3598
3599 using _Base = _MaskImplBuiltinMixin;
3600
3601 // _S_to_maskvector(bool) {{{
3602 template <typename _Up, size_t _ToN = 1, typename _Tp>
3603 _GLIBCXX_SIMD_INTRINSIC static constexpr
3604 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3605 _S_to_maskvector(_Tp __x)
3606 {
3607 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3608 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3609 : __vector_type_t<_Up, _ToN>();
3610 }
3611
3612 // }}}
3613 // _S_to_maskvector(_SanitizedBitMask) {{{
3614 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3615 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3616 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3617 {
3618 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3619 using _UV = __vector_type_t<_Up, _ToN>;
3620 using _UI = __intrinsic_type_t<_Up, _ToN>;
3621 [[maybe_unused]] const auto __k = __x._M_to_bits();
3622 if constexpr (_Np == 1)
3623 return _S_to_maskvector<_Up, _ToN>(__k);
3624 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3625 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3626 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
3627 else if constexpr (sizeof(_Up) == 1)
3628 {
3629 if constexpr (sizeof(_UI) == 16)
3630 {
3631 if constexpr (__have_avx512bw_vl)
3632 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3633 else if constexpr (__have_avx512bw)
3634 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3635 else if constexpr (__have_avx512f)
3636 {
3637 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3638 auto __as16bits
3639 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3640 __hi256(__as32bits)));
3641 return __intrin_bitcast<_UV>(
3642 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3643 }
3644 else if constexpr (__have_ssse3)
3645 {
3646 const auto __bitmask = __to_intrin(
3647 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3648 8, 16, 32, 64, 128));
3649 return __intrin_bitcast<_UV>(
3650 __vector_bitcast<_Up>(
3651 _mm_shuffle_epi8(__to_intrin(
3652 __vector_type_t<_ULLong, 2>{__k}),
3653 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3654 1, 1, 1, 1, 1, 1, 1))
3655 & __bitmask)
3656 != 0);
3657 }
3658 // else fall through
3659 }
3660 else if constexpr (sizeof(_UI) == 32)
3661 {
3662 if constexpr (__have_avx512bw_vl)
3663 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3664 else if constexpr (__have_avx512bw)
3665 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3666 else if constexpr (__have_avx512f)
3667 {
3668 auto __as16bits = // 0 16 1 17 ... 15 31
3669 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3670 16)
3671 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3672 ~__m512i()),
3673 16);
3674 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3675 __lo256(__as16bits),
3676 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3677 );
3678 // deinterleave:
3679 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3680 __0_16_1_17, // 0 16 1 17 2 ...
3681 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3682 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3683 3, 5, 7, 9, 11, 13,
3684 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3685 // 0-3 8-11 16-19 24-27
3686 // 4-7 12-15 20-23 28-31
3687 }
3688 else if constexpr (__have_avx2)
3689 {
3690 const auto __bitmask
3691 = _mm256_broadcastsi128_si256(__to_intrin(
3692 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3693 4, 8, 16, 32, 64, 128)));
3694 return __vector_bitcast<_Up>(
3695 __vector_bitcast<_Up>(
3696 _mm256_shuffle_epi8(
3697 _mm256_broadcastsi128_si256(
3698 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3699 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3700 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3701 3, 3, 3, 3, 3, 3))
3702 & __bitmask)
3703 != 0);
3704 }
3705 // else fall through
3706 }
3707 else if constexpr (sizeof(_UI) == 64)
3708 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3709 if constexpr (std::min(_ToN, _Np) <= 4)
3710 {
3711 if constexpr (_Np > 7) // avoid overflow
3712 __x &= _SanitizedBitMask<_Np>(0x0f);
3713 const _UInt __char_mask
3714 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3715 * 0xff;
3716 _UV __r = {};
3717 __builtin_memcpy(&__r, &__char_mask,
3718 std::min(sizeof(__r), sizeof(__char_mask)));
3719 return __r;
3720 }
3721 else if constexpr (std::min(_ToN, _Np) <= 7)
3722 {
3723 if constexpr (_Np > 7) // avoid overflow
3724 __x &= _SanitizedBitMask<_Np>(0x7f);
3725 const _ULLong __char_mask
3726 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3727 * 0xff;
3728 _UV __r = {};
3729 __builtin_memcpy(&__r, &__char_mask,
3730 std::min(sizeof(__r), sizeof(__char_mask)));
3731 return __r;
3732 }
3733 }
3734 else if constexpr (sizeof(_Up) == 2)
3735 {
3736 if constexpr (sizeof(_UI) == 16)
3737 {
3738 if constexpr (__have_avx512bw_vl)
3739 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3740 else if constexpr (__have_avx512bw)
3741 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3742 else if constexpr (__have_avx512f)
3743 {
3744 __m256i __as32bits = {};
3745 if constexpr (__have_avx512vl)
3746 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3747 else
3748 __as32bits
3749 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3750 return __intrin_bitcast<_UV>(
3751 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3752 }
3753 // else fall through
3754 }
3755 else if constexpr (sizeof(_UI) == 32)
3756 {
3757 if constexpr (__have_avx512bw_vl)
3758 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3759 else if constexpr (__have_avx512bw)
3760 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3761 else if constexpr (__have_avx512f)
3762 {
3763 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3764 return __vector_bitcast<_Up>(
3765 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3766 __hi256(__as32bits))));
3767 }
3768 // else fall through
3769 }
3770 else if constexpr (sizeof(_UI) == 64)
3771 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3772 }
3773 else if constexpr (sizeof(_Up) == 4)
3774 {
3775 if constexpr (sizeof(_UI) == 16)
3776 {
3777 if constexpr (__have_avx512dq_vl)
3778 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3779 else if constexpr (__have_avx512dq)
3780 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3781 else if constexpr (__have_avx512vl)
3782 return __intrin_bitcast<_UV>(
3783 _mm_maskz_mov_epi32(__k, ~__m128i()));
3784 else if constexpr (__have_avx512f)
3785 return __intrin_bitcast<_UV>(
3786 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3787 // else fall through
3788 }
3789 else if constexpr (sizeof(_UI) == 32)
3790 {
3791 if constexpr (__have_avx512dq_vl)
3792 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3793 else if constexpr (__have_avx512dq)
3794 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3795 else if constexpr (__have_avx512vl)
3796 return __vector_bitcast<_Up>(
3797 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3798 else if constexpr (__have_avx512f)
3799 return __vector_bitcast<_Up>(
3800 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3801 // else fall through
3802 }
3803 else if constexpr (sizeof(_UI) == 64)
3804 return __vector_bitcast<_Up>(
3805 __have_avx512dq ? _mm512_movm_epi32(__k)
3806 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3807 }
3808 else if constexpr (sizeof(_Up) == 8)
3809 {
3810 if constexpr (sizeof(_UI) == 16)
3811 {
3812 if constexpr (__have_avx512dq_vl)
3813 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3814 else if constexpr (__have_avx512dq)
3815 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3816 else if constexpr (__have_avx512vl)
3817 return __vector_bitcast<_Up>(
3818 _mm_maskz_mov_epi64(__k, ~__m128i()));
3819 else if constexpr (__have_avx512f)
3820 return __vector_bitcast<_Up>(
3821 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3822 // else fall through
3823 }
3824 else if constexpr (sizeof(_UI) == 32)
3825 {
3826 if constexpr (__have_avx512dq_vl)
3827 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3828 else if constexpr (__have_avx512dq)
3829 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3830 else if constexpr (__have_avx512vl)
3831 return __vector_bitcast<_Up>(
3832 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3833 else if constexpr (__have_avx512f)
3834 return __vector_bitcast<_Up>(
3835 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3836 // else fall through
3837 }
3838 else if constexpr (sizeof(_UI) == 64)
3839 return __vector_bitcast<_Up>(
3840 __have_avx512dq ? _mm512_movm_epi64(__k)
3841 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3842 }
3843
3844 using _UpUInt = make_unsigned_t<_Up>;
3845 using _V = __vector_type_t<_UpUInt, _ToN>;
3846 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3847 if constexpr (_ToN == 2)
3848 {
3849 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3850 }
3851 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3852 {
3853 if constexpr (sizeof(_Up) == 4)
3854 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3855 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3856 _mm256_castsi256_ps(_mm256_setr_epi32(
3857 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3858 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3859 else if constexpr (sizeof(_Up) == 8)
3860 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3861 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3862 _mm256_castsi256_pd(
3863 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3864 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3865 else
3866 __assert_unreachable<_Up>();
3867 }
3868 else if constexpr (__bits_per_element >= _ToN)
3869 {
3870 constexpr auto __bitmask
3871 = __generate_vector<_V>([](auto __i)
3872 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
3873 { return __i < _ToN ? 1ull << __i : 0; });
3874 const auto __bits
3875 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3876 if constexpr (__bits_per_element > _ToN)
3877 return __vector_bitcast<_Up>(__bits) > 0;
3878 else
3879 return __vector_bitcast<_Up>(__bits != 0);
3880 }
3881 else
3882 {
3883 const _V __tmp
3884 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3885 return static_cast<_UpUInt>(
3886 __k >> (__bits_per_element * (__i / __bits_per_element)));
3887 })
3888 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
3889 return static_cast<_UpUInt>(1ull
3890 << (__i % __bits_per_element));
3891 }); // mask bit index
3892 return __intrin_bitcast<_UV>(__tmp != _V());
3893 }
3894 }
3895
3896 // }}}
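  // The <= 4 / <= 7 element fallbacks above broadcast mask bits into bytes
  // with a multiply: __k * 0x00204081 places copies of __k at bit offsets 0,
  // 7, 14, and 21, so bit __i of __k lands on bit 8 * __i; masking with
  // 0x01010101 keeps only those bits, and the final * 0xff smears each 0/1
  // byte into 0x00/0xff. Standalone sketch for up to 4 mask bits:
  //
  //   #include <cstdint>
  //   std::uint32_t bits_to_bytemask(unsigned __k)  // __k < 16
  //   { return ((__k * 0x00204081u) & 0x01010101u) * 0xffu; }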
3897 // _S_to_maskvector(_SimdWrapper) {{{
3898 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3899 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3900 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3901 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3902 {
3903 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3904 using _TW = _SimdWrapper<_Tp, _Np>;
3905 using _UW = _SimdWrapper<_Up, _ToN>;
3906 using _UI = __intrinsic_type_t<_Up, _ToN>;
3907 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3908 return _S_to_maskvector<_Up, _ToN>(
3909 _BitMask<_Np>(__x._M_data)._M_sanitized());
3910 // vector -> vector bitcast
3911 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3912 && sizeof(_TW) == sizeof(_UW))
3913 return __wrapper_bitcast<_Up, _ToN>(
3914 _ToN <= _Np
3915 ? __x
3916 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3917 else // vector -> vector {{{
3918 {
3919 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3920 {
3921 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3922 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3923 __vector_type_t<_Up, _ToN>>(
3924 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
3925 }
3926 using _To = __vector_type_t<_Up, _ToN>;
3927 [[maybe_unused]] constexpr size_t _FromN = _Np;
3928 constexpr int _FromBytes = sizeof(_Tp);
3929 constexpr int _ToBytes = sizeof(_Up);
3930 const auto __k = __x._M_data;
3931
3932 if constexpr (_FromBytes == _ToBytes)
3933 return __intrin_bitcast<_To>(__k);
3934 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3935 { // SSE -> SSE {{{
3936 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3937 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3938 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3939 {
3940 const auto __y
3941 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3942 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3943 }
3944 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3945 {
3946 auto __y
3947 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3948 auto __z
3949 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3950 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3951 }
3952 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3953 && __have_sse2)
3954 return __intrin_bitcast<_To>(
3955 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3956 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3957 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3958 _UI());
3959 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3960 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3961 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3962 {
3963 const auto __y
3964 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3965 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3966 }
3967 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3968 {
3969 if constexpr (__have_sse2 && !__have_ssse3)
3970 return __intrin_bitcast<_To>(_mm_packs_epi32(
3971 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3972 __m128i()));
3973 else
3974 return __intrin_bitcast<_To>(
3975 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3976 __vector_bitcast<_Up>(__k)));
3977 }
3978 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3979 return __intrin_bitcast<_To>(
3980 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3981 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3982 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3983 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3984 && __have_ssse3)
3985 return __intrin_bitcast<_To>(
3986 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3987 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3988 -1, -1, -1, -1, -1, -1, -1,
3989 -1)));
3990 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3991 {
3992 auto __y
3993 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3994 __y = _mm_packs_epi32(__y, __m128i());
3995 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3996 }
3997 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3998 && __have_ssse3)
3999 return __intrin_bitcast<_To>(
4000 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4001 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4002 -1, -1, -1, -1, -1, -1, -1,
4003 -1)));
4004 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4005 {
4006 const auto __y
4007 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
4008 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
4009 }
4010 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
4011 return __intrin_bitcast<_To>(
4012 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
4013 else
4014 __assert_unreachable<_Tp>();
4015 } // }}}
4016 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
4017 { // AVX -> AVX {{{
4018 if constexpr (_FromBytes == _ToBytes)
4019 __assert_unreachable<_Tp>();
4020 else if constexpr (_FromBytes == _ToBytes * 2)
4021 {
4022 const auto __y = __vector_bitcast<_LLong>(__k);
4023 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4024 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
4025 }
4026 else if constexpr (_FromBytes == _ToBytes * 4)
4027 {
4028 const auto __y = __vector_bitcast<_LLong>(__k);
4029 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
4030 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4031 __m128i())));
4032 }
4033 else if constexpr (_FromBytes == _ToBytes * 8)
4034 {
4035 const auto __y = __vector_bitcast<_LLong>(__k);
4036 return __intrin_bitcast<_To>(
4037 _mm256_castsi128_si256(_mm_shuffle_epi8(
4038 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4039 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
4040 -1, -1, -1, -1, -1))));
4041 }
4042 else if constexpr (_FromBytes * 2 == _ToBytes)
4043 {
4044 auto __y = __xzyw(__to_intrin(__k));
4045 if constexpr (is_floating_point_v<
4046 _Tp> || (!__have_avx2 && _FromBytes == 4))
4047 {
4048 const auto __yy = __vector_bitcast<float>(__y);
4049 return __intrin_bitcast<_To>(
4050 _mm256_unpacklo_ps(__yy, __yy));
4051 }
4052 else
4053 return __intrin_bitcast<_To>(
4054 _mm256_unpacklo_epi8(__y, __y));
4055 }
4056 else if constexpr (_FromBytes * 4 == _ToBytes)
4057 {
4058 auto __y
4059 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4060 __lo128(__vector_bitcast<_LLong>(
4061 __k))); // drops 3/4 of input
4062 return __intrin_bitcast<_To>(
4063 __concat(_mm_unpacklo_epi16(__y, __y),
4064 _mm_unpackhi_epi16(__y, __y)));
4065 }
4066 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
4067 {
4068 auto __y
4069 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
4070 __lo128(__vector_bitcast<_LLong>(
4071 __k))); // drops 3/4 of input
4072 __y
4073 = _mm_unpacklo_epi16(__y,
4074 __y); // drops another 1/2 => 7/8 total
4075 return __intrin_bitcast<_To>(
4076 __concat(_mm_unpacklo_epi32(__y, __y),
4077 _mm_unpackhi_epi32(__y, __y)));
4078 }
4079 else
4080 __assert_unreachable<_Tp>();
4081 } // }}}
4082 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
4083 { // SSE -> AVX {{{
4084 if constexpr (_FromBytes == _ToBytes)
4085 return __intrin_bitcast<_To>(
4086 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
4087 __zero_extend(__to_intrin(__k))));
4088 else if constexpr (_FromBytes * 2 == _ToBytes)
4089 { // keep all
4090 return __intrin_bitcast<_To>(
4091 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
4092 __vector_bitcast<_LLong>(__k)),
4093 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
4094 __vector_bitcast<_LLong>(__k))));
4095 }
4096 else if constexpr (_FromBytes * 4 == _ToBytes)
4097 {
4098 if constexpr (__have_avx2)
4099 {
4100 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4101 __concat(__vector_bitcast<_LLong>(__k),
4102 __vector_bitcast<_LLong>(__k)),
4103 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
4104 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
4105 6, 6, 7, 7, 7, 7)));
4106 }
4107 else
4108 {
4109 return __intrin_bitcast<_To>(__concat(
4110 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4111 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
4112 2, 2, 2, 2, 3, 3, 3, 3)),
4113 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4114 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
4115 6, 6, 6, 6, 7, 7, 7,
4116 7))));
4117 }
4118 }
4119 else if constexpr (_FromBytes * 8 == _ToBytes)
4120 {
4121 if constexpr (__have_avx2)
4122 {
4123 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
4124 __concat(__vector_bitcast<_LLong>(__k),
4125 __vector_bitcast<_LLong>(__k)),
4126 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
4127 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
4128 3, 3, 3, 3, 3, 3)));
4129 }
4130 else
4131 {
4132 return __intrin_bitcast<_To>(__concat(
4133 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4134 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4135 1, 1, 1, 1, 1, 1, 1, 1)),
4136 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4137 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4138 3, 3, 3, 3, 3, 3, 3,
4139 3))));
4140 }
4141 }
4142 else if constexpr (_FromBytes == _ToBytes * 2)
4143 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4144 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4145 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4146 {
4147 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4148 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4149 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4150 -1, -1, -1, -1, -1, -1, -1,
4151 -1)))));
4152 }
4153 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4154 {
4155 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4156 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4157 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4158 -1, -1, -1, -1, -1, -1, -1,
4159 -1)))));
4160 }
4161 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4162 {
4163 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4164 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4165 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4166 -1, -1, -1, -1, -1, -1, -1,
4167 -1, -1)))));
4168 }
4169 else
4170 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4171 } // }}}
4172 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4173 { // AVX -> SSE {{{
4174 if constexpr (_FromBytes == _ToBytes)
4175 { // keep low 1/2
4176 return __intrin_bitcast<_To>(__lo128(__k));
4177 }
4178 else if constexpr (_FromBytes == _ToBytes * 2)
4179 { // keep all
4180 auto __y = __vector_bitcast<_LLong>(__k);
4181 return __intrin_bitcast<_To>(
4182 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4183 }
4184 else if constexpr (_FromBytes == _ToBytes * 4)
4185 { // add 1/2 undef
4186 auto __y = __vector_bitcast<_LLong>(__k);
4187 return __intrin_bitcast<_To>(
4188 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4189 __m128i()));
4190 }
4191 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4192 { // add 3/4 undef
4193 auto __y = __vector_bitcast<_LLong>(__k);
4194 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4195 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4196 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4197 -1, -1, -1, -1)));
4198 }
4199 else if constexpr (_FromBytes * 2 == _ToBytes)
4200 { // keep low 1/4
4201 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4202 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4203 }
4204 else if constexpr (_FromBytes * 4 == _ToBytes)
4205 { // keep low 1/8
4206 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4207 __y = _mm_unpacklo_epi8(__y, __y);
4208 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4209 }
4210 else if constexpr (_FromBytes * 8 == _ToBytes)
4211 { // keep low 1/16
4212 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4213 __y = _mm_unpacklo_epi8(__y, __y);
4214 __y = _mm_unpacklo_epi8(__y, __y);
4215 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4216 }
4217 else
4218 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4219 } // }}}
4220 else
4221 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4240 } // }}}
4241 }
4242
4243 // }}}
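  // The vector -> vector cases above change the element width of a mask
  // whose lanes are all-zeros or all-ones. Narrowing can use the signed
  // saturating pack instructions (0 packs to 0, -1 saturates to -1);
  // widening interleaves the mask with itself to duplicate each lane into
  // the wider element. Hedged SSE2 sketch narrowing 4 x 32-bit mask lanes to
  // 16-bit lanes (upper half zero):
  //
  //   #include <emmintrin.h>
  //   __m128i narrow_mask_32_to_16(__m128i __m)
  //   { return _mm_packs_epi32(__m, _mm_setzero_si128()); }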
4244 // _S_to_bits {{{
4245 template <typename _Tp, size_t _Np>
4246 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
4247 _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
4248 {
4249 if constexpr (is_same_v<_Tp, bool>)
4250 return _BitMask<_Np>(__x._M_data)._M_sanitized();
4251 else
4252 {
4253 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4254 if (__builtin_is_constant_evaluated()
4255 || __builtin_constant_p(__x._M_data))
4256 {
4257 const auto __bools = -__x._M_data;
4258 const _ULLong __k = __call_with_n_evaluations<_Np>(
4259 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4260 return (__bits | ...);
4261 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
4262 return _ULLong(__bools[+__i]) << __i;
4263 });
4264 if (__builtin_is_constant_evaluated()
4265 || __builtin_constant_p(__k))
4266 return __k;
4267 }
4268 const auto __xi = __to_intrin(__x);
4269 if constexpr (sizeof(_Tp) == 1)
4270 if constexpr (sizeof(__xi) == 16)
4271 if constexpr (__have_avx512bw_vl)
4272 return _BitMask<_Np>(_mm_movepi8_mask(__xi));
4273 else // implies SSE2
4274 return _BitMask<_Np>(_mm_movemask_epi8(__xi));
4275 else if constexpr (sizeof(__xi) == 32)
4276 if constexpr (__have_avx512bw_vl)
4277 return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
4278 else // implies AVX2
4279 return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
4280 else // implies AVX512BW
4281 return _BitMask<_Np>(_mm512_movepi8_mask(__xi));
4282
4283 else if constexpr (sizeof(_Tp) == 2)
4284 if constexpr (sizeof(__xi) == 16)
4285 if constexpr (__have_avx512bw_vl)
4286 return _BitMask<_Np>(_mm_movepi16_mask(__xi));
4287 else if constexpr (__have_avx512bw)
4288 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4289 else // implies SSE2
4290 return _BitMask<_Np>(
4291 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
4292 else if constexpr (sizeof(__xi) == 32)
4293 if constexpr (__have_avx512bw_vl)
4294 return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
4295 else if constexpr (__have_avx512bw)
4296 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
4297 else // implies SSE2
4298 return _BitMask<_Np>(_mm_movemask_epi8(
4299 _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
4300 else // implies AVX512BW
4301 return _BitMask<_Np>(_mm512_movepi16_mask(__xi));
4302
4303 else if constexpr (sizeof(_Tp) == 4)
4304 if constexpr (sizeof(__xi) == 16)
4305 if constexpr (__have_avx512dq_vl)
4306 return _BitMask<_Np>(_mm_movepi32_mask(__xi));
4307 else if constexpr (__have_avx512vl)
4308 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
4309 else if constexpr (__have_avx512dq)
4310 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4311 else if constexpr (__have_avx512f)
4312 return _BitMask<_Np>(
4313 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4314 else // implies SSE
4315 return _BitMask<_Np>(
4316 _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
4317 else if constexpr (sizeof(__xi) == 32)
4318 if constexpr (__have_avx512dq_vl)
4319 return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
4320 else if constexpr (__have_avx512dq)
4321 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
4322 else if constexpr (__have_avx512vl)
4323 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
4324 else if constexpr (__have_avx512f)
4325 return _BitMask<_Np>(
4326 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
4327 else // implies AVX
4328 return _BitMask<_Np>(
4329 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
 4330 else // implies AVX512F
4331 if constexpr (__have_avx512dq)
4332 return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
4333 else // implies AVX512F
4334 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));
4335
4336 else if constexpr (sizeof(_Tp) == 8)
4337 if constexpr (sizeof(__xi) == 16)
4338 if constexpr (__have_avx512dq_vl)
4339 return _BitMask<_Np>(_mm_movepi64_mask(__xi));
4340 else if constexpr (__have_avx512dq)
4341 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4342 else if constexpr (__have_avx512vl)
4343 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
4344 else if constexpr (__have_avx512f)
4345 return _BitMask<_Np>(
4346 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4347 else // implies SSE2
4348 return _BitMask<_Np>(
4349 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
4350 else if constexpr (sizeof(__xi) == 32)
4351 if constexpr (__have_avx512dq_vl)
4352 return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
4353 else if constexpr (__have_avx512dq)
4354 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
4355 else if constexpr (__have_avx512vl)
4356 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
4357 else if constexpr (__have_avx512f)
4358 return _BitMask<_Np>(
4359 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
4360 else // implies AVX
4361 return _BitMask<_Np>(
4362 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
 4363 else // implies AVX512F
4364 if constexpr (__have_avx512dq)
4365 return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
4366 else // implies AVX512F
4367 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));
4368
4369 else
4370 __assert_unreachable<_Tp>();
4371 }
4372 }
4373 // }}}
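  // _S_to_bits relies on the same MSB property as _S_signbit: vector-mask
  // lanes are 0 or -1, so collecting the sign bits via movemask/movepi*_mask
  // (or a signed compare against zero on AVX512F) yields the bit
  // representation. SSE2 sketch for 16 byte-sized lanes:
  //
  //   #include <emmintrin.h>
  //   int mask_to_bits(__m128i __m)  // lanes are 0x00 or 0xff
  //   { return _mm_movemask_epi8(__m); }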
4374};
4375
4376// }}}
4377// _MaskImplX86 {{{
4378template <typename _Abi, typename>
4379 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
4380 {
4381 using _MaskImplX86Mixin::_S_to_bits;
4382 using _MaskImplX86Mixin::_S_to_maskvector;
4383 using _MaskImplBuiltin<_Abi>::_S_convert;
4384
4385 // member types {{{
4386 template <typename _Tp>
4387 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
4388
4389 template <typename _Tp>
4390 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
4391
4392 template <typename _Tp>
4393 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
4394
4395 using _Base = _MaskImplBuiltin<_Abi>;
4396
4397 // }}}
4398 // _S_broadcast {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_broadcast(bool __x)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
                     : _MaskMember<_Tp>();
        else
          return _Base::template _S_broadcast<_Tp>(__x);
      }

    // }}}
    // _S_load {{{
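    // Loads _S_size<_Tp> bools, each stored as one byte holding 0 or 1, and
    // converts them to a mask. With AVX512BW the bytes go into a vector
    // register and vptestmb (_mm*_test_epi8_mask) produces one mask bit per
    // non-zero byte. Without AVX512BW the bytes are widened (cvtepi8 or
    // unpack + compare) to the element width so that true bools become
    // all-ones elements of a vector mask.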
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_load(const bool* __mem)
      {
        static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
        if (__builtin_is_constant_evaluated())
          {
            if constexpr (__is_avx512_abi<_Abi>())
              {
                _MaskMember<_Tp> __r{};
                for (size_t __i = 0; __i < _S_size<_Tp>; ++__i)
                  __r._M_data |= _ULLong(__mem[__i]) << __i;
                return __r;
              }
            else
              return _Base::template _S_load<_Tp>(__mem);
          }
        else if constexpr (__have_avx512bw)
          {
            const auto __to_vec_or_bits
              = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
                  if constexpr (__is_avx512_abi<_Abi>())
                    return __bits;
                  else
                    return _S_to_maskvector<_Tp>(
                      _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
                };

            if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
              }
            else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
              {
                __m256i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
              }
            else if constexpr (_S_size<_Tp> <= 64)
              {
                __m512i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (_S_size<_Tp> <= 8)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                const auto __b = _mm512_cvtepi8_epi64(__a);
                return _mm512_test_epi64_mask(__b, __b);
              }
            else if constexpr (_S_size<_Tp> <= 16)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                return _mm512_test_epi32_mask(__b, __b);
              }
            else if constexpr (_S_size<_Tp> <= 32)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, 16);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
                const auto __c = _mm512_cvtepi8_epi32(__a);
                return _mm512_test_epi32_mask(__b, __b)
                         | (_mm512_test_epi32_mask(__c, __c) << 16);
              }
            else if constexpr (_S_size<_Tp> <= 64)
              {
                __m128i __a = {};
                __builtin_memcpy(&__a, __mem, 16);
                const auto __b = _mm512_cvtepi8_epi32(__a);
                __builtin_memcpy(&__a, __mem + 16, 16);
                const auto __c = _mm512_cvtepi8_epi32(__a);
                if constexpr (_S_size<_Tp> <= 48)
                  {
                    __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
                    const auto __d = _mm512_cvtepi8_epi32(__a);
                    return _mm512_test_epi32_mask(__b, __b)
                             | (_mm512_test_epi32_mask(__c, __c) << 16)
                             | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
                  }
                else
                  {
                    __builtin_memcpy(&__a, __mem + 32, 16);
                    const auto __d = _mm512_cvtepi8_epi32(__a);
                    __builtin_memcpy(&__a, __mem + 48, _S_size<_Tp> - 48);
                    const auto __e = _mm512_cvtepi8_epi32(__a);
                    return _mm512_test_epi32_mask(__b, __b)
                             | (_mm512_test_epi32_mask(__c, __c) << 16)
                             | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
                             | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
                  }
              }
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
          return __vector_bitcast<_Tp>(
            __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
                                   -int(__mem[1]), -int(__mem[1])});
        else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
          {
            int __bool4 = 0;
            __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
            const auto __k = __to_intrin(
              (__vector_broadcast<4>(__bool4)
                 & __make_vector<int>(0x1, 0x100, 0x10000,
                                      _S_size<_Tp> == 4 ? 0x1000000 : 0))
                != 0);
            return __vector_bitcast<_Tp>(
              __concat(_mm_unpacklo_epi32(__k, __k),
                       _mm_unpackhi_epi32(__k, __k)));
          }
        else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
          {
            int __bools = 0;
            __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
            if constexpr (__have_sse2)
              {
                __m128i __k = _mm_cvtsi32_si128(__bools);
                __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
                return __vector_bitcast<_Tp, _S_size<_Tp>>(
                  _mm_unpacklo_epi16(__k, __k));
              }
            else
              {
                __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
                _mm_empty();
                return __vector_bitcast<_Tp, _S_size<_Tp>>(
                  _mm_cmpgt_ps(__k, __m128()));
              }
          }
        else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
          {
            __m128i __k = {};
            __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
            __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
            return __vector_bitcast<_Tp>(
              __concat(_mm_unpacklo_epi16(__k, __k),
                       _mm_unpackhi_epi16(__k, __k)));
          }
        else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
          {
            __m128i __k = {};
            __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
            __k = _mm_cmpgt_epi8(__k, __m128i());
            if constexpr (_S_size<_Tp> <= 8)
              return __vector_bitcast<_Tp, _S_size<_Tp>>(
                _mm_unpacklo_epi8(__k, __k));
            else
              return __concat(_mm_unpacklo_epi8(__k, __k),
                              _mm_unpackhi_epi8(__k, __k));
          }
        else
          return _Base::template _S_load<_Tp>(__mem);
      }

    // }}}
    // _S_from_bitmask {{{
    template <size_t _Np, typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
      {
        static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
        if constexpr (__is_avx512_abi<_Abi>())
          return __bits._M_to_bits();
        else
          return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
      }

    // }}}
    // _S_masked_load {{{2
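    // Loads bools only at the indices selected by __mask and blends them
    // into __merge. The AVX512BW_VL paths use a masked byte load and then
    // turn the 0/1 bytes into mask elements: bit-masks via vptestmb, vector
    // masks via a masked subtraction 0 - __x, which maps 1 to -1 (all bits
    // set) in exactly the selected element positions.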
    template <typename _Tp, size_t _Np>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
                     _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              {
                if constexpr (_Np <= 16)
                  {
                    const auto __a
                      = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
                    return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 32)
                  {
                    const auto __a
                      = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
                    return (__merge & ~__mask)
                             | _mm256_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 64)
                  {
                    const auto __a
                      = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
                    return (__merge & ~__mask)
                             | _mm512_test_epi8_mask(__a, __a);
                  }
                else
                  __assert_unreachable<_Tp>();
              }
            else
              {
                _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  __merge._M_set(__i, __mem[__i]);
                });
                return __merge;
              }
          }
        else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
                                           _mm256_mask_loadu_epi8(__m256i(),
                                                                  __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge
              = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
                                  __m128i(),
                                  _mm_mask_loadu_epi8(__m128i(), __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi32(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi64(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else
          return _Base::_S_masked_load(__merge, __mask, __mem);
        return __merge;
      }

    // _S_store {{{2
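    // Stores every mask element as one bool byte. AVX-512 bit-masks expand
    // to 0/1 bytes with maskz_set1_epi8; without AVX512BW_VL, up to 8 mask
    // bits are scattered into byte positions with BMI2 pdep. SSE/AVX vector
    // masks are packed down to bytes and masked to 0/1 before the store.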
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr void
      _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
      {
        if (__builtin_is_constant_evaluated())
          _Base::_S_store(__v, __mem);
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  if constexpr (_Np <= 16)
                    return _mm_maskz_set1_epi8(__data, 1);
                  else if constexpr (_Np <= 32)
                    return _mm256_maskz_set1_epi8(__data, 1);
                  else
                    return _mm512_maskz_set1_epi8(__data, 1);
                }(__v._M_data)),
                __mem);
            else if constexpr (_Np <= 8)
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>(
#if defined __x86_64__
                  __make_wrapper<_ULLong>(
                    _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
#else
                  __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
                                        _pdep_u32(__v._M_data >> 4,
                                                  0x01010101U))
#endif
                  ),
                __mem);
            else if constexpr (_Np <= 16)
              _mm512_mask_cvtepi32_storeu_epi8(
                __mem, 0xffffu >> (16 - _Np),
                _mm512_maskz_set1_epi32(__v._M_data, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (__is_sse_abi<_Abi>()) //{{{
          {
            if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                const auto __k = __vector_bitcast<int>(__v);
                __mem[0] = -__k[1];
                __mem[1] = -__k[3];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    const unsigned __bool4
                      = __vector_bitcast<_UInt>(_mm_packs_epi16(
                          _mm_packs_epi32(__intrin_bitcast<__m128i>(
                                            __to_intrin(__v)),
                                          __m128i()),
                          __m128i()))[0]
                          & 0x01010101u;
                    __builtin_memcpy(__mem, &__bool4, _Np);
                  }
                else if constexpr (__have_mmx)
                  {
                    const __m64 __k = _mm_cvtps_pi8(
                      __and(__to_intrin(__v), _mm_set1_ps(1.f)));
                    __builtin_memcpy(__mem, &__k, _Np);
                    _mm_empty();
                  }
                else
                  return _Base::_S_store(__v, __mem);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                _CommonImplX86::_S_store<_Np>(
                  __vector_bitcast<char>(_mm_packs_epi16(
                    __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
                    __m128i())),
                  __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else if constexpr (__is_avx_abi<_Abi>()) // {{{
          {
            if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                int __bool4{};
                if constexpr (__have_avx2)
                  __bool4 = _mm256_movemask_epi8(__k);
                else
                  __bool4 = (_mm_movemask_epi8(__lo128(__k))
                               | (_mm_movemask_epi8(__hi128(__k)) << 16));
                __bool4 &= 0x01010101;
                __builtin_memcpy(__mem, &__bool4, _Np);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
              {
                const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                const auto __k2
                  = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
                                   15);
                const auto __k3
                  = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
                _CommonImplX86::_S_store<_Np>(__k3, __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
              {
                if constexpr (__have_avx2)
                  {
                    const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
                    const auto __bools = __vector_bitcast<char>(
                      _mm_packs_epi16(__lo128(__x), __hi128(__x)));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
                else
                  {
                    const auto __bools
                      = 1
                          & __vector_bitcast<_UChar>(
                              _mm_packs_epi16(__lo128(__to_intrin(__v)),
                                              __hi128(__to_intrin(__v))));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
              }
            else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else
          __assert_unreachable<_Tp>();
      }

    // _S_masked_store {{{2
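    // Stores only the bools selected by __k. For AVX-512 this is a single
    // masked byte store of the 0/1-expanded mask; the other ABIs use the
    // generic element-wise fallback in _MaskImplBuiltin.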
    template <typename _Tp, size_t _Np>
      static inline void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
                      const _SimdWrapper<_Tp, _Np> __k) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            static_assert(is_same_v<_Tp, bool>);
            if constexpr (_Np <= 16 && __have_avx512bw_vl)
              _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 16)
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
                                               _mm512_maskz_set1_epi32(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw_vl)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      _mm256_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      __lo256(_mm512_maskz_set1_epi8(__v, 1)));
            else if constexpr (_Np <= 64 && __have_avx512bw)
              _mm512_mask_storeu_epi8(__mem, __k,
                                      _mm512_maskz_set1_epi8(__v, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }

    // logical and bitwise operators {{{2
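    // For AVX-512 bit-masks these map to the k-register intrinsics so the
    // operations can stay in mask registers. The 8-bit forms require
    // AVX512DQ and the 32-/64-bit forms AVX512BW; _kand_mask16 and friends
    // are plain AVX512F. Vector masks go through _Base.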
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data & __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data | __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>();
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kandn_mask8(__x._M_data,
                                  _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (_Np <= 16)
              return _kandn_mask16(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kandn_mask32(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kandn_mask64(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_not(__x);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data & __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_and(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data | __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_or(__x, __y);
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if (__builtin_is_constant_evaluated())
              return __x._M_data ^ __y._M_data;
            else if constexpr (__have_avx512dq && _Np <= 8)
              return _kxor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kxor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kxor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kxor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_xor(__x, __y);
      }

    //}}}2
    // _S_masked_assign {{{
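    // Bit-mask blend: __lhs = (~__k & __lhs) | (__k & __rhs). With a
    // constant bool on the right-hand side this degenerates to OR (assigning
    // true) or ANDNOT (assigning false).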
    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs)
      {
        __lhs._M_data
          = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
      }

    template <size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                       _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
      {
        if (__rhs)
          __lhs._M_data = __k._M_data | __lhs._M_data;
        else
          __lhs._M_data = ~__k._M_data & __lhs._M_data;
      }

    using _MaskImplBuiltin<_Abi>::_S_masked_assign;

    //}}}
    // _S_all_of {{{
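    // all_of: with SSE4.1, ptest's carry flag (__testc) tells whether every
    // bit of the implicit mask is also set in __a. The AVX-512 variant uses
    // kortest: OR-ing __kk with the inverse of the implicit mask gives
    // all-ones exactly when all in-range mask bits are set. Pre-SSE4.1,
    // movemask plus an integer compare does the same.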
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
                       == (1 << _Np) - 1;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
                       == (1 << _Np) - 1;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                       == (1 << (_Np * sizeof(_Tp))) - 1;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
            const auto __kk = __k._M_data._M_data;
            if constexpr (sizeof(__kk) == 1)
              {
                if constexpr (__have_avx512dq)
                  return _kortestc_mask8_u8(__kk, _Mask == 0xff
                                                    ? __kk
                                                    : __mmask8(~_Mask));
                else
                  return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
              }
            else if constexpr (sizeof(__kk) == 2)
              return _kortestc_mask16_u8(__kk, _Mask == 0xffff
                                                 ? __kk
                                                 : __mmask16(~_Mask));
            else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
              return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
                                                 ? __kk
                                                 : __mmask32(~_Mask));
            else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
              return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
                                                 ? __kk
                                                 : __mmask64(~_Mask));
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_any_of {{{
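    // any_of: __testz(__a, __b) is 1 iff (__a & __b) == 0, so a zero result
    // means at least one selected mask bit is set.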
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<
                                _Tp> || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 == __testz(__a, __b);
                  }
                else
                  return 0 == __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                       != 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                   != 0;
      }

    // }}}
    // _S_none_of {{{
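    // none_of: the complement of any_of; __testz/__movemask must see no set
    // bit within the implicit mask.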
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                if constexpr (_Abi::template _S_is_partial<
                                _Tp> || sizeof(__k) < 16)
                  {
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 != __testz(__a, __b);
                  }
                else
                  return 0 != __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else
              return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
                       == 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                   == 0;
      }

    // }}}
    // _S_some_of {{{
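    // some_of: true iff the mask is neither empty nor full. ptest's
    // "not zero and not carry" result (__testnzc) answers that in one
    // instruction; otherwise the movemask bits are compared against 0 and
    // against the all-bits value.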
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testnzc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_ps(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else if constexpr (is_same_v<_Tp, double>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_pd(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else
              {
                constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
                const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return _S_any_of(__k) && !_S_all_of(__k);
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_popcount {{{
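    // popcount: AVX-512 bit-masks use the scalar popcount builtins. Vector
    // masks either popcount the movemask bits or exploit that every element
    // is 0 or -1: summing the elements and negating the result counts the
    // true elements without the popcnt instruction.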
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
        const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (_Np > 32)
              return __builtin_popcountll(__kk);
            else
              return __builtin_popcount(__kk);
          }
        else
          {
            if constexpr (__have_popcnt)
              {
                int __bits
                  = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
                const int __count = __builtin_popcount(__bits);
                return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
              }
            else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                const int __mask = _mm_movemask_pd(__auto_bitcast(__kk));
                return __mask - (__mask >> 1);
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                auto __x = -(__lo128(__kk) + __hi128(__kk));
                return __x[0] + __x[1];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
                    __x = _mm_add_epi32(
                      __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                    __x = _mm_add_epi32(
                      __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
                    return -_mm_cvtsi128_si32(__x);
                  }
                else
                  return __builtin_popcount(
                    _mm_movemask_ps(__auto_bitcast(__kk)));
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi16(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi16(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                return -short(_mm_extract_epi16(__x, 0));
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              {
                auto __x = __to_intrin(__kk);
                __x = _mm_add_epi8(
                  __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                __x = _mm_add_epi8(
                  __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
                auto __y = -__vector_bitcast<_UChar>(__x);
                if constexpr (__have_sse4_1)
                  return __y[0] + __y[1];
                else
                  {
                    unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
                    return (__z & 0xff) + (__z >> 8);
                  }
              }
            else if constexpr (sizeof(__kk) == 32)
              {
                // The following works only as long as the implementations
                // above use a summation.
                using _I = __int_for_sizeof_t<_Tp>;
                const auto __as_int = __vector_bitcast<_I>(__kk);
                return _MaskImplX86<simd_abi::__sse>::_S_popcount(
                  simd_mask<_I, simd_abi::__sse>(__private_init,
                                                 __lo128(__as_int)
                                                   + __hi128(__as_int)));
              }
            else
              __assert_unreachable<_Tp>();
          }
      }

    // }}}
    // _S_find_first_set {{{
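    // find_first_set: for AVX-512 bit-masks the index of the first true
    // element equals the number of trailing zeros in the mask integer.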
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__countr_zero(__k._M_data._M_data);
        else
          return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
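    // find_last_set: __bit_width(__kk) - 1 is the index of the highest set
    // bit; applying _Abi::_S_masked first drops any bits beyond the ABI's
    // element count.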
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_avx512_abi<_Abi>())
          return std::__bit_width(_Abi::_S_masked(__k._M_data)._M_data) - 1;
        else
          return _Base::_S_find_last_set(__k);
      }

    // }}}
  };

// }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80