libstdc++
unicode.h
Go to the documentation of this file.
1// Unicode utilities -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
28 */
29
30#ifndef _GLIBCXX_UNICODE_H
31#define _GLIBCXX_UNICODE_H 1
32
33#if __cplusplus >= 202002L
34#include <array>
35#include <bit> // bit_width
36#include <charconv> // __detail::__from_chars_alnum_to_val_table
37#include <string_view>
38#include <cstdint>
39#include <bits/stl_algo.h>
40#include <bits/stl_iterator.h>
41#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42#include <bits/ranges_util.h> // view_interface
43
44namespace std _GLIBCXX_VISIBILITY(default)
45{
46_GLIBCXX_BEGIN_NAMESPACE_VERSION
47namespace __unicode
48{
49 // A Unicode code point that is not a high or low surrogate.
50 constexpr bool
51 __is_scalar_value(char32_t __c)
52 {
53 if (__c < 0xD800) [[likely]]
54 return true;
55 return 0xDFFF < __c && __c <= 0x10FFFF;
56 }
57
58 // A code point that can be encoded in a single code unit of type _CharT.
59 template<typename _CharT>
60 constexpr bool
61 __is_single_code_unit(char32_t __c)
62 {
63 if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
64 return __c < 0x7F; // ASCII character
65 else
66 return __c < __gnu_cxx::__int_traits<_CharT>::__max
67 && __is_scalar_value(__c);
68 }
69
70 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
71
72 struct _Repl
73 {
74 constexpr char32_t
75 operator()() const noexcept
76 { return 0xFFFD; }
77 };
78
79 struct _Null_sentinel_t
80 {
81 template<input_iterator _It>
82 requires default_initializable<iter_value_t<_It>>
83 && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84 friend constexpr auto
85 operator==(_It __it, _Null_sentinel_t)
86 { return *__it == iter_value_t<_It>{}; }
87 };
88
89 // An iterator over an input range of FromFmt code units that yields either
90 // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
91 // The code units from the input range are interpreted as Unicode code points
92 // and the iterator produces the individual code unit for each code point.
93 // Invalid sequences in the input are replaced with U+FFDD so that the result
94 // is always valid UTF-8, UTF-16, or UTF-32.
95 //
96 // The iterator knows the bounds of the underlying input range and will not
97 // read outside those bounds (incrementing or decrementing at the boundary
98 // is erroneously idempotent).
99 //
100 // On construction, the iterator attemps to decode a single code point from
101 // the input range and then encode it into an internal buffer in the output
102 // format, e.g. if the input is UTF-8 and the output is UTF-16, it might read
103 // three char8_t code units from the input and store two char16_t code units
104 // in its buffer. Incrementing the iterator will first iterate over buffer,
105 // yielding each code unit in turn, and then extract another code point from
106 // the input. Failure to extract a valid code point from the input will store
107 // U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.
108 template<typename _FromFmt, typename _ToFmt,
109 input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
110 typename _ErrorHandler = _Repl>
111 requires convertible_to<iter_value_t<_Iter>, _FromFmt>
112 class _Utf_iterator
113 {
114 static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
115
116 public:
117 using value_type = _ToFmt;
118 using difference_type = iter_difference_t<_Iter>;
119 using reference = value_type;
120 using iterator_concept
121 = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
122 bidirectional_iterator_tag>;
123
124 constexpr _Utf_iterator() = default;
125
126 constexpr
127 _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
128 requires bidirectional_iterator<_Iter>
129 : _M_first_and_curr{__first, __it}, _M_last(__last)
130 {
131 if (_M_curr() != _M_last)
132 _M_read();
133 else
134 _M_buf = {};
135 }
136
137 constexpr
138 _Utf_iterator(_Iter __it, _Sent __last)
139 requires (!bidirectional_iterator<_Iter>)
140 : _M_first_and_curr{__it}, _M_last(__last)
141 {
142 if (_M_curr() != _M_last)
143 _M_read();
144 else
145 _M_buf = {};
146 }
147
148 template<class _Iter2, class _Sent2>
149 requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
150 constexpr
151 _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
152 _ErrorHandler>& __other)
153 : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
154 _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
155 _M_last(__other._M_last)
156 { }
157
158 [[nodiscard]]
159 constexpr _Iter
160 begin() const requires bidirectional_iterator<_Iter>
161 { return _M_first(); }
162
163 [[nodiscard]]
164 constexpr _Sent
165 end() const { return _M_last; }
166
167 [[nodiscard]]
168 constexpr _Iter
169 base() const requires forward_iterator<_Iter>
170 { return _M_curr(); }
171
172 [[nodiscard]]
173 constexpr iter_difference_t<_Iter>
174 _M_units() const requires forward_iterator<_Iter>
175 { return _M_to_increment; }
176
177 [[nodiscard]]
178 constexpr value_type
179 operator*() const { return _M_buf[_M_buf_index]; }
180
181 constexpr _Utf_iterator&
182 operator++()
183 {
184 if (_M_buf_index + 1 < _M_buf_last)
185 ++_M_buf_index; // Move to the next code unit in the buffer.
186 else if (_M_curr() != _M_last)
187 {
188 // Advance past the current code point (for non-forward iterators
189 // we already moved there after decoding the last code point).
190 if constexpr (forward_iterator<_Iter>)
191 std::advance(_M_curr(), _M_to_increment);
192 if (_M_curr() == _M_last)
193 _M_buf_index = 0;
194 else // Decode next code point from the input and update buffer.
195 _M_read();
196 }
197 // else erroneous, but ignored for now.
198 return *this;
199 }
200
201 constexpr _Utf_iterator
202 operator++(int)
203 {
204 auto __tmp = *this;
205 ++*this;
206 return __tmp;
207 }
208
209 constexpr _Utf_iterator&
210 operator--() requires bidirectional_iterator<_Iter>
211 {
212 if (_M_buf_index > 0)
213 --_M_buf_index;
214 else if (_M_curr() != _M_first())
215 {
216 _M_read_reverse();
217 _M_buf_index = _M_buf_last - 1;
218 ranges::advance(_M_curr(), -_M_to_increment);
219 }
220 // else erroneous, but ignored for now.
221 return *this;
222 }
223
224 constexpr _Utf_iterator
225 operator--(int)
226 {
227 auto __tmp = *this;
228 --*this;
229 return __tmp;
230 }
231
232 [[nodiscard]]
233 friend constexpr bool
234 operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
235 requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
236 {
237 if constexpr (forward_iterator<_Iter>)
238 return __lhs._M_curr() == __rhs._M_curr()
239 && __lhs._M_buf_index == __rhs._M_buf_index;
240 else if (__lhs._M_curr() != __rhs._M_curr())
241 return false;
242 else if (__lhs._M_buf_index == __rhs._M_buf_index
243 && __lhs._M_buf_last == __rhs._M_buf_last)
244 return true;
245 else
246 return __lhs._M_buf_index == __lhs._M_buf_last
247 && __rhs._M_buf_index == __rhs._M_buf_last;
248 }
249
250 [[nodiscard]]
251 friend constexpr bool
252 operator==(_Utf_iterator __lhs, _Sent __rhs)
253 {
254 if constexpr (forward_iterator<_Iter>)
255 return __lhs._M_curr() == __rhs;
256 else
257 return __lhs._M_curr() == __rhs
258 && __lhs._M_buf_index == __lhs._M_buf_last;
259 }
260
261 private:
262 constexpr void
263 _M_read()
264 {
265 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
266 _M_read_utf8();
267 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
268 _M_read_utf16();
269 else
270 {
271 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
272 _M_read_utf32();
273 }
274 }
275
276 constexpr void
277 _M_read_reverse() requires bidirectional_iterator<_Iter>
278 {
279 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
280 _M_read_reverse_utf8();
281 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
282 _M_read_reverse_utf16();
283 else
284 {
285 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
286 _M_read_reverse_utf32();
287 }
288 }
289
290 template<typename>
291 struct _Guard
292 {
293 _Guard(void*, _Iter&) { }
294 };
295
296 template<typename _It> requires forward_iterator<_It>
297 struct _Guard<_It>
298 {
299 constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
300 _Utf_iterator* _M_this;
301 _It _M_orig;
302 };
303
304 constexpr char32_t
305 _M_read_utf8()
306 {
307 _Guard<_Iter> __g{this, _M_curr()};
308 char32_t __c{};
309 const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
310 uint8_t __u = *_M_curr()++;
311 uint8_t __to_incr = 1;
312 auto __incr = [&, this] {
313 ++__to_incr;
314 return ++_M_curr();
315 };
316
317 if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
318 __c = __u;
319 else if (__u < 0xC2) [[unlikely]]
320 __c = _S_error();
321 else if (_M_curr() == _M_last) [[unlikely]]
322 __c = _S_error();
323 else if (__u <= 0xDF) // 0xC2 to 0xDF
324 {
325 __c = __u & 0x1F;
326 __u = *_M_curr();
327
328 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
329 __c = _S_error();
330 else
331 {
332 __c = (__c << 6) | (__u & 0x3F);
333 __incr();
334 }
335 }
336 else if (__u <= 0xEF) // 0xE0 to 0xEF
337 {
338 const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
339 const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
340
341 __c = __u & 0x0F;
342 __u = *_M_curr();
343
344 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
345 __c = _S_error();
346 else if (__incr() == _M_last) [[unlikely]]
347 __c = _S_error();
348 else
349 {
350 __c = (__c << 6) | (__u & 0x3F);
351 __u = *_M_curr();
352
353 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
354 __c = _S_error();
355 else
356 {
357 __c = (__c << 6) | (__u & 0x3F);
358 __incr();
359 }
360 }
361 }
362 else if (__u <= 0xF4) // 0xF0 to 0xF4
363 {
364 const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
365 const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
366
367 __c = __u & 0x07;
368 __u = *_M_curr();
369
370 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
371 __c = _S_error();
372 else if (__incr() == _M_last) [[unlikely]]
373 __c = _S_error();
374 else
375 {
376 __c = (__c << 6) | (__u & 0x3F);
377 __u = *_M_curr();
378
379 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
380 __c = _S_error();
381 else if (__incr() == _M_last) [[unlikely]]
382 __c = _S_error();
383 else
384 {
385 __c = (__c << 6) | (__u & 0x3F);
386 __u = *_M_curr();
387
388 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
389 __c = _S_error();
390 else
391 {
392 __c = (__c << 6) | (__u & 0x3F);
393 __incr();
394 }
395 }
396 }
397 }
398 else [[unlikely]]
399 __c = _S_error();
400
401 _M_update(__c, __to_incr);
402
403 return __c;
404 }
405
406 constexpr void
407 _M_read_utf16()
408 {
409 _Guard<_Iter> __g{this, _M_curr()};
410 char32_t __c{};
411 uint16_t __u = *_M_curr()++;
412 uint8_t __to_incr = 1;
413
414 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
415 __c = __u;
416 else if (__u < 0xDC00 && _M_curr() != _M_last)
417 {
418 uint16_t __u2 = *_M_curr();
419 if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
420 __c = _S_error();
421 else
422 {
423 ++_M_curr();
424 __to_incr = 2;
425 uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
426 uint32_t __w = (__u >> 6) & 0x1F;
427 __c = (__w + 1) << 16 | __x;
428 }
429 }
430 else
431 __c = _S_error();
432
433 _M_update(__c, __to_incr);
434 }
435
436 constexpr void
437 _M_read_utf32()
438 {
439 _Guard<_Iter> __g{this, _M_curr()};
440 char32_t __c = *_M_curr()++;
441 if (!__is_scalar_value(__c)) [[unlikely]]
442 __c = _S_error();
443 _M_update(__c, 1);
444 }
445
446 constexpr void
447 _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
448 {
449 const auto __first = _M_first();
450 auto __curr = _M_curr();
451 // The code point we decode:
452 char32_t __c{};
453 // The last code unit read:
454 uint8_t __u = *--__curr;
455 // Count of bytes read:
456 uint8_t __to_incr = 1;
457
458 if (__u <= 0x7F) [[likely]]
459 {
460 _M_update(__u, 1);
461 return;
462 }
463
464 // Continuation bytes match 10xxxxxx
465 auto __is_continuation = [](uint8_t __b) {
466 return (__b & 0xC0) == 0x80;
467 };
468 // 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
469 // 0xF5-0xFF would produce code points above U+10FFFF
470 auto __is_invalid = [](uint8_t __b) {
471 return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
472 };
473
474 // No valid or invalid multibyte sequence is longer than 4 bytes,
475 // so skip back over at most four bytes.
476 while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
477 {
478 ++__to_incr;
479 __u = *--__curr;
480 }
481
482 // If the last byte read was a continuation byte then either we read
483 // four continuation bytes, or stopped at the start of the sequence.
484 // Either way, the maximal subparts are the individual continuation
485 // bytes so each one should be replaced with U+FFFD.
486 if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
487 {
488 // Either found four continuation bytes (maximum allowed is three)
489 // or first non-continuation byte is an invalid UTF-8 code unit.
490 _M_update(_S_error(), 1);
491 return;
492 }
493 // __u is a valid start byte so use countl_one to get the expected
494 // length of the multibyte sequence that starts with this byte.
495 int __seq_length = std::countl_one((unsigned char)__u);
496 if (__seq_length < __to_incr) [[unlikely]]
497 {
498 // If the expected number of continuation bytes is less than
499 // the number we found, then the last continuation byte is a
500 // maximal subpart and the decremented iterator points to it.
501 _M_update(_S_error(), 1);
502 return;
503 }
504
505 auto __orig = std::__exchange(_M_curr(), std::move(__curr));
506 if (_M_read_utf8() == _S_error()) [[unlikely]]
507 {
508 if (_M_to_increment < __to_incr) // Read truncated sequence, set
509 _M_to_increment = 1; // curr to last continuation byte.
510 }
511
512 _M_curr() = std::move(__orig);
513 // operator--() will move back by _M_to_increment
514 }
515
516 constexpr void
517 _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
518 {
519 _Guard<_Iter> __g{this, _M_curr()};
520 char32_t __c{};
521 uint16_t __u = *--_M_curr();
522 uint8_t __to_incr = 1;
523
524 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
525 __c = __u;
526 else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
527 {
528 // read a low surrogate, expect a high surrogate before it.
529 uint16_t __u2 = *--_M_curr();
530 if (__u2 < 0xD800 || __u2 > 0xDC00) [[unlikely]]
531 __c = _S_error(); // unpaired low surrogate
532 else
533 {
534 __to_incr = 2;
535 uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
536 uint32_t __w = (__u2 >> 6) & 0x1F;
537 __c = (__w + 1) << 16 | __x;
538 }
539 }
540 else
541 __c = _S_error(); // unpaired surrogate
542
543 _M_update(__c, __to_incr);
544 }
545
546 constexpr void
547 _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
548 {
549 _Guard<_Iter> __g{this, _M_curr()};
550 char32_t __c = *--_M_curr();
551 if (!__is_scalar_value(__c)) [[unlikely]]
552 __c = _S_error();
553 _M_update(__c, 1);
554 }
555
556 // Encode the code point __c as one or more code units in _M_buf.
557 constexpr void
558 _M_update(char32_t __c, uint8_t __to_incr)
559 {
560 _M_to_increment = __to_incr;
561 _M_buf_index = 0;
562 if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
563 {
564 _M_buf[0] = __c;
565 _M_buf_last = 1;
566 }
567 else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
568 {
569 if (__is_single_code_unit<_ToFmt>(__c))
570 {
571 _M_buf[0] = __c;
572 _M_buf[1] = 0;
573 _M_buf_last = 1;
574 }
575 else
576 {
577 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
578 const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
579 char16_t __lead = __lead_offset + (__c >> 10);
580 char16_t __trail = 0xDC00 + (__c & 0x3FF);
581 _M_buf[0] = __lead;
582 _M_buf[1] = __trail;
583 _M_buf_last = 2;
584 }
585 }
586 else
587 {
588 static_assert(sizeof(_ToFmt) == 1);
589 int __bits = std::bit_width((uint32_t)__c);
590 if (__bits <= 7) [[likely]]
591 {
592 _M_buf[0] = __c;
593 _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
594 _M_buf_last = 1;
595 }
596 else if (__bits <= 11)
597 {
598 _M_buf[0] = 0xC0 | (__c >> 6);
599 _M_buf[1] = 0x80 | (__c & 0x3F);
600 _M_buf[2] = _M_buf[3] = 0;
601 _M_buf_last = 2;
602 }
603 else if (__bits <= 16)
604 {
605 _M_buf[0] = 0xE0 | (__c >> 12);
606 _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
607 _M_buf[2] = 0x80 | (__c & 0x3F);
608 _M_buf[3] = 0;
609 _M_buf_last = 3;
610 }
611 else
612 {
613 _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
614 _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
615 _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
616 _M_buf[3] = 0x80 | (__c & 0x3F);
617 _M_buf_last = 4;
618 }
619 }
620 }
621
622 constexpr char32_t
623 _S_error()
624 {
625 char32_t __c = _ErrorHandler()();
626 __glibcxx_assert(__is_scalar_value(__c));
627 return __c;
628 }
629
630 constexpr _Iter
631 _M_first() const requires bidirectional_iterator<_Iter>
632 { return _M_first_and_curr._M_first; }
633
634 constexpr _Iter&
635 _M_curr() { return _M_first_and_curr._M_curr; }
636
637 constexpr _Iter
638 _M_curr() const { return _M_first_and_curr._M_curr; }
639
640 // _M_first is not needed for non-bidirectional ranges.
641 template<typename _It>
642 struct _First_and_curr
643 {
644 _First_and_curr() = default;
645
646 constexpr
647 _First_and_curr(_It __curr) : _M_curr(__curr) { }
648
649 template<convertible_to<_It> _It2>
650 constexpr
651 _First_and_curr(const _First_and_curr<_It2>& __other)
652 : _M_curr(__other._M_curr) { }
653
654 // First code unit of the current code point for forward iterators,
655 // past-the-end of the current code point for input iterators.
656 _It _M_curr;
657 };
658
659 template<typename _It> requires bidirectional_iterator<_It>
660 struct _First_and_curr<_It>
661 {
662 _First_and_curr() = default;
663
664 constexpr
665 _First_and_curr(_It __first, _It __curr)
666 : _M_first(__first), _M_curr(__curr) { }
667
668 template<convertible_to<_It> _It2>
669 constexpr
670 _First_and_curr(const _First_and_curr<_It2>& __other)
671 : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
672
673 _It _M_first; // Start of the underlying range.
674 _It _M_curr; // First code unit of the current code point.
675 };
676
677 // Iterators pointing to the start of the underlying range and to the
678 // start (or end, for non-forward iterators) of the current code point.
679 _First_and_curr<_Iter> _M_first_and_curr;
680
681 // The end of the underlying input range.
682 [[no_unique_address]] _Sent _M_last;
683
684 // Buffer holding the individual code units of the current code point.
685 array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
686
687 uint8_t _M_buf_index = 0; // Index of current code unit in the buffer.
688 uint8_t _M_buf_last = 0; // Number of code units in the buffer.
689 uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment.
690
691 template<typename _FromFmt2, typename _ToFmt2,
692 input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
693 typename _ErrHandler>
694 requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
695 friend class _Utf_iterator;
696 };
697
698 template<typename _ToFormat, ranges::input_range _Range>
699 class _Utf_view
700 : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
701 {
702 using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
703 _ToFormat, ranges::iterator_t<_Range>,
704 ranges::sentinel_t<_Range>>;
705
706 template<typename _Iter, typename _Sent>
707 constexpr auto
708 _M_begin(_Iter __first, _Sent __last)
709 {
710 if constexpr (bidirectional_iterator<_Iter>)
711 return _Iterator(__first, __first, __last);
712 else
713 return _Iterator(__first, __last);
714 }
715
716 template<typename _Iter, typename _Sent>
717 constexpr auto
718 _M_end(_Iter __first, _Sent __last)
719 {
720 if constexpr (!is_same_v<_Iter, _Sent>)
721 return __last;
722 else if constexpr (bidirectional_iterator<_Iter>)
723 return _Iterator(__first, __last, __last);
724 else
725 return _Iterator(__last, __last);
726 }
727
728 _Range _M_base;
729
730 public:
731 constexpr explicit
732 _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
733
734 constexpr auto begin()
735 { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
736
737 constexpr auto end()
738 { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
739
740 constexpr bool empty() const { return ranges::empty(_M_base); }
741 };
742
743#ifdef __cpp_char8_t
744 template<typename _View>
745 using _Utf8_view = _Utf_view<char8_t, _View>;
746#else
747 template<typename _View>
748 using _Utf8_view = _Utf_view<char, _View>;
749#endif
750 template<typename _View>
751 using _Utf16_view = _Utf_view<char16_t, _View>;
752 template<typename _View>
753 using _Utf32_view = _Utf_view<char32_t, _View>;
754
755inline namespace __v16_0_0
756{
757#define _GLIBCXX_GET_UNICODE_DATA 160000
758#include "unicode-data.h"
759#ifdef _GLIBCXX_GET_UNICODE_DATA
760# error "Invalid unicode data"
761#endif
762
763 // The field width of a code point.
764 constexpr int
765 __field_width(char32_t __c) noexcept
766 {
767 if (__c < __width_edges[0]) [[likely]]
768 return 1;
769
770 auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
771 return (__p - __width_edges) % 2 + 1;
772 }
773
774 // @pre c <= 0x10FFFF
775 constexpr bool
776 __should_escape_category(char32_t __c) noexcept
777 {
778 constexpr uint32_t __mask = 0x01;
779 auto* __end = std::end(__escape_edges);
780 auto* __p = std::lower_bound(__escape_edges, __end,
781 (__c << 1u) + 2);
782 return __p[-1] & __mask;
783 }
784
785
786 // @pre c <= 0x10FFFF
787 constexpr _Gcb_property
788 __grapheme_cluster_break_property(char32_t __c) noexcept
789 {
790 constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
791 auto* __end = std::end(__gcb_edges);
792 auto* __p = std::lower_bound(__gcb_edges, __end,
793 (__c << __gcb_shift_bits) | __mask);
794 return _Gcb_property(__p[-1] & __mask);
795 }
796
797 constexpr bool
798 __is_incb_linker(char32_t __c) noexcept
799 {
800 const auto __end = std::end(__incb_linkers);
801 // Array is small enough that linear search is faster than binary search.
802 return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
803 }
804
805 // @pre c <= 0x10FFFF
806 constexpr _InCB
807 __incb_property(char32_t __c) noexcept
808 {
809 if ((__c << 2) < __incb_edges[0]) [[likely]]
810 return _InCB(0);
811
812 constexpr uint32_t __mask = 0x3;
813 auto* __end = std::end(__incb_edges);
814 auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
815 return _InCB(__p[-1] & __mask);
816 }
817
818 constexpr bool
819 __is_extended_pictographic(char32_t __c)
820 {
821 if (__c < __xpicto_edges[0]) [[likely]]
822 return 0;
823
824 auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
825 return (__p - __xpicto_edges) % 2;
826 }
827
828 struct _Grapheme_cluster_iterator_base
829 {
830 char32_t _M_c; // First code point in the cluster.
831 _Gcb_property _M_prop; // GCB property of _M_c.
832 enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
833 _XPicto _M_xpicto_seq_state = _XPicto::_Init;
834 unsigned char _M_RI_count = 0;
835 bool _M_incb_linker_seen = false;
836
837 constexpr void
838 _M_reset(char32_t __c, _Gcb_property __p)
839 {
840 _M_c = __c;
841 _M_prop = __p;
842 _M_xpicto_seq_state = _XPicto::_Init;
843 _M_RI_count = 0;
844 _M_incb_linker_seen = false;
845 }
846
847 constexpr void
848 _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
849 {
850 if (_M_xpicto_seq_state == _XPicto::_Failed)
851 return;
852
853 auto __next_state = _XPicto::_Failed;
854 if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
855 {
856 if (__p == _Gcb_property::_Gcb_ZWJ)
857 {
858 if (_M_xpicto_seq_state == _XPicto::_Matched)
859 __next_state = _XPicto::_Zwj;
860 // We check _M_c here so that we do the lookup at most once,
861 // and only for clusters containing at least one ZWJ.
862 else if (__is_extended_pictographic(_M_c))
863 __next_state = _XPicto::_Zwj;
864 }
865 else if (__p == _Gcb_property::_Gcb_Extend)
866 __next_state = _M_xpicto_seq_state; // no change
867 }
868 else // Zwj
869 {
870 // This assumes that all \p{Extended_Pictographic} emoji have
871 // Grapheme_Cluster_Break=Other.
872 if (__p == _Gcb_property::_Gcb_Other
873 && __is_extended_pictographic(__c))
874 __next_state = _XPicto::_Matched;
875 }
876 _M_xpicto_seq_state = __next_state;
877 }
878
879 constexpr void
880 _M_update_ri_count(_Gcb_property __p)
881 {
882 if (__p == _Gcb_property::_Gcb_Regional_Indicator)
883 ++_M_RI_count;
884 else
885 _M_RI_count = 0;
886 }
887
888 constexpr void
889 _M_update_incb_state(char32_t __c, _Gcb_property)
890 {
891 if (__is_incb_linker(__c))
892 _M_incb_linker_seen = true;
893 }
894 };
895
896 // Split a range into extended grapheme clusters.
897 template<ranges::forward_range _View> requires ranges::view<_View>
898 class _Grapheme_cluster_view
899 : public ranges::view_interface<_Grapheme_cluster_view<_View>>
900 {
901 public:
902
903 constexpr
904 _Grapheme_cluster_view(_View __v)
905 : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
906 { }
907
908 constexpr auto begin() const { return _M_begin; }
909 constexpr auto end() const { return _M_begin.end(); }
910
911 private:
912 struct _Iterator : private _Grapheme_cluster_iterator_base
913 {
914 private:
915 // Iterator over the underlying code points.
916 using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
917
918 public:
919 // TODO: Change value_type to be subrange<_U32_iterator> instead?
920 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
921 // That would be the whole cluster, not just the first code point.
922 // Would need to store two iterators and find end of current cluster
923 // on increment, so operator* returns value_type(_M_base, _M_next).
924 using value_type = char32_t;
925 using iterator_concept = forward_iterator_tag;
926 using difference_type = ptrdiff_t;
927
928 constexpr
929 _Iterator(_U32_iterator __i)
930 : _M_base(__i)
931 {
932 if (__i != __i.end())
933 {
934 _M_c = *__i;
935 _M_prop = __grapheme_cluster_break_property(_M_c);
936 }
937 }
938
939 // The first code point of the current extended grapheme cluster.
940 constexpr value_type
941 operator*() const
942 { return _M_c; }
943
944 constexpr auto
945 operator->() const
946 { return &_M_c; }
947
948 // Move to the next extended grapheme cluster.
949 constexpr _Iterator&
950 operator++()
951 {
952 const auto __end = _M_base.end();
953 if (_M_base != __end)
954 {
955 auto __p_prev = _M_prop;
956 auto __it = _M_base;
957 while (++__it != __end)
958 {
959 char32_t __c = *__it;
960 auto __p = __grapheme_cluster_break_property(*__it);
961 _M_update_xpicto_seq_state(__c, __p);
962 _M_update_ri_count(__p);
963 _M_update_incb_state(__c, __p);
964 if (_M_is_break(__p_prev, __p, __it))
965 {
966 // Found a grapheme cluster break
967 _M_reset(__c, __p);
968 break;
969 }
970 __p_prev = __p;
971 }
972 _M_base = __it;
973 }
974 return *this;
975 }
976
977 constexpr _Iterator
978 operator++(int)
979 {
980 auto __tmp = *this;
981 ++*this;
982 return __tmp;
983 }
984
985 constexpr bool
986 operator==(const _Iterator& __i) const
987 { return _M_base == __i._M_base; }
988
989 // This supports iter != iter.end()
990 constexpr bool
991 operator==(const ranges::sentinel_t<_View>& __i) const
992 { return _M_base == __i; }
993
994 // Iterator to the start of the current cluster.
995 constexpr auto base() const { return _M_base.base(); }
996
997 // The end of the underlying view (not the end of the current cluster!)
998 constexpr auto end() const { return _M_base.end(); }
999
1000 // Field width of the first code point in the cluster.
1001 constexpr int
1002 width() const noexcept
1003 { return __field_width(_M_c); }
1004
1005 private:
1006 _U32_iterator _M_base;
1007
1008 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
1009 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
1010 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
1011 // Return true if there is a break between code point with property p1
1012 // and code point with property p2.
1013 constexpr bool
1014 _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
1015 _U32_iterator __curr) const
1016 {
1017 using enum _Gcb_property;
1018
1019 if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
1020 return true; // Break after Control or LF.
1021
1022 if (__p1 == _Gcb_CR)
1023 return __p2 != _Gcb_LF; // Do not break between a CR and LF.
1024
1025 // Rule GB5
1026 if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
1027 return true; // Break before Control, CR or LF.
1028
1029 // Rule GB6
1030 if (__p1 == _Gcb_L)
1031 switch (__p2)
1032 {
1033 case _Gcb_L:
1034 case _Gcb_V:
1035 case _Gcb_LV:
1036 case _Gcb_LVT:
1037 return false; // Do not break Hangul syllable sequences.
1038 default:
1039 return true;
1040 }
1041
1042 // Rule GB7
1043 if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
1044 switch (__p2)
1045 {
1046 case _Gcb_V:
1047 case _Gcb_T:
1048 return false; // Do not break Hangul syllable sequences.
1049 default:
1050 return true;
1051 }
1052
1053 // Rule GB8
1054 if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
1055 return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
1056
1057 // Rule GB9
1058 if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
1059 return false; // Do not break before extending characters or ZWJ.
1060
1061 // The following GB9x rules only apply to extended grapheme clusters,
1062 // which is what the C++ standard uses (not legacy grapheme clusters).
1063
1064 // Rule GB9a
1065 if (__p2 == _Gcb_SpacingMark)
1066 return false; // Do not break before SpacingMarks,
1067 // Rule GB9b
1068 if (__p1 == _Gcb_Prepend)
1069 return false; // or after Prepend characters.
1070
1071 // Rule GB9c (Unicode 15.1.0)
1072 // Do not break within certain combinations with
1073 // Indic_Conjunct_Break (InCB)=Linker.
1074 if (_M_incb_linker_seen
1075 && __incb_property(_M_c) == _InCB::_Consonant
1076 && __incb_property(*__curr) == _InCB::_Consonant)
1077 {
1078 // Match [_M_base, __curr] against regular expression
1079 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
1080 bool __have_linker = false;
1081 auto __it = _M_base;
1082 while (++__it != __curr)
1083 {
1084 if (__is_incb_linker(*__it))
1085 __have_linker = true;
1086 else
1087 {
1088 auto __incb = __incb_property(*__it);
1089 if (__incb == _InCB::_Consonant)
1090 __have_linker = false;
1091 else if (__incb != _InCB::_Extend)
1092 break;
1093 }
1094 }
1095 if (__it == __curr && __have_linker)
1096 return false;
1097 }
1098
1099 // Rule GB11
1100 // Do not break within emoji modifier sequences
1101 // or emoji zwj sequences.
1102 if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
1103 return false;
1104
1105 // Rules GB12 and GB13
1106 // Do not break within emoji flag sequences. That is, do not break
1107 // between regional indicator (RI) symbols if there is an odd number
1108 // of RI characters before the break point.
1109 if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
1110 return (_M_RI_count & 1) == 0;
1111
1112 // Rule GB999
1113 return true; // Otherwise, break everywhere.
1114 }
1115 };
1116
1117 _Iterator _M_begin;
1118 };
1119
1120} // namespace __v16_0_0
1121
1122 // Return the field width of a string.
1123 template<typename _CharT>
1124 constexpr size_t
1125 __field_width(basic_string_view<_CharT> __s)
1126 {
1127 if (__s.empty()) [[unlikely]]
1128 return 0;
1129 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1130 auto __it = __gc.begin();
1131 const auto __end = __gc.end();
1132 size_t __n = __it.width();
1133 while (++__it != __end)
1134 __n += __it.width();
1135 return __n;
1136 }
1137
1138 // Truncate a string to at most `__max` field width units, and return the
1139 // resulting field width.
1140 template<typename _CharT>
1141 constexpr size_t
1142 __truncate(basic_string_view<_CharT>& __s, size_t __max)
1143 {
1144 if (__s.empty()) [[unlikely]]
1145 return 0;
1146
1147 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1148 auto __it = __gc.begin();
1149 const auto __end = __gc.end();
1150 size_t __n = __it.width();
1151 if (__n > __max)
1152 {
1153 __s = {};
1154 return 0;
1155 }
1156 while (++__it != __end)
1157 {
1158 size_t __n2 = __n + __it.width();
1159 if (__n2 > __max)
1160 {
1161 __s = basic_string_view<_CharT>(__s.begin(), __it.base());
1162 return __n;
1163 }
1164 __n = __n2;
1165 }
1166 return __n;
1167 }
1168
1169 template<typename _CharT>
1170 consteval bool
1171 __literal_encoding_is_unicode()
1172 {
1173 if constexpr (is_same_v<_CharT, char16_t>)
1174 return true;
1175 else if constexpr (is_same_v<_CharT, char32_t>)
1176 return true;
1177#ifdef __cpp_char8_t
1178 else if constexpr (is_same_v<_CharT, char8_t>)
1179 return true;
1180#endif
1181
1182 const char* __enc = "";
1183
1184#ifdef __GNUC_EXECUTION_CHARSET_NAME
1185 auto __remove_iso10646_prefix = [](const char* __s) {
1186 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1187 if (__s[0] == 'I' || __s[0] == 'i')
1188 if (__s[1] == 'S' || __s[1] == 's')
1189 if (__s[2] == 'O' || __s[2] == 'o')
1190 if (string_view(__s + 3).starts_with("-10646/"))
1191 return __s + 10;
1192 return __s;
1193 };
1194
1195 if constexpr (is_same_v<_CharT, char>)
1196 __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1197# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1198 else
1199 __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1200# endif
1201
1202 if ((__enc[0] == 'U' || __enc[0] == 'u')
1203 && (__enc[1] == 'T' || __enc[1] == 't')
1204 && (__enc[2] == 'F' || __enc[2] == 'f'))
1205 {
1206 __enc += 3;
1207 if (__enc[0] == '-')
1208 ++__enc;
1209 if (__enc[0] == '8')
1210 return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1211 else if constexpr (!is_same_v<_CharT, char>)
1212 {
1213 string_view __s(__enc);
1214 if (__s.ends_with("//"))
1215 __s.remove_suffix(2);
1216 if (__s.ends_with("LE") || __s.ends_with("BE"))
1217 __s.remove_suffix(2);
1218 return __s == "16" || __s == "32";
1219 }
1220 }
1221#elif defined __clang_literal_encoding__
1222 if constexpr (is_same_v<_CharT, char>)
1223 __enc = __clang_literal_encoding__;
1224# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1225 else
1226 __enc = __clang_wide_literal_encoding__;
1227# endif
1228 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1229 string_view __s(__enc);
1230 if (__s == "UTF-8")
1231 return true;
1232 else if constexpr (!is_same_v<_CharT, char>)
1233 return __s == "UTF-16" || __s == "UTF-32";
1234#endif
1235
1236 return false;
1237 }
1238
1239 consteval bool
1240 __literal_encoding_is_utf8()
1241 { return __literal_encoding_is_unicode<char>(); }
1242
1243 consteval bool
1244 __literal_encoding_is_extended_ascii()
1245 {
1246 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1247 && 'a' == 0x61 && 'z' == 0x7a;
1248 }
1249
1250 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1251 constexpr bool
1252 __charset_alias_match(string_view __a, string_view __b)
1253 {
1254 // Map alphanumeric chars to their base 64 value, everything else to 127.
1255 auto __map = [](char __c, bool& __num) -> unsigned char {
1256 if (__c == '0') [[unlikely]]
1257 return __num ? 0 : 127;
1258 const auto __v = __detail::__from_chars_alnum_to_val(__c);
1259 __num = __v < 10;
1260 return __v;
1261 };
1262
1263 auto __ptr_a = __a.begin(), __end_a = __a.end();
1264 auto __ptr_b = __b.begin(), __end_b = __b.end();
1265 bool __num_a = false, __num_b = false;
1266
1267 while (true)
1268 {
1269 // Find the value of the next alphanumeric character in each string.
1270 unsigned char __val_a{}, __val_b{};
1271 while (__ptr_a != __end_a
1272 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1273 ++__ptr_a;
1274 while (__ptr_b != __end_b
1275 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1276 ++__ptr_b;
1277 // Stop when we reach the end of a string, or get a mismatch.
1278 if (__ptr_a == __end_a)
1279 return __ptr_b == __end_b;
1280 else if (__ptr_b == __end_b)
1281 return false;
1282 else if (__val_a != __val_b)
1283 return false; // Found non-matching characters.
1284 ++__ptr_a;
1285 ++__ptr_b;
1286 }
1287 return true;
1288 }
1289
1290} // namespace __unicode
1291
1292namespace ranges
1293{
1294 template<typename _To, typename _Range>
1295 inline constexpr bool
1296 enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1297 = enable_borrowed_range<_Range>;
1298
1299 template<typename _Range>
1300 inline constexpr bool
1301 enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1302 = enable_borrowed_range<_Range>;
1303} // namespace ranges
1304
1305_GLIBCXX_END_NAMESPACE_VERSION
1306} // namespace std
1307#endif // C++20
1308#endif // _GLIBCXX_UNICODE_H
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition move.h:138
constexpr _Tp && forward(typename std::remove_reference< _Tp >::type &__t) noexcept
Forward an lvalue.
Definition move.h:72
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition valarray:1251
ISO C++ entities toplevel namespace is std.
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.