libstdc++
text_encoding
Go to the documentation of this file.
1// <text_encoding> -*- C++ -*-
2
3// Copyright The GNU Toolchain Authors.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file include/text_encoding
26 * This is a Standard C++ Library header.
27 */
28
29#ifndef _GLIBCXX_TEXT_ENCODING
30#define _GLIBCXX_TEXT_ENCODING
31
32#ifdef _GLIBCXX_SYSHDR
33#pragma GCC system_header
34#endif
35
36#include <bits/requires_hosted.h>
37
38#define __glibcxx_want_text_encoding
39#include <bits/version.h>
40
41#ifdef __cpp_lib_text_encoding
42#include <compare>
43#include <string_view>
44#include <bits/functional_hash.h> // hash
45#include <bits/ranges_util.h> // view_interface
46#include <bits/unicode.h> // __charset_alias_match
47#include <ext/numeric_traits.h> // __int_traits
48
49namespace std _GLIBCXX_VISIBILITY(default)
50{
51_GLIBCXX_BEGIN_NAMESPACE_VERSION
52
53 /**
54 * @brief An interface for accessing the IANA Character Sets registry.
55 * @ingroup locales
56 * @since C++23
57 */
58 struct text_encoding
59 {
60 private:
61 struct _Rep
62 {
63 using id = __INT_LEAST32_TYPE__;
64 id _M_id;
65 const char* _M_name;
66
67 friend constexpr bool
68 operator<(const _Rep& __r, id __m) noexcept
69 { return __r._M_id < __m; }
70
71 friend constexpr bool
72 operator==(const _Rep& __r, string_view __name) noexcept
73 { return __r._M_name == __name; }
74 };
75
76 public:
77 static constexpr size_t max_name_length = 63;
78
79 enum class id : _Rep::id
80 {
81 other = 1,
82 unknown = 2,
83 ASCII = 3,
84 ISOLatin1 = 4,
85 ISOLatin2 = 5,
86 ISOLatin3 = 6,
87 ISOLatin4 = 7,
88 ISOLatinCyrillic = 8,
89 ISOLatinArabic = 9,
90 ISOLatinGreek = 10,
91 ISOLatinHebrew = 11,
92 ISOLatin5 = 12,
93 ISOLatin6 = 13,
94 ISOTextComm = 14,
95 HalfWidthKatakana = 15,
96 JISEncoding = 16,
97 ShiftJIS = 17,
98 EUCPkdFmtJapanese = 18,
99 EUCFixWidJapanese = 19,
100 ISO4UnitedKingdom = 20,
101 ISO11SwedishForNames = 21,
102 ISO15Italian = 22,
103 ISO17Spanish = 23,
104 ISO21German = 24,
105 ISO60DanishNorwegian = 25,
106 ISO69French = 26,
107 ISO10646UTF1 = 27,
108 ISO646basic1983 = 28,
109 INVARIANT = 29,
110 ISO2IntlRefVersion = 30,
111 NATSSEFI = 31,
112 NATSSEFIADD = 32,
113 ISO10Swedish = 35,
114 KSC56011987 = 36,
115 ISO2022KR = 37,
116 EUCKR = 38,
117 ISO2022JP = 39,
118 ISO2022JP2 = 40,
119 ISO13JISC6220jp = 41,
120 ISO14JISC6220ro = 42,
121 ISO16Portuguese = 43,
122 ISO18Greek7Old = 44,
123 ISO19LatinGreek = 45,
124 ISO25French = 46,
125 ISO27LatinGreek1 = 47,
126 ISO5427Cyrillic = 48,
127 ISO42JISC62261978 = 49,
128 ISO47BSViewdata = 50,
129 ISO49INIS = 51,
130 ISO50INIS8 = 52,
131 ISO51INISCyrillic = 53,
132 ISO54271981 = 54,
133 ISO5428Greek = 55,
134 ISO57GB1988 = 56,
135 ISO58GB231280 = 57,
136 ISO61Norwegian2 = 58,
137 ISO70VideotexSupp1 = 59,
138 ISO84Portuguese2 = 60,
139 ISO85Spanish2 = 61,
140 ISO86Hungarian = 62,
141 ISO87JISX0208 = 63,
142 ISO88Greek7 = 64,
143 ISO89ASMO449 = 65,
144 ISO90 = 66,
145 ISO91JISC62291984a = 67,
146 ISO92JISC62991984b = 68,
147 ISO93JIS62291984badd = 69,
148 ISO94JIS62291984hand = 70,
149 ISO95JIS62291984handadd = 71,
150 ISO96JISC62291984kana = 72,
151 ISO2033 = 73,
152 ISO99NAPLPS = 74,
153 ISO102T617bit = 75,
154 ISO103T618bit = 76,
155 ISO111ECMACyrillic = 77,
156 ISO121Canadian1 = 78,
157 ISO122Canadian2 = 79,
158 ISO123CSAZ24341985gr = 80,
159 ISO88596E = 81,
160 ISO88596I = 82,
161 ISO128T101G2 = 83,
162 ISO88598E = 84,
163 ISO88598I = 85,
164 ISO139CSN369103 = 86,
165 ISO141JUSIB1002 = 87,
166 ISO143IECP271 = 88,
167 ISO146Serbian = 89,
168 ISO147Macedonian = 90,
169 ISO150 = 91,
170 ISO151Cuba = 92,
171 ISO6937Add = 93,
172 ISO153GOST1976874 = 94,
173 ISO8859Supp = 95,
174 ISO10367Box = 96,
175 ISO158Lap = 97,
176 ISO159JISX02121990 = 98,
177 ISO646Danish = 99,
178 USDK = 100,
179 DKUS = 101,
180 KSC5636 = 102,
181 Unicode11UTF7 = 103,
182 ISO2022CN = 104,
183 ISO2022CNEXT = 105,
184 UTF8 = 106,
185 ISO885913 = 109,
186 ISO885914 = 110,
187 ISO885915 = 111,
188 ISO885916 = 112,
189 GBK = 113,
190 GB18030 = 114,
191 OSDEBCDICDF0415 = 115,
192 OSDEBCDICDF03IRV = 116,
193 OSDEBCDICDF041 = 117,
194 ISO115481 = 118,
195 KZ1048 = 119,
196 UCS2 = 1000,
197 UCS4 = 1001,
198 UnicodeASCII = 1002,
199 UnicodeLatin1 = 1003,
200 UnicodeJapanese = 1004,
201 UnicodeIBM1261 = 1005,
202 UnicodeIBM1268 = 1006,
203 UnicodeIBM1276 = 1007,
204 UnicodeIBM1264 = 1008,
205 UnicodeIBM1265 = 1009,
206 Unicode11 = 1010,
207 SCSU = 1011,
208 UTF7 = 1012,
209 UTF16BE = 1013,
210 UTF16LE = 1014,
211 UTF16 = 1015,
212 CESU8 = 1016,
213 UTF32 = 1017,
214 UTF32BE = 1018,
215 UTF32LE = 1019,
216 BOCU1 = 1020,
217 UTF7IMAP = 1021,
218 Windows30Latin1 = 2000,
219 Windows31Latin1 = 2001,
220 Windows31Latin2 = 2002,
221 Windows31Latin5 = 2003,
222 HPRoman8 = 2004,
223 AdobeStandardEncoding = 2005,
224 VenturaUS = 2006,
225 VenturaInternational = 2007,
226 DECMCS = 2008,
227 PC850Multilingual = 2009,
228 PC8DanishNorwegian = 2012,
229 PC862LatinHebrew = 2013,
230 PC8Turkish = 2014,
231 IBMSymbols = 2015,
232 IBMThai = 2016,
233 HPLegal = 2017,
234 HPPiFont = 2018,
235 HPMath8 = 2019,
236 HPPSMath = 2020,
237 HPDesktop = 2021,
238 VenturaMath = 2022,
239 MicrosoftPublishing = 2023,
240 Windows31J = 2024,
241 GB2312 = 2025,
242 Big5 = 2026,
243 Macintosh = 2027,
244 IBM037 = 2028,
245 IBM038 = 2029,
246 IBM273 = 2030,
247 IBM274 = 2031,
248 IBM275 = 2032,
249 IBM277 = 2033,
250 IBM278 = 2034,
251 IBM280 = 2035,
252 IBM281 = 2036,
253 IBM284 = 2037,
254 IBM285 = 2038,
255 IBM290 = 2039,
256 IBM297 = 2040,
257 IBM420 = 2041,
258 IBM423 = 2042,
259 IBM424 = 2043,
260 PC8CodePage437 = 2011,
261 IBM500 = 2044,
262 IBM851 = 2045,
263 PCp852 = 2010,
264 IBM855 = 2046,
265 IBM857 = 2047,
266 IBM860 = 2048,
267 IBM861 = 2049,
268 IBM863 = 2050,
269 IBM864 = 2051,
270 IBM865 = 2052,
271 IBM868 = 2053,
272 IBM869 = 2054,
273 IBM870 = 2055,
274 IBM871 = 2056,
275 IBM880 = 2057,
276 IBM891 = 2058,
277 IBM903 = 2059,
278 IBM904 = 2060,
279 IBM905 = 2061,
280 IBM918 = 2062,
281 IBM1026 = 2063,
282 IBMEBCDICATDE = 2064,
283 EBCDICATDEA = 2065,
284 EBCDICCAFR = 2066,
285 EBCDICDKNO = 2067,
286 EBCDICDKNOA = 2068,
287 EBCDICFISE = 2069,
288 EBCDICFISEA = 2070,
289 EBCDICFR = 2071,
290 EBCDICIT = 2072,
291 EBCDICPT = 2073,
292 EBCDICES = 2074,
293 EBCDICESA = 2075,
294 EBCDICESS = 2076,
295 EBCDICUK = 2077,
296 EBCDICUS = 2078,
297 Unknown8BiT = 2079,
298 Mnemonic = 2080,
299 Mnem = 2081,
300 VISCII = 2082,
301 VIQR = 2083,
302 KOI8R = 2084,
303 HZGB2312 = 2085,
304 IBM866 = 2086,
305 PC775Baltic = 2087,
306 KOI8U = 2088,
307 IBM00858 = 2089,
308 IBM00924 = 2090,
309 IBM01140 = 2091,
310 IBM01141 = 2092,
311 IBM01142 = 2093,
312 IBM01143 = 2094,
313 IBM01144 = 2095,
314 IBM01145 = 2096,
315 IBM01146 = 2097,
316 IBM01147 = 2098,
317 IBM01148 = 2099,
318 IBM01149 = 2100,
319 Big5HKSCS = 2101,
320 IBM1047 = 2102,
321 PTCP154 = 2103,
322 Amiga1251 = 2104,
323 KOI7switched = 2105,
324 BRF = 2106,
325 TSCII = 2107,
326 CP51932 = 2108,
327 windows874 = 2109,
328 windows1250 = 2250,
329 windows1251 = 2251,
330 windows1252 = 2252,
331 windows1253 = 2253,
332 windows1254 = 2254,
333 windows1255 = 2255,
334 windows1256 = 2256,
335 windows1257 = 2257,
336 windows1258 = 2258,
337 TIS620 = 2259,
338 CP50220 = 2260
339 };
340 using enum id;
341
342 constexpr text_encoding() = default;
343
344 constexpr explicit
345 text_encoding(string_view __enc) noexcept
346 : _M_rep(_S_find_name(__enc))
347 {
348 __enc.copy(_M_name, max_name_length);
349 }
350
351 // @pre i has the value of one of the enumerators of id.
352 constexpr
353 text_encoding(id __i) noexcept
354 : _M_rep(_S_find_id(__i))
355 {
356 if (string_view __name(_M_rep->_M_name); !__name.empty())
357 __name.copy(_M_name, max_name_length);
358 }
359
360 constexpr id mib() const noexcept { return id(_M_rep->_M_id); }
361
362 constexpr const char* name() const noexcept { return _M_name; }
363
364 struct aliases_view : ranges::view_interface<aliases_view>
365 {
366 private:
367 class _Iterator;
368 struct _Sentinel { };
369
370 public:
371 constexpr _Iterator begin() const noexcept;
372 constexpr _Sentinel end() const noexcept { return {}; }
373
374 private:
375 friend struct text_encoding;
376
377 constexpr explicit aliases_view(const _Rep* __r) : _M_begin(__r) { }
378
379 const _Rep* _M_begin = nullptr;
380 };
381
382 constexpr aliases_view
383 aliases() const noexcept
384 {
385 return _M_rep->_M_name[0] ? aliases_view(_M_rep) : aliases_view{nullptr};
386 }
387
388 friend constexpr bool
389 operator==(const text_encoding& __a,
390 const text_encoding& __b) noexcept
391 {
392 if (__a.mib() == id::other && __b.mib() == id::other) [[unlikely]]
393 return _S_comp(__a._M_name, __b._M_name);
394 else
395 return __a.mib() == __b.mib();
396 }
397
398 friend constexpr bool
399 operator==(const text_encoding& __encoding, id __i) noexcept
400 { return __encoding.mib() == __i; }
401
402#if __CHAR_BIT__ == 8
403 static consteval text_encoding
404 literal() noexcept
405 {
406#ifdef __GNUC_EXECUTION_CHARSET_NAME
407 return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
408#elif defined __clang_literal_encoding__
409 return text_encoding(__clang_literal_encoding__);
410#else
411 return text_encoding();
412#endif
413 }
414
415 static text_encoding
416 environment();
417
418 template<id _Id>
419 static bool
420 environment_is()
421 { return text_encoding(_Id)._M_is_environment(); }
422#else
423 static text_encoding literal() = delete;
424 static text_encoding environment() = delete;
425 template<id> static bool environment_is() = delete;
426#endif
427
428 private:
429 const _Rep* _M_rep = _S_reps + 1; // id::unknown
430 char _M_name[max_name_length + 1] = {0};
431
432 bool
433 _M_is_environment() const;
434
435 static inline constexpr _Rep _S_reps[] = {
436 { 1, "" }, { 2, "" },
437#define _GLIBCXX_GET_ENCODING_DATA
438#include <bits/text_encoding-data.h>
439#ifdef _GLIBCXX_GET_ENCODING_DATA
440# error "Invalid text_encoding data"
441#endif
442 { 9999, nullptr }, // sentinel
443 };
444
445 static constexpr bool
446 _S_comp(string_view __a, string_view __b)
447 { return __unicode::__charset_alias_match(__a, __b); }
448
449 static constexpr const _Rep*
450 _S_find_name(string_view __name) noexcept
451 {
452#ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
453 // Optimize the common UTF-8 case to avoid a linear search through all
454 // strings in the table using the _S_comp function.
455 if (__name == "UTF-8")
456 return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
457#endif
458
459 // The first two array elements (other and unknown) don't have names.
460 // The last element is a sentinel that can never match anything.
461 const auto __first = _S_reps + 2, __end = std::end(_S_reps) - 1;
462 for (auto __r = __first; __r != __end; ++__r)
463 if (_S_comp(__r->_M_name, __name))
464 {
465 // Might have matched an alias. Find the first entry for this ID.
466 const auto __id = __r->_M_id;
467 while (__r[-1]._M_id == __id)
468 --__r;
469 return __r;
470 }
471 return _S_reps; // id::other
472 }
473
474 static constexpr const _Rep*
475 _S_find_id(id __id) noexcept
476 {
477 const auto __i = (_Rep::id)__id;
478 const auto __r = std::lower_bound(_S_reps, std::end(_S_reps) - 1, __i);
479 if (__r->_M_id == __i) [[likely]]
480 return __r;
481 else
482 {
483 // Preconditions: i has the value of one of the enumerators of id.
484 __glibcxx_assert(__r->_M_id == __i);
485 return _S_reps + 1; // id::unknown
486 }
487 }
488 };
489
490 template<>
491 struct hash<text_encoding>
492 {
493 size_t
494 operator()(const text_encoding& __enc) const noexcept
495 { return std::hash<text_encoding::id>()(__enc.mib()); }
496 };
497
498 class text_encoding::aliases_view::_Iterator
499 {
500 public:
501 using value_type = const char*;
502 using reference = const char*;
503 using difference_type = int;
504
505 constexpr _Iterator() = default;
506
507 constexpr value_type
508 operator*() const
509 {
510 if (_M_dereferenceable()) [[likely]]
511 return _M_rep->_M_name;
512 else
513 {
514 __glibcxx_assert(_M_dereferenceable());
515 return "";
516 }
517 }
518
519 constexpr _Iterator&
520 operator++()
521 {
522 if (_M_dereferenceable()) [[likely]]
523 ++_M_rep;
524 else
525 {
526 __glibcxx_assert(_M_dereferenceable());
527 *this = _Iterator{};
528 }
529 return *this;
530 }
531
532 constexpr _Iterator&
533 operator--()
534 {
535 const bool __decrementable
536 = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
537 if (__decrementable) [[likely]]
538 --_M_rep;
539 else
540 {
541 __glibcxx_assert(__decrementable);
542 *this = _Iterator{};
543 }
544 return *this;
545 }
546
547 constexpr _Iterator
548 operator++(int)
549 {
550 auto __it = *this;
551 ++*this;
552 return __it;
553 }
554
555 constexpr _Iterator
556 operator--(int)
557 {
558 auto __it = *this;
559 --*this;
560 return __it;
561 }
562
563 constexpr value_type
564 operator[](difference_type __n) const
565 { return *(*this + __n); }
566
567 constexpr _Iterator&
568 operator+=(difference_type __n)
569 {
570 if (_M_rep != nullptr)
571 {
572 if (__n > 0)
573 {
574 if (__n < (std::end(_S_reps) - _M_rep)
575 && _M_rep[__n - 1]._M_id == _M_id) [[likely]]
576 _M_rep += __n;
577 else
578 *this = _Iterator{};
579 }
580 else if (__n < 0)
581 {
582 if (__n > (_S_reps - _M_rep)
583 && _M_rep[__n]._M_id == _M_id) [[likely]]
584 _M_rep += __n;
585 else
586 *this = _Iterator{};
587 }
588 }
589 if (__n != 0)
590 __glibcxx_assert(_M_rep != nullptr);
591 return *this;
592 }
593
594 constexpr _Iterator&
595 operator-=(difference_type __n)
596 {
597 using _Traits = __gnu_cxx::__int_traits<difference_type>;
598 if (__n == _Traits::__min) [[unlikely]]
599 return operator+=(_Traits::__max);
600 return operator+=(-__n);
601 }
602
603 constexpr difference_type
604 operator-(const _Iterator& __i) const
605 {
606 if (_M_id == __i._M_id)
607 return _M_rep - __i._M_rep;
608 __glibcxx_assert(_M_id == __i._M_id);
609 return __gnu_cxx::__int_traits<difference_type>::__max;
610 }
611
612 constexpr bool
613 operator==(const _Iterator&) const = default;
614
615 constexpr bool
616 operator==(_Sentinel) const noexcept
617 { return !_M_dereferenceable(); }
618
619 constexpr strong_ordering
620 operator<=>(const _Iterator& __i) const
621 {
622 __glibcxx_assert(_M_id == __i._M_id);
623 return _M_rep <=> __i._M_rep;
624 }
625
626 friend constexpr _Iterator
627 operator+(_Iterator __i, difference_type __n)
628 {
629 __i += __n;
630 return __i;
631 }
632
633 friend constexpr _Iterator
634 operator+(difference_type __n, _Iterator __i)
635 {
636 __i += __n;
637 return __i;
638 }
639
640 friend constexpr _Iterator
641 operator-(_Iterator __i, difference_type __n)
642 {
643 __i -= __n;
644 return __i;
645 }
646
647 private:
648 friend struct text_encoding;
649
650 constexpr explicit
651 _Iterator(const _Rep* __r) noexcept
652 : _M_rep(__r), _M_id(__r ? __r->_M_id : 0)
653 { }
654
655 constexpr bool
656 _M_dereferenceable() const noexcept
657 { return _M_rep != nullptr && _M_rep->_M_id == _M_id; }
658
659 const _Rep* _M_rep = nullptr;
660 _Rep::id _M_id = 0;
661 };
662
663 constexpr auto
664 text_encoding::aliases_view::begin() const noexcept
665 -> _Iterator
666 { return _Iterator(_M_begin); }
667
668namespace ranges
669{
670 // Opt-in to borrowed_range concept
671 template<>
672 inline constexpr bool
673 enable_borrowed_range<std::text_encoding::aliases_view> = true;
674}
675
676_GLIBCXX_END_NAMESPACE_VERSION
677} // namespace std
678
679#endif // __cpp_lib_text_encoding
680#endif // _GLIBCXX_TEXT_ENCODING