libstdc++/api/a00503_source.html

// Unicode utilities -*- C++ -*-


// Copyright The GNU Toolchain Authors.

//

// This file is part of the GNU ISO C++ Library.  This library is free

// software; you can redistribute it and/or modify it under the

// terms of the GNU General Public License as published by the

// Free Software Foundation; either version 3, or (at your option)

// any later version.


// This library is distributed in the hope that it will be useful,

// but WITHOUT ANY WARRANTY; without even the implied warranty of

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

// GNU General Public License for more details.


// Under Section 7 of GPL version 3, you are granted additional

// permissions described in the GCC Runtime Library Exception, version

// 3.1, as published by the Free Software Foundation.


// You should have received a copy of the GNU General Public License and

// a copy of the GCC Runtime Library Exception along with this program;

// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see

// <http://www.gnu.org/licenses/>.


/** @file include/bits/unicode.h

 *  This is an internal header file, included by other library headers.

 *  Do not attempt to use it directly. @headername{format}

 */


#ifndef _GLIBCXX_UNICODE_H

#define _GLIBCXX_UNICODE_H 1


#if __cplusplus >= 202002L

#include <array>

#include <bit>      // bit_width

#include <charconv> // __detail::__from_chars_alnum_to_val_table

#include <string_view>

#include <cstdint>

#include <bits/stl_algo.h>

#include <bits/stl_iterator.h>

#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.

#include <bits/ranges_util.h> // view_interface


namespace std _GLIBCXX_VISIBILITY(default)

{

_GLIBCXX_BEGIN_NAMESPACE_VERSION

namespace __unicode

{

  // True if __c is a Unicode scalar value, i.e. a code point in the range
  // [0, 0x10FFFF] that is not a high or low surrogate (0xD800 - 0xDFFF).
  constexpr bool
  __is_scalar_value(char32_t __c)
  {
    constexpr char32_t __first_surrogate = 0xD800;
    constexpr char32_t __last_surrogate = 0xDFFF;
    constexpr char32_t __max_code_point = 0x10FFFF;

    // Everything below the surrogate block is valid.
    if (__c < __first_surrogate) [[likely]]
      return true;
    // Inside the surrogate block: never a scalar value.
    if (__c <= __last_surrogate)
      return false;
    // Above the surrogates, only values up to U+10FFFF are valid.
    return __c <= __max_code_point;
  }


  // A code point that can be encoded in a single code unit of type _CharT.
  //
  // For 8-bit code units only ASCII (U+0000 to U+007F) qualifies.
  // For wider code units the code point must be a scalar value that is no
  // greater than the largest value representable by _CharT.  The comparison
  // is inclusive so that U+FFFF is treated as a single 16-bit code unit
  // rather than being (incorrectly) split into a surrogate pair.
  template<typename _CharT>
    constexpr bool
    __is_single_code_unit(char32_t __c)
    {
      if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
        return __c <= 0x7F; // ASCII character
      else
        return __c <= __gnu_cxx::__int_traits<_CharT>::__max
                 && __is_scalar_value(__c);
    }


  // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template


  // Function object returning U+FFFD REPLACEMENT CHARACTER.
  // It is the default _ErrorHandler of _Utf_iterator.
  struct _Repl
  {
    constexpr char32_t
    operator()() const noexcept
    {
      constexpr char32_t __replacement_character = 0xFFFD;
      return __replacement_character;
    }
  };


  // Sentinel that compares equal to an iterator when the element the
  // iterator refers to equals a value-initialized object of its value type
  // (e.g. the terminating null of a C string).
  struct _Null_sentinel_t
  {
    template<input_iterator _It>
      requires default_initializable<iter_value_t<_It>>
        && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
      friend constexpr auto
      operator==(_It __it, _Null_sentinel_t)
      {
        using _Value = iter_value_t<_It>;
        return *__it == _Value{};
      }
  };


  // An iterator over an input range of FromFmt code units that yields either

  // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.

  // The code units from the input range are interpreted as Unicode code points

  // and the iterator produces the individual code unit for each code point.

  // Invalid sequences in the input are replaced with U+FFFD so that the result

  // is always valid UTF-8, UTF-16, or UTF-32.

  //

  // The iterator knows the bounds of the underlying input range and will not

  // read outside those bounds (incrementing or decrementing at the boundary

  // is erroneously idempotent).

  //

  // On construction, the iterator attempts to decode a single code point from

  // the input range and then encode it into an internal buffer in the output

  // format, e.g. if the input is UTF-8 and the output is UTF-16, it might read

  // three char8_t code units from the input and store two char16_t code units

  // in its buffer. Incrementing the iterator will first iterate over buffer,

  // yielding each code unit in turn, and then extract another code point from

  // the input. Failure to extract a valid code point from the input will store

  // U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.

  template<typename _FromFmt, typename _ToFmt,

           input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,

           typename _ErrorHandler = _Repl>

    requires convertible_to<iter_value_t<_Iter>, _FromFmt>

    class _Utf_iterator

    {

      static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));


    public:

      using value_type = _ToFmt;

      using difference_type = iter_difference_t<_Iter>;

      using reference = value_type;

      using iterator_concept

        = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,

                                          bidirectional_iterator_tag>;


      constexpr _Utf_iterator() = default;


      // Construct for a bidirectional input range [__first, __last) with
      // the current position at __it.  Unless __it is already at the end,
      // the first code point is decoded into the buffer immediately.
      constexpr
      _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
      requires bidirectional_iterator<_Iter>
      : _M_first_and_curr{__first, __it}, _M_last(__last)
      {
        const bool __at_end = !(_M_curr() != _M_last);
        if (__at_end)
          _M_buf = {};  // Nothing to decode, just zero the buffer.
        else
          _M_read();
      }


      // Construct for a non-bidirectional input range [__it, __last).
      // Unless the range is empty, the first code point is decoded into the
      // buffer immediately.
      constexpr
      _Utf_iterator(_Iter __it, _Sent __last)
      requires (!bidirectional_iterator<_Iter>)
      : _M_first_and_curr{__it}, _M_last(__last)
      {
        const bool __at_end = !(_M_curr() != _M_last);
        if (__at_end)
          _M_buf = {};  // Nothing to decode, just zero the buffer.
        else
          _M_read();
      }


      // Converting constructor from a _Utf_iterator over a compatible
      // iterator/sentinel pair.  Every piece of state is copied, including
      // _M_to_increment: without it a converted forward iterator would
      // advance the underlying iterator by zero code units on increment.
      // The initializers are listed in member declaration order.
      template<class _Iter2, class _Sent2>
        requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
        constexpr
        _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
                                          _ErrorHandler>& __other)
        : _M_first_and_curr(__other._M_first_and_curr),
          _M_last(__other._M_last),
          _M_buf(__other._M_buf),
          _M_buf_index(__other._M_buf_index),
          _M_buf_last(__other._M_buf_last),
          _M_to_increment(__other._M_to_increment)
        { }


      // Returns the iterator to the start of the underlying input range.
      // Only available for bidirectional iterators, which are the only ones
      // for which the start of the range is stored.
      [[nodiscard]]
      constexpr _Iter
      begin() const requires bidirectional_iterator<_Iter>
      { return _M_first(); }


      // Returns the sentinel marking the end of the underlying input range.
      [[nodiscard]]
      constexpr _Sent
      end() const { return _M_last; }


      // Returns a copy of the current position in the underlying input range
      // (for forward iterators, the first code unit of the current code point).
      [[nodiscard]]
      constexpr _Iter
      base() const requires forward_iterator<_Iter>
      { return _M_curr(); }


      // Returns _M_to_increment, i.e. how many input code units the
      // underlying iterator is advanced by when moving past the current
      // code point.
      [[nodiscard]]
      constexpr iter_difference_t<_Iter>
      _M_units() const requires forward_iterator<_Iter>
      { return _M_to_increment; }


      // Returns (by value) the current output code unit from the buffer.
      [[nodiscard]]
      constexpr value_type
      operator*() const { return _M_buf[_M_buf_index]; }


      // Move to the next output code unit, decoding the next code point from
      // the input once the buffer for the current one has been used up.
      constexpr _Utf_iterator&
      operator++()
      {
        // More code units of the current code point left in the buffer?
        if (_M_buf_index + 1 < _M_buf_last)
          {
            ++_M_buf_index;
            return *this;
          }

        if (_M_curr() != _M_last)
          {
            // Forward iterators still point at the start of the current
            // code point, so step over it.  Non-forward iterators were
            // already moved past it when it was decoded.
            if constexpr (forward_iterator<_Iter>)
              std::advance(_M_curr(), _M_to_increment);

            if (_M_curr() == _M_last)
              _M_buf_index = 0;
            else
              _M_read(); // Decode the next code point and refill the buffer.
          }
        // Otherwise the increment is erroneous, but ignored for now.

        return *this;
      }


      // Post-increment: advance this iterator and return its previous value.
      constexpr _Utf_iterator
      operator++(int)
      {
        _Utf_iterator __previous = *this;
        this->operator++();
        return __previous;
      }


      // Move to the previous output code unit, decoding the preceding code
      // point from the input when already at the first unit in the buffer.
      constexpr _Utf_iterator&
      operator--() requires bidirectional_iterator<_Iter>
      {
        // Still inside the buffer of the current code point?
        if (_M_buf_index > 0)
          {
            --_M_buf_index;
            return *this;
          }

        if (_M_curr() != _M_first())
          {
            // Decode the code point that ends just before the current
            // position, select its last code unit, then move the underlying
            // iterator back to the start of that code point.
            _M_read_reverse();
            _M_buf_index = _M_buf_last - 1;
            ranges::advance(_M_curr(), -_M_to_increment);
          }
        // Otherwise the decrement is erroneous, but ignored for now.

        return *this;
      }


      // Post-decrement: step this iterator back and return its previous
      // value.  Constrained like the pre-decrement operator it calls, so
      // that `it--` is not reported as valid for non-bidirectional
      // underlying iterators.
      constexpr _Utf_iterator
      operator--(int) requires bidirectional_iterator<_Iter>
      {
        auto __tmp = *this;
        --*this;
        return __tmp;
      }


      // Equality of two transcoding iterators.
      //
      // For forward underlying iterators, two iterators are equal when they
      // are at the same input position and the same index in the buffer.
      //
      // For non-forward underlying iterators:
      //  - different input positions compare unequal;
      //  - same buffer index and same buffer length compare equal;
      //  - otherwise they are equal only if both have a buffer index equal
      //    to their buffer length.
      [[nodiscard]]
      friend constexpr bool
      operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
      requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
      {
        if constexpr (forward_iterator<_Iter>)
          return __lhs._M_curr() == __rhs._M_curr()
                   && __lhs._M_buf_index == __rhs._M_buf_index;
        else if (__lhs._M_curr() != __rhs._M_curr())
          return false;
        else if (__lhs._M_buf_index == __rhs._M_buf_index
                   && __lhs._M_buf_last == __rhs._M_buf_last)
          return true;
        else
          return __lhs._M_buf_index == __lhs._M_buf_last
                   && __rhs._M_buf_index == __rhs._M_buf_last;
      }


      // Comparison with the sentinel of the underlying range.  The input
      // position must have reached the sentinel; for non-forward iterators
      // the buffer index must additionally equal the buffer length.
      [[nodiscard]]
      friend constexpr bool
      operator==(_Utf_iterator __lhs, _Sent __rhs)
      {
        const bool __input_done = __lhs._M_curr() == __rhs;
        if constexpr (forward_iterator<_Iter>)
          return __input_done;
        else
          {
            if (!__input_done)
              return false;
            return __lhs._M_buf_index == __lhs._M_buf_last;
          }
      }


    private:

      // Decode the next code point from the input, selecting the decoder
      // by the size of the input code unit type.
      constexpr void
      _M_read()
      {
        constexpr auto __unit_size = sizeof(_FromFmt);
        static_assert(__unit_size == sizeof(uint8_t)
                        || __unit_size == sizeof(uint16_t)
                        || __unit_size == sizeof(uint32_t));

        if constexpr (__unit_size == sizeof(uint8_t))
          _M_read_utf8();
        else if constexpr (__unit_size == sizeof(uint16_t))
          _M_read_utf16();
        else
          _M_read_utf32();
      }


      // Decode the code point preceding the current input position,
      // selecting the decoder by the size of the input code unit type.
      constexpr void
      _M_read_reverse() requires bidirectional_iterator<_Iter>
      {
        constexpr auto __unit_size = sizeof(_FromFmt);
        static_assert(__unit_size == sizeof(uint8_t)
                        || __unit_size == sizeof(uint16_t)
                        || __unit_size == sizeof(uint32_t));

        if constexpr (__unit_size == sizeof(uint8_t))
          _M_read_reverse_utf8();
        else if constexpr (__unit_size == sizeof(uint16_t))
          _M_read_reverse_utf16();
        else
          _M_read_reverse_utf32();
      }


      // Primary template, used when _Iter is not a forward iterator.
      // It does nothing: the position of a single-pass iterator cannot be
      // restored after reading from it.  The constructor is constexpr so
      // that the decoding functions creating a _Guard remain usable in
      // constant expressions for such iterators.
      template<typename>
        struct _Guard
        {
          constexpr _Guard(void*, _Iter&) { }
        };


      // Specialization for forward iterators.  An aggregate holding the
      // owning _Utf_iterator and a saved iterator value; on destruction it
      // assigns the saved value back to the owner's current position, undoing
      // any movement made while decoding.
      template<typename _It> requires forward_iterator<_It>
        struct _Guard<_It>
        {
          constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
          _Utf_iterator* _M_this; // Iterator whose position is restored.
          _It _M_orig;            // Position to restore.
        };


      // Decode one code point from UTF-8 input starting at _M_curr(), store
      // its encoding in the output buffer via _M_update, and return it.
      // Ill-formed input yields the value returned by _S_error().
      //
      // For forward iterators the guard restores _M_curr() on return, and the
      // number of code units consumed is recorded by _M_update so that
      // operator++ can skip over them later.
      constexpr char32_t
      _M_read_utf8()
      {
        _Guard<_Iter> __g{this, _M_curr()};
        char32_t __c{};
        // Default range of a continuation byte.
        const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
        uint8_t __u = *_M_curr()++;
        // Number of code units consumed so far.
        uint8_t __to_incr = 1;
        // Consume one more code unit and return the new input position.
        auto __incr = [&, this] {
          ++__to_incr;
          return ++_M_curr();
        };

        if (__u <= 0x7F) [[likely]]      // 0x00 to 0x7F
          __c = __u;
        else if (__u < 0xC2) [[unlikely]] // 0x80 to 0xC1 cannot start a sequence
          __c = _S_error();
        else if (_M_curr() == _M_last) [[unlikely]] // lead byte at end of input
          __c = _S_error();
        else if (__u <= 0xDF) // 0xC2 to 0xDF
          {
            // Two-byte sequence: 5 payload bits in the lead byte.
            __c = __u & 0x1F;
            __u = *_M_curr();

            if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
              __c = _S_error();
            else
              {
                __c = (__c << 6) | (__u & 0x3F);
                __incr();
              }
          }
        else if (__u <= 0xEF) // 0xE0 to 0xEF
          {
            // Three-byte sequence.  The second byte has a narrower range
            // after 0xE0 (at least 0xA0) and after 0xED (at most 0x9F).
            const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
            const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;

            __c = __u & 0x0F;
            __u = *_M_curr();

            if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
              __c = _S_error();
            else if (__incr() == _M_last) [[unlikely]]
              __c = _S_error();
            else
              {
                __c = (__c << 6) | (__u & 0x3F);
                __u = *_M_curr();

                if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                  __c = _S_error();
                else
                  {
                    __c = (__c << 6) | (__u & 0x3F);
                    __incr();
                  }
              }
          }
        else if (__u <= 0xF4) // 0xF0 to 0xF4
          {
            // Four-byte sequence.  The second byte has a narrower range
            // after 0xF0 (at least 0x90) and after 0xF4 (at most 0x8F).
            const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
            const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;

            __c = __u & 0x07;
            __u = *_M_curr();

            if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
              __c = _S_error();
            else if (__incr() == _M_last) [[unlikely]]
              __c = _S_error();
            else
              {
                __c = (__c << 6) | (__u & 0x3F);
                __u = *_M_curr();

                if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                  __c = _S_error();
                else if (__incr() == _M_last) [[unlikely]]
                  __c = _S_error();
                else
                  {
                    __c = (__c << 6) | (__u & 0x3F);
                    __u = *_M_curr();

                    if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                      __c = _S_error();
                    else
                      {
                        __c = (__c << 6) | (__u & 0x3F);
                        __incr();
                      }
                  }
              }
          }
        else [[unlikely]] // 0xF5 to 0xFF
          __c = _S_error();

        // Encode __c into the buffer and record how many units were consumed.
        _M_update(__c, __to_incr);

        return __c;
      }


      constexpr void

      _M_read_utf16()

      {

        _Guard<_Iter> __g{this, _M_curr()};

        char32_t __c{};

        uint16_t __u = *_M_curr()++;

        uint8_t __to_incr = 1;


        if (__u < 0xD800 || __u > 0xDFFF) [[likely]]

          __c = __u;

        else if (__u < 0xDC00 && _M_curr() != _M_last)

          {

            uint16_t __u2 = *_M_curr();

            if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]

              __c = _S_error();

            else

              {

                ++_M_curr();

                __to_incr = 2;

                uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);

                uint32_t __w = (__u >> 6) & 0x1F;

                __c = (__w + 1) << 16 | __x;

              }

          }

        else

          __c = _S_error();


        _M_update(__c, __to_incr);

      }


      constexpr void

      _M_read_utf32()

      {

        _Guard<_Iter> __g{this, _M_curr()};

        char32_t __c = *_M_curr()++;

        if (!__is_scalar_value(__c)) [[unlikely]]

          __c = _S_error();

        _M_update(__c, 1);

      }


      // Decode the UTF-8 code point that ends immediately before _M_curr()
      // and store its encoding in the output buffer.  _M_curr() itself is
      // left unchanged; operator--() moves it back by _M_to_increment
      // afterwards.
      constexpr void
      _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
      {
        const auto __first = _M_first();
        auto __curr = _M_curr();
        // The last code unit read:
        uint8_t __u = *--__curr;
        // Count of bytes read:
        uint8_t __to_incr = 1;

        if (__u <= 0x7F) [[likely]]
          {
            _M_update(__u, 1);
            return;
          }

        // Continuation bytes match 10xxxxxx
        auto __is_continuation = [](uint8_t __b) {
          return (__b & 0xC0) == 0x80;
        };
        // 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
        // 0xF5-0xFF would produce code points above U+10FFFF
        auto __is_invalid = [](uint8_t __b) {
          return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
        };

        // No valid or invalid multibyte sequence is longer than 4 bytes,
        // so skip back over at most four bytes.
        while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
          {
            ++__to_incr;
            __u = *--__curr;
          }

        // If the last byte read was a continuation byte then either we read
        // four continuation bytes, or stopped at the start of the sequence.
        // Either way, the maximal subparts are the individual continuation
        // bytes so each one should be replaced with U+FFFD.
        if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
          {
            // Either found four continuation bytes (maximum allowed is three)
            // or first non-continuation byte is an invalid UTF-8 code unit.
            _M_update(_S_error(), 1);
            return;
          }
        // __u is a valid start byte so use countl_one to get the expected
        // length of the multibyte sequence that starts with this byte.
        int __seq_length = std::countl_one((unsigned char)__u);
        if (__seq_length < __to_incr) [[unlikely]]
          {
            // If the expected number of continuation bytes is less than
            // the number we found, then the last continuation byte is a
            // maximal subpart and the decremented iterator points to it.
            _M_update(_S_error(), 1);
            return;
          }

        // Decode forwards from the start byte that was found, temporarily
        // making it the current position.
        auto __orig = std::__exchange(_M_curr(), std::move(__curr));
        if (_M_read_utf8() == _S_error()) [[unlikely]]
          {
            if (_M_to_increment < __to_incr) // Read truncated sequence, set
              _M_to_increment = 1;           // curr to last continuation byte.
          }

        _M_curr() = std::move(__orig);
        // operator--() will move back by _M_to_increment
      }


      constexpr void

      _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>

      {

        _Guard<_Iter> __g{this, _M_curr()};

        char32_t __c{};

        uint16_t __u = *--_M_curr();

        uint8_t __to_incr = 1;


        if (__u < 0xD800 || __u > 0xDFFF) [[likely]]

          __c = __u;

        else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]

          {

            // read a low surrogate, expect a high surrogate before it.

            uint16_t __u2 = *--_M_curr();

            if (__u2 < 0xD800 || __u2 >= 0xDC00) [[unlikely]]

              __c = _S_error(); // unpaired low surrogate

            else

              {

                __to_incr = 2;

                uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);

                uint32_t __w = (__u2 >> 6) & 0x1F;

                __c = (__w + 1) << 16 | __x;

              }

          }

        else

          __c = _S_error(); // unpaired surrogate


        _M_update(__c, __to_incr);

      }


      constexpr void

      _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>

      {

        _Guard<_Iter> __g{this, _M_curr()};

        char32_t __c = *--_M_curr();

        if (!__is_scalar_value(__c)) [[unlikely]]

          __c = _S_error();

        _M_update(__c, 1);

      }


      // Encode the code point __c as one or more code units in _M_buf.
      //
      // __c       The code point to store.
      // __to_incr Number of input code units that were consumed to produce
      //           __c; saved in _M_to_increment.
      //
      // Resets _M_buf_index to 0 and sets _M_buf_last to the number of code
      // units written.  The output encoding is chosen from sizeof(_ToFmt).
      // Unused trailing buffer elements are set to zero, except in the
      // 32-bit case where the buffer holds a single element.
      constexpr void
      _M_update(char32_t __c, uint8_t __to_incr)
      {
        _M_to_increment = __to_incr;
        _M_buf_index = 0;
        if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
          {
            // UTF-32: the code point is the code unit.
            _M_buf[0] = __c;
            _M_buf_last = 1;
          }
        else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
          {
            if (__is_single_code_unit<_ToFmt>(__c))
              {
                _M_buf[0] = __c;
                _M_buf[1] = 0;
                _M_buf_last = 1;
              }
            else
              {
                // Encode as a surrogate pair.
                // From http://www.unicode.org/faq/utf_bom.html#utf16-4
                const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
                char16_t __lead = __lead_offset + (__c >> 10);
                char16_t __trail = 0xDC00 + (__c & 0x3FF);
                _M_buf[0] = __lead;
                _M_buf[1] = __trail;
                _M_buf_last = 2;
              }
          }
        else
          {
            static_assert(sizeof(_ToFmt) == 1);
            // UTF-8: the sequence length depends on the number of
            // significant bits in the code point.
            int __bits = std::bit_width((uint32_t)__c);
            if (__bits <= 7) [[likely]]
              {
                // One byte: 0xxxxxxx
                _M_buf[0] = __c;
                _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
                _M_buf_last = 1;
              }
            else if (__bits <= 11)
              {
                // Two bytes: 110xxxxx 10xxxxxx
                _M_buf[0] = 0xC0 | (__c >> 6);
                _M_buf[1] = 0x80 | (__c & 0x3F);
                _M_buf[2] = _M_buf[3] = 0;
                _M_buf_last = 2;
              }
            else if (__bits <= 16)
              {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                _M_buf[0] = 0xE0 | (__c >> 12);
                _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
                _M_buf[2] = 0x80 | (__c & 0x3F);
                _M_buf[3] = 0;
                _M_buf_last = 3;
              }
            else
              {
                // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
                _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
                _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
                _M_buf[3] = 0x80 | (__c & 0x3F);
                _M_buf_last = 4;
              }
          }
      }


      // Returns the code point to substitute for ill-formed input, obtained
      // by invoking a value-initialized _ErrorHandler.  The handler must
      // return a Unicode scalar value (checked by assertion).
      // Declared static, matching its _S_ prefix: it uses no object state.
      static constexpr char32_t
      _S_error()
      {
        char32_t __c = _ErrorHandler()();
        __glibcxx_assert(__is_scalar_value(__c));
        return __c;
      }


      // Returns a copy of the iterator to the start of the underlying range
      // (only stored for bidirectional iterators).
      constexpr _Iter
      _M_first() const requires bidirectional_iterator<_Iter>
      { return _M_first_and_curr._M_first; }


      // Mutable access to the current position in the underlying range.
      constexpr _Iter&
      _M_curr() { return _M_first_and_curr._M_curr; }


      // Returns a copy of the current position in the underlying range.
      constexpr _Iter
      _M_curr() const { return _M_first_and_curr._M_curr; }


      // _M_first is not needed for non-bidirectional ranges.
      // This primary template therefore stores only the current position.
      template<typename _It>
        struct _First_and_curr
        {
          _First_and_curr() = default;

          // Start at position __curr.
          constexpr
          _First_and_curr(_It __curr) : _M_curr(__curr) { }

          // Converting constructor from a holder of a compatible iterator type.
          template<convertible_to<_It> _It2>
            constexpr
            _First_and_curr(const _First_and_curr<_It2>& __other)
            : _M_curr(__other._M_curr) { }

          // First code unit of the current code point for forward iterators,
          // past-the-end of the current code point for input iterators.
          _It _M_curr;
        };


      // Specialization for bidirectional iterators, which additionally
      // stores the start of the underlying range (compared against by
      // operator--() and the reverse decoders before stepping backwards).
      template<typename _It> requires bidirectional_iterator<_It>
        struct _First_and_curr<_It>
        {
          _First_and_curr() = default;

          // Range starts at __first, current position is __curr.
          constexpr
          _First_and_curr(_It __first, _It __curr)
          : _M_first(__first), _M_curr(__curr) { }

          // Converting constructor from a holder of a compatible iterator type.
          template<convertible_to<_It> _It2>
            constexpr
            _First_and_curr(const _First_and_curr<_It2>& __other)
            : _M_first(__other._M_first), _M_curr(__other._M_curr) { }

          _It _M_first; // Start of the underlying range.
          _It _M_curr;  // First code unit of the current code point.
        };


      // Iterators pointing to the start of the underlying range and to the

      // start (or end, for non-forward iterators) of the current code point.

      _First_and_curr<_Iter> _M_first_and_curr;


      // The end of the underlying input range.

      [[no_unique_address]] _Sent _M_last;


      // Buffer holding the individual code units of the current code point.

      array<value_type, 4 / sizeof(_ToFmt)> _M_buf;


      uint8_t _M_buf_index = 0;    // Index of current code unit in the buffer.

      uint8_t _M_buf_last = 0;     // Number of code units in the buffer.

      uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment.


      template<typename _FromFmt2, typename _ToFmt2,

               input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,

               typename _ErrHandler>

        requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>

        friend class _Utf_iterator;

    };


  // A view over an underlying view of code units whose iterators are
  // _Utf_iterator objects producing code units of type _ToFormat.
  template<typename _ToFormat, ranges::input_range _View>
    requires ranges::view<_View>
    class _Utf_view
    : public ranges::view_interface<_Utf_view<_ToFormat, _View>>
    {
      using _Iterator = _Utf_iterator<ranges::range_value_t<_View>,
                                      _ToFormat, ranges::iterator_t<_View>,
                                      ranges::sentinel_t<_View>>;

      // Make the begin iterator.  Bidirectional underlying iterators also
      // need to be told where the range starts.
      template<typename _Iter, typename _Sent>
        constexpr auto
        _M_begin(_Iter __first, _Sent __last)
        {
          if constexpr (!bidirectional_iterator<_Iter>)
            return _Iterator(__first, __last);
          else
            return _Iterator(__first, __first, __last);
        }

      // Make the end of the view: a transcoding iterator positioned at the
      // end for common ranges, otherwise the underlying sentinel itself.
      template<typename _Iter, typename _Sent>
        constexpr auto
        _M_end(_Iter __first, _Sent __last)
        {
          if constexpr (is_same_v<_Iter, _Sent>)
            {
              if constexpr (bidirectional_iterator<_Iter>)
                return _Iterator(__first, __last, __last);
              else
                return _Iterator(__last, __last);
            }
          else
            return __last;
        }

      // The underlying view of input code units.
      _View _M_base;

    public:
      constexpr explicit
      _Utf_view(_View __r)
      : _M_base(std::move(__r))
      { }

      constexpr auto
      begin()
      {
        return _M_begin(ranges::begin(_M_base), ranges::end(_M_base));
      }

      constexpr auto
      end()
      {
        return _M_end(ranges::begin(_M_base), ranges::end(_M_base));
      }

      // The view is empty exactly when the underlying view is empty.
      constexpr bool
      empty() const
      {
        return ranges::empty(_M_base);
      }
    };


#ifdef __cpp_char8_t

  template<typename _View>

    using _Utf8_view = _Utf_view<char8_t, _View>;

#else

  template<typename _View>

    using _Utf8_view = _Utf_view<char, _View>;

#endif

  template<typename _View>

    using _Utf16_view = _Utf_view<char16_t, _View>;

  template<typename _View>

    using _Utf32_view = _Utf_view<char32_t, _View>;


inline namespace __v16_0_0

{

#define _GLIBCXX_GET_UNICODE_DATA 160000

#include "unicode-data.h"

#ifdef _GLIBCXX_GET_UNICODE_DATA

# error "Invalid unicode data"

#endif


  // The field width of a code point: either 1 or 2.
  // Code points below the first entry of __width_edges have width 1; for
  // the rest, the parity of the number of table entries that are <= __c
  // selects the width.
  constexpr int
  __field_width(char32_t __c) noexcept
  {
    if (__c < __width_edges[0]) [[likely]]
      return 1;

    auto* __pos = std::upper_bound(__width_edges, std::end(__width_edges),
                                   __c);
    const auto __edges_passed = __pos - __width_edges;
    return __edges_passed % 2 + 1;
  }


  // Look up __c in the __escape_edges table and report the flag stored in
  // the low bit of the matching entry.
  // @pre c <= 0x10FFFF
  constexpr bool
  __should_escape_category(char32_t __c) noexcept
  {
    constexpr uint32_t __flag_bit = 0x01;
    // The search key is the code point shifted left by one bit, plus 2.
    const auto __key = (__c << 1u) + 2;
    auto* __pos = std::lower_bound(__escape_edges, std::end(__escape_edges),
                                   __key);
    // The entry just before the lower bound holds the flag for __c.
    const auto __entry = __pos[-1];
    return __entry & __flag_bit;
  }


  // Look up the Grapheme_Cluster_Break property of __c in __gcb_edges.
  // Each table entry keeps the property in its low __gcb_shift_bits bits.
  // @pre c <= 0x10FFFF
  constexpr _Gcb_property
  __grapheme_cluster_break_property(char32_t __c) noexcept
  {
    constexpr uint32_t __prop_bits = (1 << __gcb_shift_bits) - 1;
    // Search key: the code point in the high bits, all property bits set.
    const auto __key = (__c << __gcb_shift_bits) | __prop_bits;
    auto* __pos = std::lower_bound(__gcb_edges, std::end(__gcb_edges),
                                   __key);
    // The entry just before the lower bound carries the property for __c.
    const auto __entry = __pos[-1];
    return _Gcb_property(__entry & __prop_bits);
  }


  // Report whether __c is one of the code points in __incb_linkers.
  constexpr bool
  __is_incb_linker(char32_t __c) noexcept
  {
    // Array is small enough that linear search is faster than binary search.
    const auto __last = std::end(__incb_linkers);
    const auto __pos = _GLIBCXX_STD_A::find(__incb_linkers, __last, __c);
    return __pos != __last;
  }


  // Look up the Indic_Conjunct_Break (InCB) property of __c in __incb_edges.
  // Each table entry keeps the property in its low two bits.
  // @pre c <= 0x10FFFF
  constexpr _InCB
  __incb_property(char32_t __c) noexcept
  {
    const auto __shifted = __c << 2;

    // Anything below the first table entry gets the zero-valued property.
    if (__shifted < __incb_edges[0]) [[likely]]
      return _InCB(0);

    constexpr uint32_t __prop_bits = 0x3;
    auto* __pos = std::lower_bound(__incb_edges, std::end(__incb_edges),
                                   __shifted | __prop_bits);
    // The entry just before the lower bound carries the property for __c.
    return _InCB(__pos[-1] & __prop_bits);
  }


  // Report whether __c is flagged by the __xpicto_edges table (used for the
  // \p{Extended_Pictographic} checks in grapheme cluster breaking).
  // Code points below the first table entry are never flagged; for the
  // rest, an odd number of table entries <= __c means the flag is set.
  // Declared noexcept like the other table lookups in this file; nothing
  // here can throw.
  constexpr bool
  __is_extended_pictographic(char32_t __c) noexcept
  {
    if (__c < __xpicto_edges[0]) [[likely]]
      return false;

    auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
    return (__p - __xpicto_edges) % 2 != 0;
  }


  // State carried while scanning one grapheme cluster: the cluster's first
  // code point and its GCB property, plus the running state needed by the
  // emoji ZWJ sequence, regional indicator and InCB linker rules.
  struct _Grapheme_cluster_iterator_base
  {
    // Both members are value-initialized so that an object which is never
    // given a code point (e.g. one built for an empty range) still holds
    // determinate values when it is copied.
    char32_t _M_c{}; // First code point in the cluster.
    _Gcb_property _M_prop{}; // GCB property of _M_c.
    enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
    _XPicto _M_xpicto_seq_state = _XPicto::_Init;
    unsigned char _M_RI_count = 0;
    bool _M_incb_linker_seen = false;

    // Start a new cluster whose first code point is __c with property __p,
    // discarding all per-cluster state.
    constexpr void
    _M_reset(char32_t __c, _Gcb_property __p)
    {
      _M_c = __c;
      _M_prop = __p;
      _M_xpicto_seq_state = _XPicto::_Init;
      _M_RI_count = 0;
      _M_incb_linker_seen = false;
    }

    // Advance the state machine that tracks whether the cluster so far is
    // an Extended_Pictographic code point, then Extend*, then ZWJ, then
    // another Extended_Pictographic code point (possibly repeated).
    // __c is the next code point and __p its GCB property.
    constexpr void
    _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
    {
      // Once the sequence has failed to match it stays failed.
      if (_M_xpicto_seq_state == _XPicto::_Failed)
        return;

      auto __next_state = _XPicto::_Failed;
      if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
        {
          if (__p == _Gcb_property::_Gcb_ZWJ)
            {
              if (_M_xpicto_seq_state == _XPicto::_Matched)
                __next_state = _XPicto::_Zwj;
              // We check _M_c here so that we do the lookup at most once,
              // and only for clusters containing at least one ZWJ.
              else if (__is_extended_pictographic(_M_c))
                __next_state = _XPicto::_Zwj;
            }
          else if (__p == _Gcb_property::_Gcb_Extend)
            __next_state = _M_xpicto_seq_state; // no change
        }
      else // Zwj
        {
          // This assumes that all \p{Extended_Pictographic} emoji have
          // Grapheme_Cluster_Break=Other.
          if (__p == _Gcb_property::_Gcb_Other
                && __is_extended_pictographic(__c))
            __next_state = _XPicto::_Matched;
        }
      _M_xpicto_seq_state = __next_state;
    }

    // Count consecutive Regional_Indicator code points seen since the
    // start of the cluster; any other property restarts the count.
    constexpr void
    _M_update_ri_count(_Gcb_property __p)
    {
      if (__p == _Gcb_property::_Gcb_Regional_Indicator)
        ++_M_RI_count;
      else
        _M_RI_count = 0;
    }

    // Remember whether any code point seen since the last reset is an
    // InCB linker. The property argument is unused.
    constexpr void
    _M_update_incb_state(char32_t __c, _Gcb_property)
    {
      if (__is_incb_linker(__c))
        _M_incb_linker_seen = true;
    }
  };


  // Split a range into extended grapheme clusters.
  // The view stores only an iterator to the first cluster; that iterator
  // can also produce the end of the underlying range, which serves as the
  // view's sentinel.
  template<ranges::forward_range _View> requires ranges::view<_View>
    class _Grapheme_cluster_view
    : public ranges::view_interface<_Grapheme_cluster_view<_View>>
    {
    public:

      // Wrap __v in a _Utf32_view and keep an iterator to its beginning.
      constexpr
      _Grapheme_cluster_view(_View __v)
      : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
      { }

      // Iterator positioned at the first cluster.
      constexpr auto begin() const { return _M_begin; }
      // End of the underlying range, as reported by the stored iterator.
      constexpr auto end() const { return _M_begin.end(); }

    private:
      // Forward iterator that steps over the code points one cluster at a
      // time. The per-cluster state lives in the private base class.
      struct _Iterator : private _Grapheme_cluster_iterator_base
      {
      private:
        // Iterator over the underlying code points.
        using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;

      public:
        // TODO: Change value_type to be subrange<_U32_iterator> instead?
        // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
        // That would be the whole cluster, not just the first code point.
        // Would need to store two iterators and find end of current cluster
        // on increment, so operator* returns value_type(_M_base, _M_next).
        using value_type = char32_t;
        using iterator_concept = forward_iterator_tag;
        using difference_type = ptrdiff_t;

        // Start at __i. If __i is not at the end, record the first code
        // point and its GCB property; otherwise _M_c and _M_prop are not
        // assigned here.
        constexpr
        _Iterator(_U32_iterator __i)
        : _M_base(__i)
        {
          if (__i != __i.end())
            {
              _M_c = *__i;
              _M_prop = __grapheme_cluster_break_property(_M_c);
            }
        }

        // The first code point of the current extended grapheme cluster.
        constexpr value_type
        operator*() const
        { return _M_c; }

        // Pointer to the stored first code point of the current cluster.
        constexpr auto
        operator->() const
        { return &_M_c; }

        // Move to the next extended grapheme cluster.
        // Scans forward from the current position, updating the rule state
        // for every code point, until _M_is_break reports a boundary or the
        // end of the range is reached. Does nothing when already at the end.
        constexpr _Iterator&
        operator++()
        {
          const auto __end = _M_base.end();
          if (_M_base != __end)
            {
              // Property of the code point just before the one examined.
              auto __p_prev = _M_prop;
              auto __it = _M_base;
              while (++__it != __end)
                {
                  char32_t __c = *__it;
                  auto __p = __grapheme_cluster_break_property(*__it);
                  // The state must be updated before testing for a break,
                  // because _M_is_break reads it.
                  _M_update_xpicto_seq_state(__c, __p);
                  _M_update_ri_count(__p);
                  _M_update_incb_state(__c, __p);
                  if (_M_is_break(__p_prev, __p, __it))
                    {
                      // Found a grapheme cluster break
                      _M_reset(__c, __p);
                      break;
                    }
                  __p_prev = __p;
                }
              // Either the start of the next cluster, or the end.
              _M_base = __it;
            }
          return *this;
        }

        // Post-increment: advance and return the previous position.
        constexpr _Iterator
        operator++(int)
        {
          auto __tmp = *this;
          ++*this;
          return __tmp;
        }

        // Two iterators are equal when they refer to the same position in
        // the underlying code point sequence.
        constexpr bool
        operator==(const _Iterator& __i) const
        { return _M_base == __i._M_base; }

        // This supports iter != iter.end()
        constexpr bool
        operator==(const ranges::sentinel_t<_View>& __i) const
        { return _M_base == __i; }

        // Iterator to the start of the current cluster.
        constexpr auto base() const { return _M_base.base(); }

        // The end of the underlying view (not the end of the current cluster!)
        constexpr auto end() const { return _M_base.end(); }

        // Field width of the first code point in the cluster.
        constexpr int
        width() const noexcept
        { return __field_width(_M_c); }

      private:
        // Position of the first code point of the current cluster.
        _U32_iterator _M_base;

        // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
        // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
        // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
        // NOTE(review): the enclosing inline namespace is __v16_0_0 and the
        // data is requested with _GLIBCXX_GET_UNICODE_DATA 160000; confirm
        // whether the revision named above is still the intended one.
        // Return true if there is a break between code point with property p1
        // and code point with property p2.
        // __curr refers to the code point that has property p2. The rules
        // are tested in order and the first one that applies decides.
        constexpr bool
        _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
                    _U32_iterator __curr) const
        {
          using enum _Gcb_property;

          // Rules GB3 and GB4
          if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
            return true; // Break after Control or LF.

          if (__p1 == _Gcb_CR)
            return __p2 != _Gcb_LF; // Do not break between a CR and LF.

          // Rule GB5
          if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
            return true; // Break before Control, CR or LF.

          // Rule GB6
          if (__p1 == _Gcb_L)
            switch (__p2)
            {
              case _Gcb_L:
              case _Gcb_V:
              case _Gcb_LV:
              case _Gcb_LVT:
                return false; // Do not break Hangul syllable sequences.
              default:
                return true;
              }

          // Rule GB7
          if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
            switch (__p2)
            {
              case _Gcb_V:
              case _Gcb_T:
                return false; // Do not break Hangul syllable sequences.
              default:
                return true;
              }

          // Rule GB8
          if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
            return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.

          // Rule GB9
          if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
            return false; // Do not break before extending characters or ZWJ.

          // The following GB9x rules only apply to extended grapheme clusters,
          // which is what the C++ standard uses (not legacy grapheme clusters).

          // Rule GB9a
          if (__p2 == _Gcb_SpacingMark)
            return false; // Do not break before SpacingMarks,
          // Rule GB9b
          if (__p1 == _Gcb_Prepend)
            return false; // or after Prepend characters.

          // Rule GB9c (Unicode 15.1.0)
          // Do not break within certain combinations with
          // Indic_Conjunct_Break (InCB)=Linker.
          // The cheap flag test comes first so that the rescan below only
          // happens for clusters that contain a linker and both start with
          // and are followed by a consonant.
          if (_M_incb_linker_seen
                && __incb_property(_M_c) == _InCB::_Consonant
                && __incb_property(*__curr) == _InCB::_Consonant)
            {
              // Match [_M_base, __curr] against regular expression
              // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
              // __have_linker records whether a linker has been seen since
              // the most recent consonant.
              bool __have_linker = false;
              auto __it = _M_base;
              while (++__it != __curr)
                {
                  if (__is_incb_linker(*__it))
                    __have_linker = true;
                  else
                    {
                      auto __incb = __incb_property(*__it);
                      if (__incb == _InCB::_Consonant)
                        __have_linker = false;
                      else if (__incb != _InCB::_Extend)
                        break;
                    }
                }
              // Only a scan that reached __curr with a linker pending
              // suppresses the break.
              if (__it == __curr && __have_linker)
                return false;
            }

          // Rule GB11
          // Do not break within emoji modifier sequences
          // or emoji zwj sequences.
          if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
            return false;

          // Rules GB12 and GB13
          // Do not break within emoji flag sequences. That is, do not break
          // between regional indicator (RI) symbols if there is an odd number
          // of RI characters before the break point.
          if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
            return (_M_RI_count & 1) == 0;

          // Rule GB999
          return true; // Otherwise, break everywhere.
        }
      };

      // Iterator to the first cluster; also the source of end().
      _Iterator _M_begin;
    };


} // namespace __v16_0_0


  // Return the field width of a string: the sum, over its extended
  // grapheme clusters, of the width of each cluster's first code point.
  // An empty string has width 0.
  template<typename _CharT>
    constexpr size_t
    __field_width(basic_string_view<_CharT> __s)
    {
      if (__s.empty()) [[unlikely]]
        return 0;

      _Grapheme_cluster_view<basic_string_view<_CharT>> __clusters(__s);
      auto __cur = __clusters.begin();
      const auto __last = __clusters.end();

      // The string is non-empty, so there is at least one cluster.
      size_t __total = 0;
      do
        __total += __cur.width();
      while (++__cur != __last);
      return __total;
    }


  // Shorten `__s` so that its field width does not exceed `__max`, cutting
  // only at grapheme cluster boundaries, and return the field width of
  // what remains. `__s` is left unchanged when it already fits.
  template<typename _CharT>
    constexpr size_t
    __truncate(basic_string_view<_CharT>& __s, size_t __max)
    {
      if (__s.empty()) [[unlikely]]
        return 0;

      _Grapheme_cluster_view<basic_string_view<_CharT>> __clusters(__s);
      auto __cur = __clusters.begin();
      const auto __last = __clusters.end();

      // Width accumulated by the clusters that are kept.
      size_t __used = __cur.width();
      if (__used > __max)
        {
          // Not even the first cluster fits.
          __s = {};
          return 0;
        }

      for (++__cur; __cur != __last; ++__cur)
        {
          const size_t __with_next = __used + __cur.width();
          if (__with_next > __max)
            {
              // Cut just before the cluster that would overflow.
              __s = basic_string_view<_CharT>(__s.begin(), __cur.base());
              break;
            }
          __used = __with_next;
        }
      return __used;
    }


  // Report, at compile time, whether literals of character type _CharT use
  // a Unicode encoding.
  // Always true for char16_t, char32_t and (when available) char8_t. For
  // other character types the answer comes from the compiler's predefined
  // encoding-name macros; if neither GCC's nor Clang's macro is defined the
  // result is false.
  template<typename _CharT>
    consteval bool
    __literal_encoding_is_unicode()
    {
      if constexpr (is_same_v<_CharT, char16_t>)
        return true;
      else if constexpr (is_same_v<_CharT, char32_t>)
          return true;
#ifdef __cpp_char8_t
      else if constexpr (is_same_v<_CharT, char8_t>)
        return true;
#endif

      // Encoding name to inspect; stays empty if no macro supplies one.
      const char* __enc = "";

#ifdef __GNUC_EXECUTION_CHARSET_NAME
      // Skip a leading "ISO-10646/" (10 characters) if present.
      auto __remove_iso10646_prefix = [](const char* __s) {
        // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
        if (__s[0] == 'I' || __s[0] == 'i')
          if (__s[1] == 'S' || __s[1] == 's')
            if (__s[2] == 'O' || __s[2] == 'o')
              if (string_view(__s + 3).starts_with("-10646/"))
                return __s + 10;
        return __s;
      };

      // char uses the narrow execution charset; any other type uses the
      // wide one, when wchar_t support and the wide macro are available.
      if constexpr (is_same_v<_CharT, char>)
        __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
      else
        __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
# endif

      // Accept "UTF" in either case for each letter, then an optional '-'.
      if ((__enc[0] == 'U' || __enc[0] == 'u')
            && (__enc[1] == 'T' || __enc[1] == 't')
            && (__enc[2] == 'F' || __enc[2] == 'f'))
        {
          __enc += 3;
          if (__enc[0] == '-')
            ++__enc;
          // "8", optionally followed by "//", is accepted for any type.
          if (__enc[0] == '8')
            return __enc[1] == '\0' || string_view(__enc + 1) == "//";
          else if constexpr (!is_same_v<_CharT, char>)
            {
              // For non-char types also accept "16" or "32", after
              // dropping an optional "//" and then an optional "LE"/"BE".
              string_view __s(__enc);
              if (__s.ends_with("//"))
                __s.remove_suffix(2);
              if (__s.ends_with("LE") || __s.ends_with("BE"))
                __s.remove_suffix(2);
              return __s == "16" || __s == "32";
            }
        }
#elif defined __clang_literal_encoding__
      if constexpr (is_same_v<_CharT, char>)
        __enc = __clang_literal_encoding__;
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
      else
        __enc = __clang_wide_literal_encoding__;
# endif
      // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
      // So an exact, case-sensitive comparison is used here.
      string_view __s(__enc);
      if (__s == "UTF-8")
        return true;
      else if constexpr (!is_same_v<_CharT, char>)
        return __s == "UTF-16" || __s == "UTF-32";
#endif

      return false;
    }


  // Report, at compile time, whether ordinary (char) literals are UTF-8.
  // For char, __literal_encoding_is_unicode only accepts a UTF-8 name.
  consteval bool
  __literal_encoding_is_utf8()
  {
    constexpr bool __char_is_utf8 = __literal_encoding_is_unicode<char>();
    return __char_is_utf8;
  }


  // Report, at compile time, whether the ordinary literal encoding places
  // '0', 'A', 'Z', 'a' and 'z' at their ASCII code values.
  consteval bool
  __literal_encoding_is_extended_ascii()
  {
    constexpr bool __digit_ok = '0' == 0x30;
    constexpr bool __upper_ok = 'A' == 0x41 && 'Z' == 0x5a;
    constexpr bool __lower_ok = 'a' == 0x61 && 'z' == 0x7a;
    return __digit_ok && __upper_ok && __lower_ok;
  }


  // Compare two charset names using the loose matching rules of
  // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
  //  - characters other than letters and digits are ignored,
  //  - letters compare without regard to case,
  //  - a '0' is ignored unless it is preceded by a digit, where "preceded"
  //    is judged after the ignored characters have been removed.
  // Returns true if the two names are equal under those rules.
  constexpr bool
  __charset_alias_match(string_view __a, string_view __b)
  {
    // Map alphanumeric chars to their base 64 value, everything else to 127.
    // __num records whether the previous character that was not ignored
    // is a digit, which decides whether a following '0' is significant.
    auto __map = [](char __c, bool& __num) -> unsigned char {
      if (__c == '0') [[unlikely]]
        return __num ? 0 : 127;
      const auto __v = __detail::__from_chars_alnum_to_val(__c);
      // An ignored character must not reset the digit state, otherwise
      // the zero in e.g. "62-010" would be dropped even though it follows
      // a digit once the '-' has been removed.
      if (__v != 127)
        __num = __v < 10;
      return __v;
    };

    auto __ptr_a = __a.begin(), __end_a = __a.end();
    auto __ptr_b = __b.begin(), __end_b = __b.end();
    bool __num_a = false, __num_b = false;

    while (true)
      {
        // Find the value of the next alphanumeric character in each string.
        unsigned char __val_a{}, __val_b{};
        while (__ptr_a != __end_a
                 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
          ++__ptr_a;
        while (__ptr_b != __end_b
                 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
          ++__ptr_b;
        // Stop when we reach the end of a string, or get a mismatch.
        if (__ptr_a == __end_a)
          return __ptr_b == __end_b;
        else if (__ptr_b == __end_b)
          return false;
        else if (__val_a != __val_b)
          return false; // Found non-matching characters.
        ++__ptr_a;
        ++__ptr_b;
      }
    return true;
  }


} // namespace __unicode


namespace ranges

{

  // A _Utf_view is a borrowed range exactly when the view it wraps is one.
  template<typename _ToFormat, typename _View>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Utf_view<_ToFormat, _View>>
      = enable_borrowed_range<_View>;

  // Likewise, a _Grapheme_cluster_view is borrowed exactly when the view
  // it was built from is borrowed.
  template<typename _View>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_View>>
      = enable_borrowed_range<_View>;

} // namespace ranges


_GLIBCXX_END_NAMESPACE_VERSION

} // namespace std

#endif // C++20

#endif // _GLIBCXX_UNICODE_H

array

bit

charconv

string_view

cstdint

ranges_base.h

ranges_util.h

stl_algo.h

unicode-data.h

std::move
constexpr std::remove_reference< _Tp >::type && move(_Tp &&__t) noexcept
Convert a value to an rvalue.
Definition move.h:138

std::end
_Tp * end(valarray< _Tp > &__va) noexcept
Return an iterator pointing to one past the last element of the valarray.
Definition valarray:1251

std
ISO C++ entities toplevel namespace is std.

std::advance
constexpr void advance(_InputIterator &__i, _Distance __n)
A generalization of pointer arithmetic.
Definition stl_iterator_base_funcs.h:262

stl_iterator.h