doxygen/bde_api_prod/bdlde__utf8util_8h_source.html

/// @file bdlde_utf8util.h

///

/// The content of this file has been pre-processed for Doxygen.

///


// bdlde_utf8util.h                                                   -*-C++-*-

#ifndef INCLUDED_BDLDE_UTF8UTIL

#define INCLUDED_BDLDE_UTF8UTIL


#include <bsls_ident.h>

BSLS_IDENT("$Id: $")


/// @defgroup bdlde_utf8util bdlde_utf8util

/// @brief  Provide basic utilities for UTF-8 encodings.

/// @addtogroup bdl

/// @{

/// @addtogroup bdlde

/// @{

/// @addtogroup bdlde_utf8util

/// @{

///

/// <h1> Outline </h1>

/// * <a href="#bdlde_utf8util-purpose"> Purpose</a>

/// * <a href="#bdlde_utf8util-classes"> Classes </a>

/// * <a href="#bdlde_utf8util-description"> Description </a>

///   * <a href="#bdlde_utf8util-empty-input-strings"> Empty Input Strings </a>

///   * <a href="#bdlde_utf8util-usage"> Usage </a>

///     * <a href="#bdlde_utf8util-example-1-validating-strings-and-counting-unicode-code-points"> Example 1: Validating Strings and Counting Unicode Code Points </a>

///     * <a href="#bdlde_utf8util-example-2-advancing-over-a-given-number-of-code-points"> Example 2: Advancing Over a Given Number of Code Points </a>

///     * <a href="#bdlde_utf8util-example-3-validating-utf-8-read-from-a-bsl-streambuf"> Example 3: Validating UTF-8 Read from a bsl::streambuf </a>

///

/// # Purpose {#bdlde_utf8util-purpose}

/// Provide basic utilities for UTF-8 encodings.

///

/// # Classes {#bdlde_utf8util-classes}

///

/// -  bdlde::Utf8Util: namespace for utilities for UTF-8 encodings

///

/// # Description {#bdlde_utf8util-description}

/// This component provides, within the `bdlde::Utf8Util` `struct`,

/// a suite of static functions supporting UTF-8 encoded strings.  Two

/// interfaces are provided for each function, one where the length of the

/// string (in *bytes*) is passed as a separate argument, and one where the

/// string is passed as a null-terminated C-style string.

///

/// A string is deemed to contain valid UTF-8 if it is compliant with RFC 3629,

/// meaning that only 1-, 2-, 3-, and 4-byte sequences are allowed.  Values

/// above `U+10ffff` are also not allowed.

///

/// Seven types of functions are provided:

///

/// * `isValid`, which checks for validity, per RFC 3629, of a (candidate)

///   UTF-8 string.  "Overlong values", that is, values encoded in more bytes

///   than necessary, are not tolerated; nor are "surrogate values", which are

///   values in the range `[U+d800 .. U+dfff]`.

/// * `advanceIfValid` and `advanceRaw`, which advance some number of Unicode

///   code points, each of which may be encoded in multiple bytes in a UTF-8

///   string.  `advanceRaw` assumes the string is valid UTF-8, while

///   `advanceIfValid` checks the input for validity and stops advancing if a

///   sequence is encountered that is not valid UTF-8.

/// * `numCodePointsIfValid` and `numCodePointsRaw`, which return the number of

///   Unicode code points in a UTF-8 string.  Note that `numCodePointsIfValid`

///   both validates a (candidate) UTF-8 string and counts the number of

///   Unicode code points that it contains.

/// * `numBytesIfValid`, which returns the number of bytes a specified number

///   of Unicode code points occupy in a UTF-8 string.

/// * `numBytesInCodePoint`, which returns the length of a single UTF-8 encoded

///   character.

/// * `codePointValue`, which returns the integral value of a single UTF-8

///   encoded character.

/// * `appendUtf8CodePoint`, which appends a single Unicode code point to a

///   UTF-8 string.

///

/// Embedded null bytes are allowed in strings that are accompanied by an

/// explicit length argument.  Naturally, null-terminated C-style strings cannot

/// contain embedded null code points.

///

/// The UTF-8 format is described in the RFC 3629 document at:

/// @code

/// http://tools.ietf.org/html/rfc3629

/// @endcode

/// and in Wikipedia at:

/// @code

/// http://en.wikipedia.org/wiki/Utf-8

/// @endcode

///

/// ## Empty Input Strings {#bdlde_utf8util-empty-input-strings}

///

///

/// The utility functions provided by this component consider the empty string

/// to be valid UTF-8.  For those functions that take input as a

/// `(pointer, length)` pair, if `0 == pointer` and `0 == length`, then the

/// input is interpreted as a valid, empty string.  However, if `0 == pointer`

/// and `0 != length`, the behavior is undefined.  All such functions have a

/// counterpart that takes a lone pointer to a null-terminated (C-style) string.

/// The behavior is always undefined if 0 is supplied for that lone pointer.

///

/// ## Usage {#bdlde_utf8util-usage}

///

///

/// This section illustrates intended use of this component.

///

/// ### Example 1: Validating Strings and Counting Unicode Code Points {#bdlde_utf8util-example-1-validating-strings-and-counting-unicode-code-points}

///

///

/// In this usage example, we will encode some Unicode code points in UTF-8

/// strings and demonstrate those that are valid and those that are not.

///

/// First, we build an unquestionably valid UTF-8 string:

/// @code

/// bsl::string string;

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x856);

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xfff);

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '.');

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');

/// @endcode

/// Then, we check its validity and measure its length:

/// @code

/// assert(true == bdlde::Utf8Util::isValid(string.data(), string.length()));

/// assert(true == bdlde::Utf8Util::isValid(string.c_str()));

///

/// assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.data(),

///                                                  string.length()));

/// assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.c_str()));

/// @endcode

/// Next, we add a lone surrogate value, `0xd8ab`, which we encode as the raw

/// 3-byte sequence "\xed\xa2\xab" to avoid validation:

/// @code

/// bsl::string stringWithSurrogate = string + "\xed\xa2\xab";

///

/// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.data(),

///                                          stringWithSurrogate.length()));

/// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.c_str()));

/// @endcode

/// Then, we cannot use `numCodePointsRaw` to count the code points in

/// `stringWithSurrogate`, since the behavior of that method is undefined unless

/// the string is valid.  Instead, the `numCodePointsIfValid` method can be used

/// on strings whose validity we are uncertain of:

/// @code

/// const char *invalidPosition = 0;

///

/// bsls::Types::IntPtr rc;

/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,

///                                            stringWithSurrogate.data(),

///                                            stringWithSurrogate.length());

/// assert(rc < 0);

/// assert(bdlde::Utf8Util::k_SURROGATE == rc);

/// assert(invalidPosition == stringWithSurrogate.data() + string.length());

///

/// invalidPosition = 0;  // reset

///

/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,

///                                            stringWithSurrogate.c_str());

/// assert(rc < 0);

/// assert(bdlde::Utf8Util::k_SURROGATE == rc);

/// assert(invalidPosition == stringWithSurrogate.data() + string.length());

/// @endcode

/// Now, we encode 0, which is allowed.  However, note that we cannot use any

/// interfaces that take a null-terminated string for this case:

/// @code

/// bsl::string stringWithNull = string;

/// stringWithNull += '\0';

///

/// assert(true == bdlde::Utf8Util::isValid(stringWithNull.data(),

///                                         stringWithNull.length()));

///

/// assert(  10 == bdlde::Utf8Util::numCodePointsRaw(stringWithNull.data(),

///                                                  stringWithNull.length()));

/// @endcode

/// Finally, we encode `0x3a` (`:`) as an overlong value using 2 bytes, which is

/// not valid UTF-8 (since `:` can be "encoded" in 1 byte):

/// @code

/// bsl::string stringWithOverlong = string;

/// stringWithOverlong += static_cast<char>(0xc0);        // start of 2-byte

///                                                       // sequence

/// stringWithOverlong += static_cast<char>(0x80 | ':');  // continuation byte

///

/// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.data(),

///                                          stringWithOverlong.length()));

/// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.c_str()));

///

/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,

///                                            stringWithOverlong.data(),

///                                            stringWithOverlong.length());

/// assert(rc < 0);

/// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);

/// assert(invalidPosition == stringWithOverlong.data() + string.length());

///

/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,

///                                            stringWithOverlong.c_str());

/// assert(rc < 0);

/// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);

/// assert(invalidPosition == stringWithOverlong.data() + string.length());

/// @endcode

///

/// ### Example 2: Advancing Over a Given Number of Code Points {#bdlde_utf8util-example-2-advancing-over-a-given-number-of-code-points}

///

///

/// In this example, we will use the various `advance` functions to advance

/// through a UTF-8 string.

///

/// First, build the string using `appendUtf8CodePoint`, keeping track of how

/// many bytes are in each Unicode code point:

/// @code

/// bsl::string string;

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);        // 3 bytes

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1ff);         // 2 bytes

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');           // 1 byte

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);      // 4 bytes

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);       // 4 bytes

/// string += "\xe3\x8f\xfe";           // 3 bytes (invalid 3-byte sequence,

///                                     // the first 2 bytes are valid but the

///                                     // last continuation byte is invalid)

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');           // 1 byte

/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');          // 1 byte

/// @endcode

/// Then, declare a few variables we'll need:

/// @code

/// bsls::Types::IntPtr  rc;

/// int                  status;

/// const char          *result;

/// const char *const start = string.c_str();

/// @endcode

/// Next, try advancing 2 code points, then 3, then 4, observing that the value

/// returned is the number of Unicode code points advanced.  Note that since

/// we're only advancing over valid UTF-8, we can use either `advanceRaw` or

/// `advanceIfValid`:

/// @code

/// rc = bdlde::Utf8Util::advanceRaw(              &result, start, 2);

/// assert(2 == rc);

/// assert(3 + 2 == result - start);

///

/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 2);

/// assert(0 == status);

/// assert(2 == rc);

/// assert(3 + 2 == result - start);

///

/// rc = bdlde::Utf8Util::advanceRaw(             &result, start, 3);

/// assert(3 == rc);

/// assert(3 + 2 + 1 == result - start);

///

/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 3);

/// assert(0 == status);

/// assert(3 == rc);

/// assert(3 + 2 + 1 == result - start);

///

/// rc = bdlde::Utf8Util::advanceRaw(             &result, start, 4);

/// assert(4 == rc);

/// assert(3 + 2 + 1 + 4 == result - start);

///

/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 4);

/// assert(0 == status);

/// assert(4 == rc);

/// assert(3 + 2 + 1 + 4 == result - start);

/// @endcode

/// Then, try advancing by more code points than are present using

/// `advanceIfValid`, and wind up stopping when we encounter invalid input.  The

/// behavior of `advanceRaw` is undefined if it is used on invalid input, so we

/// cannot use it here.  Also note that we will stop at the beginning of the

/// invalid Unicode code point, and not at the first incorrect byte, which is

/// two bytes later:

/// @code

/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);

/// assert(0 != status);

/// assert(5 == rc);

/// assert(3 + 2 + 1 + 4 + 4                 == result - start);

/// assert(static_cast<int>(string.length()) >  result - start);

/// @endcode

/// Now, doctor the string to replace the invalid code point with a valid one,

/// so the string is entirely correct UTF-8:

/// @code

/// string[3 + 2 + 1 + 4 + 4 + 2] = static_cast<char>(0x8a);

/// @endcode

/// Finally, advance using both functions by more code points than are in the

/// string and in both cases wind up at the end of the string.  Note that

/// `advanceIfValid` does not return an error (non-zero) value to `status` when

/// it encounters the end of the string:

/// @code

/// rc = bdlde::Utf8Util::advanceRaw(             &result, start, INT_MAX);

/// assert(8 == rc);

/// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);

/// assert(static_cast<int>(string.length()) == result - start);

///

/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);

/// assert(0 == status);

/// assert(8 == rc);

/// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);

/// assert(static_cast<int>(string.length()) == result - start);

/// @endcode

///

/// ### Example 3: Validating UTF-8 Read from a bsl::streambuf {#bdlde_utf8util-example-3-validating-utf-8-read-from-a-bsl-streambuf}

///

///

/// In this usage example, we will demonstrate reading and validating UTF-8

/// from a stream.

///

/// We write a function to read valid UTF-8 to a `bsl::string`.  We don't know

/// how long the input will be, so we don't know how long to make the string

/// before we start.  We will grow the string in small, 32-byte increments.

/// @code

/// /// Read valid UTF-8 from the specified streambuf `sb` to the specified

/// /// `output`.  Return 0 if the input was exhausted without encountering

/// /// any invalid UTF-8, and a non-zero value otherwise.  If invalid UTF-8

/// /// is encountered, log a message describing the problem after loading

/// /// all the valid UTF-8 preceding it into `output`.  Note that after the

/// /// call, in no case will `output` contain any invalid UTF-8.

/// int utf8StreambufToString(bsl::string    *output,

///                           bsl::streambuf *sb)

/// {

///     enum { k_READ_LENGTH = 32 };

///

///     output->clear();

///     while (true) {

///         bsl::size_t len = output->length();

///         output->resize(len + k_READ_LENGTH);

///         int status;

///         bsls::Types::IntPtr numBytes = bdlde::Utf8Util::readIfValid(

///                                                            &status,

///                                                            &(*output)[len],

///                                                            k_READ_LENGTH,

///                                                            sb);

///         BSLS_ASSERT(0 <= numBytes);

///         BSLS_ASSERT(numBytes <= k_READ_LENGTH);

///

///         output->resize(len + numBytes);

///         if (0 < status) {

///             // Buffer was full before the end of input was encountered.

///             // Note that `numBytes` may be up to 3 bytes less than

///             // `k_READ_LENGTH`.

///

///             BSLS_ASSERT(k_READ_LENGTH - 4 < numBytes);

///

///             // Go on to grow the string and get more input.

///

///             continue;

///         }

///         else if (0 == status) {

///             // Success!  We've reached the end of input without

///             // encountering any invalid UTF-8.

///

///             return 0;                                             // RETURN

///         }

///         else {

///             // Invalid UTF-8 encountered; the value of `status` indicates

///             // the exact nature of the problem.  `numBytes` returned from

///             // the above call indicated the number of valid UTF-8 bytes

///             // read before encountering the invalid UTF-8.

///

///             BSLS_LOG_ERROR("Invalid UTF-8 error %s at position %u.\n",

///                            bdlde::Utf8Util::toAscii(status),

///                            static_cast<unsigned>(output->length()));

///

///             return -1;                                            // RETURN

///         }

///     }

/// }

/// @endcode

/// @}

/** @} */

/** @} */


/** @addtogroup bdl

 * @{

 */

/** @addtogroup bdlde

 * @{

 */

/** @addtogroup bdlde_utf8util

 * @{

 */


#include <bdlscm_version.h>


#include <bsls_assert.h>

#include <bsls_libraryfeatures.h>

#include <bsls_review.h>

#include <bsls_types.h>


#include <bsl_cstddef.h>

#include <bsl_iosfwd.h>

#include <bsl_streambuf.h>

#include <bsl_string.h>


#include <string>


namespace bdlde {

                              // ===============

                              // struct Utf8Util

                              // ===============


/// This struct provides a namespace for static methods used for validating

/// UTF-8 strings, for counting the number of Unicode code points in them,

/// for advancing pointers through UTF-8 strings by a specified number of

/// Unicode code points, for counting the number of bytes a UTF-8 leading

/// substring occupies, for counting the number of bytes in a UTF-8

/// character, and for appending a Unicode character to a UTF-8 string.


struct Utf8Util {


    // PUBLIC TYPES

    // Type in which the `length` (in bytes) of a string is passed to the
    // methods of this utility.
    typedef bsls::Types::size_type size_type;

    // Type used for counts of Unicode code points (e.g., the
    // `numCodePoints` arguments and the values returned by the `advance*`
    // methods).
    typedef bsls::Types::IntPtr    IntPtr;

    // Type used by `getLineAndColumnNumber` for line numbers, column
    // numbers, and byte offsets.
    typedef bsls::Types::Uint64    Uint64;


    /// Enumerate the error status values that are returned (possibly

    /// through an out parameter) from some methods in this utility.  Note

    /// that some of the functions in this `struct` have a return value

    /// that is non-negative on success, and one of these values when an

    /// error occurs, so all of these values must be negative to distinguish

    /// them from a "success" value.


    // Negative status codes, numbered consecutively from -1 down to -7, each
    // identifying one distinct reason a byte sequence is not valid UTF-8.
    enum ErrorStatus {


        k_END_OF_INPUT_TRUNCATION       = -1,

           // The end of input was reached partway through a multibyte UTF-8

           // sequence, i.e., the final code point is incomplete.


        k_UNEXPECTED_CONTINUATION_OCTET = -2,

           // A continuation byte was encountered when not within a multibyte

           // sequence (where the first byte of a code point was expected).


        k_NON_CONTINUATION_OCTET        = -3,

           // A non-continuation byte was encountered where a continuation byte

           // was expected (within a multibyte sequence).


        k_OVERLONG_ENCODING             = -4,

           // The encoded Unicode value could have been encoded in a sequence

           // of fewer bytes (an "overlong value").


        k_INVALID_INITIAL_OCTET         = -5,

           // A sequence began with an octet with its 5 highest-order bits all

           // set (i.e., in the range `[0xf8 .. 0xff]`), which is always invalid

           // in UTF-8.


        k_VALUE_LARGER_THAN_0X10FFFF    = -6,

           // A value larger than 0x10FFFF, the largest value this component

           // accepts as valid UTF-8, was encoded.


        k_SURROGATE                     = -7

           // Illegal occurrence of Unicode code point reserved for surrogate

           // values in UTF-16 (the range `[U+d800 .. U+dfff]`).  Note that all

           // surrogate values are illegal as Unicode code points.

    };


    // CLASS METHODS


    /// Advance past 0 or more consecutive *valid* Unicode code points at

    /// the beginning of the specified `string`, until either the specified

    /// `numCodePoints` have been traversed, or the terminating null byte or

    /// invalid UTF-8 is encountered (whichever occurs first), and return

    /// the number of Unicode code points traversed.  Set the specified

    /// `*status` to 0 if no invalid UTF-8 is encountered, and to a value

    /// from the `ErrorStatus` `enum` otherwise.  Set the specified

    /// `*result` to the address of the byte immediately following the last

    /// valid code point traversed, or to `string` if `string` is empty or

    /// `numCodePoints` is 0.  `string` is necessarily null-terminated, so

    /// it cannot contain embedded null bytes.  The behavior is undefined

    /// unless `0 <= numCodePoints`.  Note that the value returned will be

    /// in the range `[0 .. numCodePoints]`.  Also note that `string` may

    /// contain fewer than `bsl::strlen(string)` Unicode code points.

    static IntPtr advanceIfValid(int         *status,

                                 const char **result,

                                 const char  *string,

                                 IntPtr       numCodePoints);


    /// Advance past 0 or more consecutive *valid* Unicode code points at

    /// the beginning of the specified `string` having the specified

    /// `length` (in bytes), until the specified `numCodePoints` code points or

    /// `length` bytes have been traversed, or invalid UTF-8 is encountered

    /// (whichever occurs first), and return the number of Unicode code

    /// points traversed.  Set the specified `*status` to 0 if no invalid

    /// UTF-8 is encountered, and to a value from the `ErrorStatus` `enum`

    /// otherwise.  Set the specified `*result` to the address of the byte

    /// immediately following the last valid code point traversed, or to

    /// `string` if `length` or `numCodePoints` is 0.  `string` need not be

    /// null-terminated and can contain embedded null bytes, and `string`

    /// may be null if `0 == length` (see {Empty Input Strings}).  The

    /// behavior is undefined unless `0 <= numCodePoints`.  Note that the

    /// value returned will be in the range `[0 .. numCodePoints]`.  Also

    /// note that `string` may contain fewer than `length` Unicode code

    /// points.

    static IntPtr advanceIfValid(int         *status,

                                 const char **result,

                                 const char  *string,

                                 size_type    length,

                                 IntPtr       numCodePoints);


    /// Advance past 0 or more consecutive *valid* Unicode code points at

    /// the beginning of the specified `string`, until either the specified

    /// `numCodePoints` code points or the whole `string` have been traversed,

    /// or invalid UTF-8 is encountered (whichever occurs first), and return

    /// the number of Unicode code points traversed.  Set the specified

    /// `*status` to 0 if no invalid UTF-8 is encountered, and to a value

    /// from the `ErrorStatus` `enum` otherwise.  Set the specified

    /// `*result` to the address of the byte immediately following the last

    /// valid code point traversed, or to `string` if its length or

    /// `numCodePoints` is 0.  `string` need not be null-terminated and can

    /// contain embedded null bytes.  The behavior is undefined unless

    /// `0 <= numCodePoints`.  Note that the value returned will be in the

    /// range `[0 .. numCodePoints]`.  Also note that `string` may contain

    /// fewer than `string.length()` Unicode code points.

    static IntPtr advanceIfValid(int                      *status,

                                 const char              **result,

                                 const bsl::string_view&   string,

                                 IntPtr                    numCodePoints);


    /// Advance past 0 or more consecutive Unicode code points at the

    /// beginning of the specified `string`, until either the specified

    /// `numCodePoints` code points have been traversed or the terminating null

    /// byte is encountered (whichever occurs first), and return the number

    /// of Unicode code points traversed.  Set the specified `*result` to

    /// the address of the byte immediately following the last code point

    /// traversed, or to `string` if `string` is empty or `numCodePoints` is

    /// 0.  `string` is necessarily null-terminated, so it cannot contain

    /// embedded null bytes.  The behavior is undefined unless `string`

    /// contains valid UTF-8 and `0 <= numCodePoints`.  Note that the value

    /// returned will be in the range `[0 .. numCodePoints]`.  Also note

    /// that `string` may contain fewer than `bsl::strlen(string)` Unicode

    /// code points.

    static IntPtr advanceRaw(const char **result,

                             const char  *string,

                             IntPtr       numCodePoints);


    /// Advance past 0 or more consecutive Unicode code points at the

    /// beginning of the specified `string` having the specified `length`

    /// (in bytes), until the specified `numCodePoints` code points or `length`

    /// bytes have been traversed (whichever occurs first), and return the

    /// number of Unicode code points traversed.  Set the specified

    /// `*result` to the address of the byte immediately following the last

    /// code point traversed, or to `string` if `length` or `numCodePoints`

    /// is 0.  `string` need not be null-terminated and can contain embedded

    /// null bytes, and `string` may be null if `0 == length` (see {Empty

    /// Input Strings}).  The behavior is undefined unless the initial

    /// `length` bytes of `string` contain valid UTF-8 and

    /// `0 <= numCodePoints`.  Note that the value returned will be in the

    /// range `[0 .. numCodePoints]`.  Also note that `string` may contain

    /// fewer than `length` Unicode code points.

    static IntPtr advanceRaw(const char **result,

                             const char  *string,

                             size_type    length,

                             IntPtr       numCodePoints);


    /// Advance past 0 or more consecutive Unicode code points at the

    /// beginning of the specified `string`, until either the specified

    /// `numCodePoints` code points or the whole string have been traversed

    /// (whichever occurs first), and return the number of Unicode code

    /// points traversed.  Set the specified `*result` to the address of the

    /// byte immediately following the last code point traversed, or to

    /// `string` if its length or `numCodePoints` is 0.  `string` need not be

    /// null-terminated and can contain embedded null bytes.  The behavior

    /// is undefined unless `string` contains only valid UTF-8 characters

    /// and `0 <= numCodePoints`.  Note that the value returned will be in

    /// the range `[0 .. numCodePoints]`.  Also note that `string` may

    /// contain fewer than `string.length()` Unicode code points.

    static IntPtr advanceRaw(const char              **result,

                             const bsl::string_view&   string,

                             IntPtr                    numCodePoints);


    /// @deprecated  Use @ref appendUtf8CodePoint instead.

    ///

    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to

    /// the specified `output` string.  Return 0 on success, and a non-zero

    /// value otherwise.

    static int appendUtf8Character(bsl::string  *output,

                                   unsigned int  codePoint);


    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
    /// the specified `output` string.  Return 0 on success, and a non-zero
    /// value otherwise.
    static int appendUtf8CodePoint(bsl::string  *output,

                                   unsigned int  codePoint);

    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to

    /// the specified `output` string.  Return 0 on success, and a non-zero

    /// value otherwise.

    static int appendUtf8CodePoint(std::string  *output,

                                   unsigned int  codePoint);

#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING

    /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
    /// the specified `output` string.  Return 0 on success, and a non-zero
    /// value otherwise.  Note that this overload is declared only when
    /// `BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING` is defined.
    static int appendUtf8CodePoint(std::pmr::string  *output,

                                   unsigned int       codePoint);

#endif


    /// Return the numeric value of the UTF-8-encoded code point beginning

    /// at the specified `codePoint`.  The behavior is undefined unless

    /// `codePoint` is the address of the first byte of a valid UTF-8

    /// encoded character.

    static int codePointValue(const char *codePoint);


    /// @deprecated  Use @ref numBytesInCodePoint instead.

    ///

    /// Return the length (in bytes) of the UTF-8-encoded code point

    /// beginning at the specified `codePoint`.  The behavior is undefined

    /// unless `codePoint` is the address of the first byte of a valid UTF-8

    /// encoded character.  Note that the value returned will be in the

    /// range `[1 .. 4]`.  Also note that 1 is returned if `0 == *codePoint`

    /// since '\0' is a valid 1-byte encoding.

    static int getByteSize(const char *codePoint);


    /// Return the length (in bytes) of the UTF-8-encoded code point

    /// beginning at the specified `codePoint`.  The behavior is undefined

    /// unless `codePoint` is the address of the first byte of a valid UTF-8

    /// encoded character.  Note that the value returned will be in the

    /// range `[1 .. 4]`.  Also note that 1 is returned if `0 == *codePoint`

    /// since '\0' is a valid 1-byte encoding.

    static int numBytesInCodePoint(const char *codePoint);


    /// For the specified `byteOffset` in the specified `input`, load the

    /// offset's line number into the specified `lineNumber`, the column

    /// number into the specified `utf8Column`, and the byte offset for the

    /// start of the line into the specified `startOfLineByteOffset`, with

    /// lines delimited by '\n'.  Return 0 on success, or a non-zero value if

    /// `byteOffset` cannot be found in `input` or if `input` contains

    /// non-UTF-8 characters.  The `utf8Column` is the number of UTF-8 code

    /// points between `startOfLineByteOffset` and `byteOffset`.

    static int getLineAndColumnNumber(Uint64         *lineNumber,

                                      Uint64         *utf8Column,

                                      Uint64         *startOfLineByteOffset,

                                      bsl::streambuf *input,

                                      Uint64          byteOffset);

    /// For the specified `byteOffset` in the specified `input`, load the

    /// offset's line number into the specified `lineNumber`, the column

    /// number into the specified `utf8Column`, and the byte offset for the

    /// start of the line into the specified `startOfLineByteOffset`, using

    /// the specified `lineDelimeter` as the line separator.  Return 0 on

    /// success, or a non-zero value if `byteOffset` cannot be found in

    /// `input` or if `input` contains non-UTF-8 characters.  The

    /// `utf8Column` is the number of UTF-8 code points between

    /// `startOfLineByteOffset` and `byteOffset`.

    static int getLineAndColumnNumber(Uint64         *lineNumber,

                                      Uint64         *utf8Column,

                                      Uint64         *startOfLineByteOffset,

                                      bsl::streambuf *input,

                                      Uint64          byteOffset,

                                      char            lineDelimeter);


    /// Return `true` if the specified `string` contains valid UTF-8, and

    /// `false` otherwise.  `string` is necessarily null-terminated, so it

    /// cannot contain embedded null bytes.

    static bool isValid(const char *string);


    /// Return `true` if the specified `string` having the specified

    /// `length` (in bytes) contains valid UTF-8, and `false` otherwise.

    /// `string` need not be null-terminated and can contain embedded null

    /// bytes, and `string` may be null if `0 == length` (see {Empty Input

    /// Strings}).

    static bool isValid(const char *string, size_type length);


    /// Return `true` if the specified `string` contains valid UTF-8, and

    /// `false` otherwise.  `string` need not be null-terminated and can

    /// contain embedded null bytes.

    static bool isValid(const bsl::string_view& string);


    /// Return `true` if the specified `string` contains valid UTF-8, and

    /// `false` otherwise.  If `string` contains invalid UTF-8, load into

    /// the specified `invalidString` the address of the beginning of the

    /// first invalid UTF-8 sequence encountered; `invalidString` is

    /// unaffected if `string` contains only valid UTF-8.  `string` is

    /// necessarily null-terminated, so it cannot contain embedded null

    /// bytes.

    static bool isValid(const char **invalidString, const char *string);


    /// Return `true` if the specified `string` having the specified

    /// `length` (in bytes) contains valid UTF-8, and `false` otherwise.  If

    /// `string` contains invalid UTF-8, load into the specified

    /// `invalidString` the address of the byte after the last valid code

    /// point traversed; `invalidString` is unaffected if `string` contains

    /// only valid UTF-8.  `string` need not be null-terminated and can

    /// contain embedded null bytes, and `string` may be null if

    /// `0 == length` (see {Empty Input Strings}).

    static bool isValid(const char **invalidString,

                        const char  *string,

                        size_type    length);


    /// Return `true` if the specified `string` contains only valid UTF-8

    /// characters, and `false` otherwise.  If `string` contains invalid

    /// UTF-8, load into the specified `invalidString` the address of the

    /// byte after the last valid code point traversed; `invalidString` is

    /// unaffected if `string` contains only valid UTF-8.  `string` need not

    /// be null-terminated and can contain embedded null bytes.

    static bool isValid(const char              **invalidString,

                        const bsl::string_view&   string);


    /// If the specified `codePoint` (having at least the specified

    /// `numBytes`) refers to a valid UTF-8 code point then return `true`

    /// and load the specified `status` with the number of bytes in the

    /// code-point; otherwise, if `codePoint` is not a valid code-point,

    /// return `false` and load `status` with one of the (negative)

    /// `ErrorStatus` constants.  The behavior is undefined unless

    /// `numBytes > 0`.

    static bool isValidCodePoint(int        *status,

                                 const char *codePoint,

                                 size_type   numBytes);


    /// @deprecated  Use @ref numBytesRaw instead.

    ///

    /// Return the length (in bytes) of the specified `numCodePoints` UTF-8

    /// encodings in the specified `string`, or a value less than 0 if

    /// `string` contains less than `numCodePoints` encodings.  The behavior

    /// is undefined unless `string` refers to valid UTF-8.  Note that

    /// `string` may contain more than `numCodePoints` encodings in which

    /// case the trailing ones are ignored.

    static IntPtr numBytesIfValid(const bsl::string_view& string,

                                  IntPtr                  numCodePoints);


    /// Return the length (in bytes) of the specified `numCodePoints` UTF-8

    /// encodings in the specified `string`, or a value less than 0 if

    /// `string` contains less than `numCodePoints` encodings.  The behavior

    /// is undefined unless `string` refers to valid UTF-8.  Note that

    /// `string` may contain more than `numCodePoints` encodings in which

    /// case the trailing ones are ignored.

    static IntPtr numBytesRaw(const bsl::string_view& string,

                              IntPtr                  numCodePoints);


    /// @deprecated  Use @ref numCodePointsRaw instead.

    ///

    /// Return the number of Unicode code points in the specified `string`.

    /// `string` is necessarily null-terminated, so it cannot contain

    /// embedded null bytes.  The behavior is undefined unless `string`

    /// contains valid UTF-8.  Note that `string` may contain less than

    /// `bsl::strlen(string)` Unicode code points.

    static IntPtr numCharacters(const char *string);


    /// @deprecated  Use @ref numCodePointsRaw instead.

    ///

    /// Return the number of Unicode code points in the specified `string`

    /// having the specified `length` (in bytes).  `string` need not be

    /// null-terminated and can contain embedded null bytes, and `string`

    /// may be null if `0 == length` (see {Empty Input Strings}).  The

    /// behavior is undefined unless `string` contains valid UTF-8.  Note

    /// that `string` may contain less than `length` Unicode code points.

    static IntPtr numCharacters(const char *string, size_type length);


    /// @deprecated  Use @ref numCodePointsIfValid instead.

    ///

    /// Return the number of Unicode code points in the specified `string`

    /// if it contains valid UTF-8, with no effect on the specified

    /// `invalidString`.  Otherwise, return a negative value and load into

    /// `invalidString` the address of the byte after the last valid Unicode

    /// code point traversed.  `string` is necessarily null-terminated, so

    /// it cannot contain embedded null bytes.  Note that `string` may

    /// contain less than `bsl::strlen(string)` Unicode code points.

    static IntPtr numCharactersIfValid(const char **invalidString,

                                       const char  *string);


    /// @deprecated  Use @ref numCodePointsIfValid instead.

    ///

    /// Return the number of Unicode code points in the specified `string`

    /// having the specified `length` (in bytes) if `string` contains valid

    /// UTF-8, with no effect on the specified `invalidString`.  Otherwise,

    /// return a negative value and load into `invalidString` the address of

    /// the byte after the last valid Unicode code point traversed.

    /// `string` need not be null-terminated and may contain embedded null

    /// bytes, and `string` may be null if `0 == length` (see {Empty Input

    /// Strings}).  Note that `string` may contain less than `length`

    /// Unicode code points.

    static IntPtr numCharactersIfValid(const char **invalidString,

                                       const char  *string,

                                       size_type    length);


    /// @deprecated  Use @ref numCodePointsRaw instead.

    ///

    /// Return the number of Unicode code points in the specified `string`.

    /// `string` is necessarily null-terminated, so it cannot contain

    /// embedded null bytes.  The behavior is undefined unless `string`

    /// contains valid UTF-8.  Note that `string` may contain less than

    /// `bsl::strlen(string)` Unicode code points.

    static IntPtr numCharactersRaw(const char *string);


    /// @deprecated  Use @ref numCodePointsRaw instead.

    ///

    /// Return the number of Unicode code points in the specified `string`

    /// having the specified `length` (in bytes).  `string` need not be

    /// null-terminated and can contain embedded null bytes, and `string`

    /// may be null if `0 == length` (see {Empty Input Strings}).  The

    /// behavior is undefined unless `string` contains valid UTF-8.  Note that

    /// `string` may contain less than `length` Unicode code points.

    static IntPtr numCharactersRaw(const char *string, size_type length);


    /// Return the number of Unicode code points in the specified `string`

    /// if it contains valid UTF-8, with no effect on the specified

    /// `invalidString`.  Otherwise, return a value from the `ErrorStatus`

    /// `enum` (which are all negative) and load into `invalidString` the

    /// address of the byte after the last valid Unicode code point

    /// traversed.  `string` is necessarily null-terminated, so it cannot

    /// contain embedded null bytes.  Note that `string` may contain less

    /// than `bsl::strlen(string)` Unicode code points.

    static IntPtr numCodePointsIfValid(const char **invalidString,

                                       const char  *string);


    /// Return the number of Unicode code points in the specified `string`

    /// having the specified `length` (in bytes) if `string` contains valid

    /// UTF-8, with no effect on the specified `invalidString`.  Otherwise,

    /// return a value from the `ErrorStatus` `enum` (which are all

    /// negative) and load into `invalidString` the address of the byte

    /// after the last valid Unicode code point traversed.  `string` need

    /// not be null-terminated and may contain embedded null bytes, and

    /// `string` may be null if `0 == length` (see {Empty Input Strings}).

    /// Note that `string` may contain less than `length` Unicode code

    /// points.

    static IntPtr numCodePointsIfValid(const char **invalidString,

                                       const char  *string,

                                       size_type    length);


    /// Return the number of Unicode code points in the specified `string`

    /// if `string` contains valid UTF-8, with no effect on the specified

    /// `invalidString`.  Otherwise, return a value from the `ErrorStatus`

    /// `enum` (which are all negative) and load into `invalidString` the

    /// address of the byte after the last valid Unicode code point

    /// traversed.  `string` need not be null-terminated and may contain

    /// embedded null bytes.

    static IntPtr numCodePointsIfValid(const char              **invalidString,

                                       const bsl::string_view&   string);


    /// Return the number of Unicode code points in the specified `string`.

    /// `string` is necessarily null-terminated, so it cannot contain

    /// embedded null bytes.  The behavior is undefined unless `string`

    /// contains valid UTF-8.  Note that `string` may contain less than

    /// `bsl::strlen(string)` Unicode code points.

    static IntPtr numCodePointsRaw(const char *string);


    /// Return the number of Unicode code points in the specified `string`

    /// having the specified `length` (in bytes).  `string` need not be

    /// null-terminated and can contain embedded null bytes, and `string`

    /// may be null if `0 == length` (see {Empty Input Strings}).  The

    /// behavior is undefined unless `string` contains valid UTF-8.  Note

    /// that `string` may contain less than `length` Unicode code points.

    static IntPtr numCodePointsRaw(const char *string, size_type length);


    /// Return the number of Unicode code points in the specified `string`.

    /// `string` need not be null-terminated and can contain embedded null

    /// bytes.  The behavior is undefined unless `string` contains valid

    /// UTF-8.

    static IntPtr numCodePointsRaw(const bsl::string_view& string);


    /// Read from the specified `input` and copy *valid* UTF-8 (only) to the

    /// specified `outputBuffer` having the specified `outputBufferLength`

    /// (in bytes).  Load the specified `status` with:

    /// * 0 if `input` reached `eof` without encountering any invalid UTF-8

    ///   or prematurely exhausting `outputBuffer`.

    /// * A positive value if `input` was not completely read due to

    ///   `outputBuffer` being filled (or nearly filled) without

    ///   encountering any invalid UTF-8.

    /// * A negative value from `ErrorStatus` if invalid UTF-8 was

    ///   encountered (without having written the invalid sequence to

    ///   `outputBuffer`).

    /// Return the number of bytes of valid UTF-8 written to `outputBuffer`.

    /// If no invalid UTF-8 is encountered, or if `input` supports

    /// `sputbackc` with a putback buffer capacity of at least 4 bytes,

    /// `input` will be left positioned at the end of the valid UTF-8 read,

    /// otherwise, `input` will be left in an unspecified state.  The

    /// behavior is undefined unless `4 <= outputBufferLength`.  Note that

    /// this function will stop reading `input` when less than 4 bytes of

    /// space remain in `outputBuffer` to prevent the possibility of a

    /// 4-byte UTF-8 sequence being truncated partway through.

    static size_type readIfValid(int            *status,

                                 char           *outputBuffer,

                                 size_type       outputBufferLength,

                                 bsl::streambuf *input);


    /// Return the non-modifiable string representation of the `ErrorStatus`

    /// enumerator matching the specified `value`, if it exists, and "(*

    /// unrecognized value *)" otherwise.  The string representation of an

    /// enumerator that matches `value` is the enumerator name with the "k_"

    /// prefix elided.  Note that this method may be used to aid in

    /// interpreting status values that are returned from some methods in

    /// this utility.  See `ErrorStatus`.

    static const char *toAscii(IntPtr value);

};


                          // =======================

                          // struct Utf8Util_ImpUtil

                          // =======================


/// [**PRIVATE**] This struct provides a namespace for static methods used to

/// implement `Utf8Util`.  Note that the functions are not typically useful

/// for clients, and are primarily exposed to allow for more thorough

/// testing.


struct Utf8Util_ImpUtil {


    // TYPES

    /// Alias for `bsls::Types::Uint64`, the type of the line number, column,

    /// and byte-offset values used by `getLineAndColumnNumber`.

    typedef bsls::Types::Uint64 Uint64;


    // CLASS METHODS


    /// For the specified `byteOffset` in the specified `input`, load the

    /// byte offset's line number into the specified `lineNumber`, the

    /// column number into the specified `utf8Column`, and the byte offset

    /// for the start of the line into the specified

    /// `startOfLineByteOffset`, using the specified `lineDelimeter` as the

    /// line separator, and using the specified `temporaryReadBuffer` (of

    /// the specified length `temporaryReadBufferNumBytes`) as a temporary

    /// buffer for reading.  Return 0 on success, or a non-zero value if

    /// `byteOffset` cannot be found in `input` or if `input` contains

    /// non-UTF-8 characters.  The `utf8Column` is the number of UTF-8 code

    /// points between `startOfLineByteOffset` and `byteOffset`.  The

    /// behavior is undefined unless `temporaryReadBuffer` refers to a valid

    /// buffer of at least `temporaryReadBufferNumBytes` bytes, and

    /// `temporaryReadBufferNumBytes` is greater than or equal to 4.

    static int getLineAndColumnNumber(

                                  Uint64         *lineNumber,

                                  Uint64         *utf8Column,

                                  Uint64         *startOfLineByteOffset,

                                  bsl::streambuf *input,

                                  Uint64          byteOffset,

                                  char            lineDelimeter,

                                  char           *temporaryReadBuffer,

                                  int             temporaryReadBufferNumBytes);

};


// ============================================================================

//                            INLINE DEFINITIONS

// ============================================================================


                              // ---------------

                              // struct Utf8Util

                              // ---------------


// CLASS METHODS

inline
Utf8Util::IntPtr Utf8Util::advanceIfValid(
                                       int                      *status,
                                       const char              **result,
                                       const bsl::string_view&   string,
                                       IntPtr                    numCodePoints)
{
    // Split the view into its start address and byte count, then defer to
    // the overload that takes those two pieces separately.
    const char      *begin    = string.data();
    const size_type  numBytes = string.length();

    return advanceIfValid(status, result, begin, numBytes, numCodePoints);
}


inline
Utf8Util::IntPtr Utf8Util::advanceRaw(const char              **result,
                                      const bsl::string_view&   string,
                                      IntPtr                    numCodePoints)
{
    // Forward the view's address and byte length to the overload taking an
    // explicit (pointer, length) pair.
    const char      *firstByte  = string.data();
    const size_type  byteLength = string.length();

    return advanceRaw(result, firstByte, byteLength, numCodePoints);
}


inline
int Utf8Util::appendUtf8Character(bsl::string  *output,
                                  unsigned int  codePoint)
{
    // Deprecated name: the work is done by `appendUtf8CodePoint`, whose
    // status is returned unchanged.
    const int rc = appendUtf8CodePoint(output, codePoint);
    return rc;
}


inline
int Utf8Util::getByteSize(const char *codePoint)
{
    // Deprecated name: simply reports what `numBytesInCodePoint` computes.
    const int encodedLength = numBytesInCodePoint(codePoint);
    return encodedLength;
}


inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset)
{
    // No delimiter was supplied, so lines are separated by a newline.
    const char newline = '\n';

    return getLineAndColumnNumber(lineNumber,
                                  utf8Column,
                                  startOfLineByteOffset,
                                  input,
                                  byteOffset,
                                  newline);
}


inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset,
                                     char            lineDelimeter)
{
    // Local scratch space handed to the implementation utility as its
    // temporary read buffer.
    enum { k_SCRATCH_NUM_BYTES = 2048 };
    char scratch[k_SCRATCH_NUM_BYTES];

    const int rc = Utf8Util_ImpUtil::getLineAndColumnNumber(
                                                        lineNumber,
                                                        utf8Column,
                                                        startOfLineByteOffset,
                                                        input,
                                                        byteOffset,
                                                        lineDelimeter,
                                                        scratch,
                                                        k_SCRATCH_NUM_BYTES);
    return rc;
}


inline
bool Utf8Util::isValid(const char *string)
{
    BSLS_ASSERT(string);

    // The overload called below also reports a position through its first
    // argument; this function does not expose it, so it goes to a throwaway
    // local.
    const char *ignoredPosition = 0;
    const bool  valid           = isValid(&ignoredPosition, string);
    return valid;
}


inline
bool Utf8Util::isValid(const char *string, size_type length)
{
    BSLS_ASSERT(string || 0 == length);

    // Only the yes/no answer is returned; the position output of the
    // three-argument overload is written to a local and dropped.
    const char *ignoredPosition = 0;
    const bool  valid           = isValid(&ignoredPosition, string, length);
    return valid;
}


inline
bool Utf8Util::isValid(const bsl::string_view& string)
{
    // Delegate to the overload that also reports a position, supplying a
    // local to receive (and discard) that output.
    const char *ignoredPosition = 0;
    const bool  valid           = isValid(&ignoredPosition, string);
    return valid;
}


inline
Utf8Util::IntPtr Utf8Util::numBytesIfValid(
                                         const bsl::string_view& string,
                                         IntPtr                  numCodePoints)
{
    // Deprecated name: forwards unchanged to `numBytesRaw`.
    const IntPtr numBytes = numBytesRaw(string, numCodePoints);
    return numBytes;
}


inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string)
{
    // Deprecated name: the count comes straight from `numCodePointsRaw`.
    const IntPtr count = numCodePointsRaw(string);
    return count;
}


inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string, size_type length)
{
    // Deprecated name: the count comes straight from the (pointer, length)
    // overload of `numCodePointsRaw`.
    const IntPtr count = numCodePointsRaw(string, length);
    return count;
}


inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string)
{
    // Deprecated name: both the result and the `invalidString` output are
    // produced by `numCodePointsIfValid`.
    const IntPtr result = numCodePointsIfValid(invalidString, string);
    return result;
}


inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string,
                                                size_type    length)
{
    // Deprecated name: both the result and the `invalidString` output are
    // produced by the length-taking overload of `numCodePointsIfValid`.
    const IntPtr result = numCodePointsIfValid(invalidString, string, length);
    return result;
}


inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string)
{
    // Deprecated name kept for existing callers; `numCodePointsRaw` does the
    // counting.
    return Utf8Util::numCodePointsRaw(string);
}


inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string,
                                            size_type   length)
{
    // Deprecated name kept for existing callers; the (pointer, length)
    // overload of `numCodePointsRaw` does the counting.
    return Utf8Util::numCodePointsRaw(string, length);
}


inline
Utf8Util::IntPtr Utf8Util::numCodePointsRaw(const bsl::string_view& string)
{
    // Count over the view's bytes using the (pointer, length) overload.
    const char      *firstByte  = string.data();
    const size_type  byteLength = string.length();

    return numCodePointsRaw(firstByte, byteLength);
}


}  // close package namespace


#endif


// ----------------------------------------------------------------------------

// Copyright 2015 Bloomberg Finance L.P.

//

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

//

//     http://www.apache.org/licenses/LICENSE-2.0

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

// ----------------------------- END-OF-FILE ----------------------------------


/** @} */

/** @} */

/** @} */

bsls_assert.h

bsls_ident.h

bsls_libraryfeatures.h

bsls_review.h

bsls_types.h

bsl::basic_string_view
Definition bslstl_stringview.h:441

bsl::basic_string
Definition bslstl_string.h:1281

BSLS_ASSERT
#define BSLS_ASSERT(X)
Definition bsls_assert.h:1804

BSLS_IDENT
#define BSLS_IDENT(str)
Definition bsls_ident.h:195

bdlde
Definition bdlde_base64alphabet.h:118

bdlde::Utf8Util_ImpUtil
Definition bdlde_utf8util.h:870

bdlde::Utf8Util_ImpUtil::getLineAndColumnNumber
static int getLineAndColumnNumber(Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset, char lineDelimeter, char *temporaryReadBuffer, int temporaryReadBufferNumBytes)

bdlde::Utf8Util_ImpUtil::Uint64
bsls::Types::Uint64 Uint64
Definition bdlde_utf8util.h:873

bdlde::Utf8Util
Definition bdlde_utf8util.h:404

bdlde::Utf8Util::readIfValid
static size_type readIfValid(int *status, char *outputBuffer, size_type outputBufferLength, bsl::streambuf *input)

bdlde::Utf8Util::numCodePointsIfValid
static IntPtr numCodePointsIfValid(const char **invalidString, const char *string)

bdlde::Utf8Util::advanceRaw
static IntPtr advanceRaw(const char **result, const char *string, size_type length, IntPtr numCodePoints)

bdlde::Utf8Util::Uint64
bsls::Types::Uint64 Uint64
Definition bdlde_utf8util.h:409

bdlde::Utf8Util::toAscii
static const char * toAscii(IntPtr value)

bdlde::Utf8Util::advanceIfValid
static IntPtr advanceIfValid(int *status, const char **result, const char *string, size_type length, IntPtr numCodePoints)

bdlde::Utf8Util::numCharactersRaw
static IntPtr numCharactersRaw(const char *string)
Definition bdlde_utf8util.h:1043

bdlde::Utf8Util::numBytesInCodePoint
static int numBytesInCodePoint(const char *codePoint)

bdlde::Utf8Util::isValid
static bool isValid(const char *string)
Definition bdlde_utf8util.h:983

bdlde::Utf8Util::codePointValue
static int codePointValue(const char *codePoint)

bdlde::Utf8Util::appendUtf8CodePoint
static int appendUtf8CodePoint(bsl::string *output, unsigned int codePoint)

bdlde::Utf8Util::numCodePointsRaw
static IntPtr numCodePointsRaw(const char *string, size_type length)

bdlde::Utf8Util::numCharactersIfValid
static IntPtr numCharactersIfValid(const char **invalidString, const char *string)
Definition bdlde_utf8util.h:1028

bdlde::Utf8Util::advanceRaw
static IntPtr advanceRaw(const char **result, const char *string, IntPtr numCodePoints)

bdlde::Utf8Util::isValid
static bool isValid(const char **invalidString, const char *string, size_type length)

bdlde::Utf8Util::numCodePointsRaw
static IntPtr numCodePointsRaw(const char *string)

bdlde::Utf8Util::IntPtr
bsls::Types::IntPtr IntPtr
Definition bdlde_utf8util.h:408

bdlde::Utf8Util::isValid
static bool isValid(const char **invalidString, const char *string)

bdlde::Utf8Util::isValidCodePoint
static bool isValidCodePoint(int *status, const char *codePoint, size_type numBytes)

bdlde::Utf8Util::appendUtf8Character
static int appendUtf8Character(bsl::string *output, unsigned int codePoint)
Definition bdlde_utf8util.h:935

bdlde::Utf8Util::numBytesIfValid
static IntPtr numBytesIfValid(const bsl::string_view &string, IntPtr numCodePoints)
Definition bdlde_utf8util.h:1008

bdlde::Utf8Util::isValid
static bool isValid(const char **invalidString, const bsl::string_view &string)

bdlde::Utf8Util::numBytesRaw
static IntPtr numBytesRaw(const bsl::string_view &string, IntPtr numCodePoints)

bdlde::Utf8Util::numCodePointsIfValid
static IntPtr numCodePointsIfValid(const char **invalidString, const bsl::string_view &string)

bdlde::Utf8Util::getLineAndColumnNumber
static int getLineAndColumnNumber(Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset)
Definition bdlde_utf8util.h:948

bdlde::Utf8Util::advanceIfValid
static IntPtr advanceIfValid(int *status, const char **result, const char *string, IntPtr numCodePoints)

bdlde::Utf8Util::getByteSize
static int getByteSize(const char *codePoint)
Definition bdlde_utf8util.h:942

bdlde::Utf8Util::numCodePointsIfValid
static IntPtr numCodePointsIfValid(const char **invalidString, const char *string, size_type length)

bdlde::Utf8Util::appendUtf8CodePoint
static int appendUtf8CodePoint(std::string *output, unsigned int codePoint)

bdlde::Utf8Util::ErrorStatus
ErrorStatus
Definition bdlde_utf8util.h:417

bdlde::Utf8Util::k_UNEXPECTED_CONTINUATION_OCTET
@ k_UNEXPECTED_CONTINUATION_OCTET
Definition bdlde_utf8util.h:423

bdlde::Utf8Util::k_NON_CONTINUATION_OCTET
@ k_NON_CONTINUATION_OCTET
Definition bdlde_utf8util.h:427

bdlde::Utf8Util::k_OVERLONG_ENCODING
@ k_OVERLONG_ENCODING
Definition bdlde_utf8util.h:431

bdlde::Utf8Util::k_SURROGATE
@ k_SURROGATE
Definition bdlde_utf8util.h:442

bdlde::Utf8Util::k_VALUE_LARGER_THAN_0X10FFFF
@ k_VALUE_LARGER_THAN_0X10FFFF
Definition bdlde_utf8util.h:439

bdlde::Utf8Util::k_END_OF_INPUT_TRUNCATION
@ k_END_OF_INPUT_TRUNCATION
Definition bdlde_utf8util.h:419

bdlde::Utf8Util::k_INVALID_INITIAL_OCTET
@ k_INVALID_INITIAL_OCTET
Definition bdlde_utf8util.h:435

bdlde::Utf8Util::numCharacters
static IntPtr numCharacters(const char *string)
Definition bdlde_utf8util.h:1016

bdlde::Utf8Util::size_type
bsls::Types::size_type size_type
Definition bdlde_utf8util.h:407

bsls::Types::size_type
std::size_t size_type
Definition bsls_types.h:124

bsls::Types::Uint64
unsigned long long Uint64
Definition bsls_types.h:137

bsls::Types::IntPtr
std::ptrdiff_t IntPtr
Definition bsls_types.h:130