// bdlde_utf8util.h                                                   -*-C++-*-

// ----------------------------------------------------------------------------
//                                   NOTICE
//
// This component is not up to date with current BDE coding standards, and
// should not be used as an example for new development.
// ----------------------------------------------------------------------------

#ifndef INCLUDED_BDLDE_UTF8UTIL
#define INCLUDED_BDLDE_UTF8UTIL

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide basic utilities for UTF-8 encodings.
//
//@CLASSES:
//  bdlde::Utf8Util: namespace for utilities for UTF-8 encodings
//
//@DESCRIPTION: This component provides, within the 'bdlde::Utf8Util' 'struct',
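// a suite of static functions supporting UTF-8 encoded strings.  Most
// functions that examine a string are provided in several forms: one where
// the length of the string (in *bytes*) is passed as a separate argument, one
// where the string is passed as a null-terminated C-style string, and, for
// some functions (e.g., 'advanceIfValid' and 'advanceRaw'), one where the
// string is passed as a 'bsl::string_view'.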
//
// A string is deemed to contain valid UTF-8 if it is compliant with RFC 3629,
// meaning that only 1-, 2-, 3-, and 4-byte sequences are allowed.  Values
// above 'U+10ffff' are also not allowed.
//
// Six types of functions are provided:
//
//: o 'isValid', which checks for validity, per RFC 3629, of a (candidate)
//:   UTF-8 string.  "Overlong values", that is, values encoded in more bytes
//:   than necessary, are not tolerated; nor are "surrogate values", which are
//:   values in the range '[U+d800 .. U+dfff]'.
//:
//: o 'advanceIfValid' and 'advanceRaw', which advance some number of Unicode
//:   code points, each of which may be encoded in multiple bytes in a UTF-8
//:   string.  'advanceRaw' assumes the string is valid UTF-8, while
//:   'advanceIfValid' checks the input for validity and stops advancing if a
//:   sequence is encountered that is not valid UTF-8.
//:
//: o 'numCodePointsIfValid' and 'numCodePointsRaw', which return the number of
//:   Unicode code points in a UTF-8 string.  Note that 'numCodePointsIfValid'
//:   both validates a (candidate) UTF-8 string and counts the number of
//:   Unicode code points that it contains.
//:
//: o 'numBytesIfValid', which returns the number of bytes a specified number
//:   of Unicode code points occupy in a UTF-8 string.
//:
//: o 'numBytesInCodePoint' (which supersedes the deprecated 'getByteSize'),
//:   which returns the length of a single UTF-8 encoded code point.
//:
//: o 'appendUtf8CodePoint' (which supersedes the deprecated
//:   'appendUtf8Character'), which appends a single Unicode code point to a
//:   UTF-8 string.
//
// Embedded null bytes are allowed in strings that are accompanied by an
// explicit length argument.  Naturally, null-terminated C-style strings cannot
// contain embedded null code points.
//
// The UTF-8 format is described in the RFC 3629 document at:
//..
//  http://tools.ietf.org/html/rfc3629
//..
// and in Wikipedia at:
//..
//  http://en.wikipedia.org/wiki/Utf-8
//..
//
///Empty Input Strings
///-------------------
// The utility functions provided by this component consider the empty string
// to be valid UTF-8.  For those functions that take input as a
// '(pointer, length)' pair, if '0 == pointer' and '0 == length', then the
// input is interpreted as a valid, empty string.  However, if '0 == pointer'
// and '0 != length', the behavior is undefined.  All such functions have a
// counterpart that takes a lone pointer to a null-terminated (C-style) string.
// The behavior is always undefined if 0 is supplied for that lone pointer.
//
///Usage
///-----
// In this section we show intended use of this component.
//
///Example 1: Validating Strings and Counting Unicode Code Points
/// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this usage example, we will encode some Unicode code points in UTF-8
// strings and demonstrate those that are valid and those that are not.
//
// First, we build an unquestionably valid UTF-8 string:
//..
//  bsl::string string;
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x856);
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xfff);
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, '.');
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');
//..
// Then, we check its validity and measure its length:
//..
//  assert(true == bdlde::Utf8Util::isValid(string.data(), string.length()));
//  assert(true == bdlde::Utf8Util::isValid(string.c_str()));
//
//  assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.data(),
//                                                   string.length()));
//  assert(   9 == bdlde::Utf8Util::numCodePointsRaw(string.c_str()));
//..
// Next, we append a lone surrogate value, '0xd8ab', which we encode by hand
// as the raw 3-byte sequence "\xed\xa2\xab" to avoid validation:
//..
//  bsl::string stringWithSurrogate = string + "\xed\xa2\xab";
//
//  assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.data(),
//                                           stringWithSurrogate.length()));
//  assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.c_str()));
//..
// Then, we cannot use 'numCodePointsRaw' to count the code points in
// 'stringWithSurrogate', since the behavior of that method is undefined unless
// the string is valid.  Instead, the 'numCodePointsIfValid' method can be used
// on strings whose validity we are uncertain of:
//..
//  const char *invalidPosition = 0;
//
//  bsls::Types::IntPtr rc;
//  rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                             stringWithSurrogate.data(),
//                                             stringWithSurrogate.length());
//  assert(rc < 0);
//  assert(bdlde::Utf8Util::k_SURROGATE == rc);
//  assert(invalidPosition == stringWithSurrogate.data() + string.length());
//
//  invalidPosition = 0;  // reset
//
//  rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                             stringWithSurrogate.c_str());
//  assert(rc < 0);
//  assert(bdlde::Utf8Util::k_SURROGATE == rc);
//  assert(invalidPosition == stringWithSurrogate.data() + string.length());
//..
// Now, we encode 0, which is allowed.  However, note that we cannot use any
// interfaces that take a null-terminated string for this case:
//..
//  bsl::string stringWithNull = string;
//  stringWithNull += '\0';
//
//  assert(true == bdlde::Utf8Util::isValid(stringWithNull.data(),
//                                          stringWithNull.length()));
//
//  assert(  10 == bdlde::Utf8Util::numCodePointsRaw(stringWithNull.data(),
//                                                   stringWithNull.length()));
//..
// Finally, we encode '0x3a' (':') as an overlong value using 2 bytes, which is
// not valid UTF-8 (since ':' can be "encoded" in 1 byte):
//..
//  bsl::string stringWithOverlong = string;
//  stringWithOverlong += static_cast<char>(0xc0);        // start of 2-byte
//                                                        // sequence
//  stringWithOverlong += static_cast<char>(0x80 | ':');  // continuation byte
//
//  assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.data(),
//                                           stringWithOverlong.length()));
//  assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.c_str()));
//
//  rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                             stringWithOverlong.data(),
//                                             stringWithOverlong.length());
//  assert(rc < 0);
//  assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
//  assert(invalidPosition == stringWithOverlong.data() + string.length());
//
//  rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
//                                             stringWithOverlong.c_str());
//  assert(rc < 0);
//  assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
//  assert(invalidPosition == stringWithOverlong.data() + string.length());
//..
//
///Example 2: Advancing Over a Given Number of Code Points
///- - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this example, we will use the various 'advance' functions to advance
// through a UTF-8 string.
//
// First, build the string using 'appendUtf8CodePoint', keeping track of how
// many bytes are in each Unicode code point:
//..
//  bsl::string string;
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);        // 3 bytes
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1ff);         // 2 bytes
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');           // 1 byte
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);      // 4 bytes
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);       // 4 bytes
//  string += "\xe3\x8f\xfe";           // 3 bytes (invalid 3-byte sequence,
//                                      // the first 2 bytes are valid but the
//                                      // last continuation byte is invalid)
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');           // 1 byte
//  bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');          // 1 byte
//..
// Then, declare a few variables we'll need:
//..
//  bsls::Types::IntPtr  rc;
//  int                  status;
//  const char          *result;
//  const char *const start = string.c_str();
//..
// Next, try advancing 2 code points, then 3, then 4, observing that the value
// returned is the number of Unicode code points advanced.  Note that since
// we're only advancing over valid UTF-8, we can use either 'advanceRaw' or
// 'advanceIfValid':
//..
//  rc = bdlde::Utf8Util::advanceRaw(              &result, start, 2);
//  assert(2 == rc);
//  assert(3 + 2 == result - start);
//
//  rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 2);
//  assert(0 == status);
//  assert(2 == rc);
//  assert(3 + 2 == result - start);
//
//  rc = bdlde::Utf8Util::advanceRaw(             &result, start, 3);
//  assert(3 == rc);
//  assert(3 + 2 + 1 == result - start);
//
//  rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 3);
//  assert(0 == status);
//  assert(3 == rc);
//  assert(3 + 2 + 1 == result - start);
//
//  rc = bdlde::Utf8Util::advanceRaw(             &result, start, 4);
//  assert(4 == rc);
//  assert(3 + 2 + 1 + 4 == result - start);
//
//  rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 4);
//  assert(0 == status);
//  assert(4 == rc);
//  assert(3 + 2 + 1 + 4 == result - start);
//..
// Then, try advancing by more code points than are present using
// 'advanceIfValid', and wind up stopping when we encounter invalid input.  The
// behavior of 'advanceRaw' is undefined if it is used on invalid input, so we
// cannot use it here.  Also note that we will stop at the beginning of the
// invalid Unicode code point, and not at the first incorrect byte, which is
// two bytes later:
//..
//  rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
//  assert(0 != status);
//  assert(5 == rc);
//  assert(3 + 2 + 1 + 4 + 4                 == result - start);
//  assert(static_cast<int>(string.length()) >  result - start);
//..
// Now, doctor the string to replace the invalid code point with a valid one,
// so the string is entirely correct UTF-8:
//..
//  string[3 + 2 + 1 + 4 + 4 + 2] = static_cast<char>(0x8a);
//..
// Finally, advance using both functions by more code points than are in the
// string and in both cases wind up at the end of the string.  Note that
// 'advanceIfValid' does not return an error (non-zero) value to 'status' when
// it encounters the end of the string:
//..
//  rc = bdlde::Utf8Util::advanceRaw(             &result, start, INT_MAX);
//  assert(8 == rc);
//  assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);
//  assert(static_cast<int>(string.length()) == result - start);
//
//  rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
//  assert(0 == status);
//  assert(8 == rc);
//  assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1     == result - start);
//  assert(static_cast<int>(string.length()) == result - start);
//..
//
///Example 3: Validating UTF-8 Read from a 'bsl::streambuf'
/// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// In this usage example, we will demonstrate reading and validating UTF-8
// from a stream.
//
// We write a function to read valid UTF-8 to a 'bsl::string'.  We don't know
// how long the input will be, so we don't know how long to make the string
// before we start.  We will grow the string in small, 32-byte increments.
//..
//  int utf8StreambufToString(bsl::string    *output,
//                            bsl::streambuf *sb)
//      // Read valid UTF-8 from the specified streambuf 'sb' to the specified
//      // 'output'.  Return 0 if the input was exhausted without encountering
//      // any invalid UTF-8, and a non-zero value otherwise.  If invalid UTF-8
//      // is encountered, log a message describing the problem after loading
//      // all the valid UTF-8 preceding it into 'output'.  Note that after the
//      // call, in no case will 'output' contain any invalid UTF-8.
//  {
//      enum { k_READ_LENGTH = 32 };
//
//      output->clear();
//      while (true) {
//          bsl::size_t len = output->length();
//          output->resize(len + k_READ_LENGTH);
//          int status;
//          bsls::Types::IntPtr numBytes = bdlde::Utf8Util::readIfValid(
//                                                             &status,
//                                                             &(*output)[len],
//                                                             k_READ_LENGTH,
//                                                             sb);
//          BSLS_ASSERT(0 <= numBytes);
//          BSLS_ASSERT(numBytes <= k_READ_LENGTH);
//
//          output->resize(len + numBytes);
//          if (0 < status) {
//              // Buffer was full before the end of input was encountered.
//              // Note that 'numBytes' may be up to 3 bytes less than
//              // 'k_READ_LENGTH'.
//
//              BSLS_ASSERT(k_READ_LENGTH - 4 < numBytes);
//
//              // Go on to grow the string and get more input.
//
//              continue;
//          }
//          else if (0 == status) {
//              // Success!  We've reached the end of input without
//              // encountering any invalid UTF-8.
//
//              return 0;                                             // RETURN
//          }
//          else {
//              // Invalid UTF-8 encountered; the value of 'status' indicates
//              // the exact nature of the problem.  'numBytes' returned from
//              // the above call indicated the number of valid UTF-8 bytes
//              // read before encountering the invalid UTF-8.
//
//              BSLS_LOG_ERROR("Invalid UTF-8 error %s at position %u.\n",
//                             bdlde::Utf8Util::toAscii(status),
//                             static_cast<unsigned>(output->length()));
//
//              return -1;                                            // RETURN
//          }
//      }
//  }
//..

#include <bdlscm_version.h>

#include <bsls_assert.h>
#include <bsls_review.h>
#include <bsls_types.h>

#include <bsl_cstddef.h>
#include <bsl_iosfwd.h>
#include <bsl_string.h>

#include <bsls_libraryfeatures.h>

#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
#include <memory_resource>
#endif
#include <string>            // 'std::string', 'std::pmr::string'
#include <bsl_streambuf.h>

namespace BloombergLP {

namespace bdlde {
                              // ===============
                              // struct Utf8Util
                              // ===============

struct Utf8Util {
    // This struct provides a namespace for static methods used for validating
    // UTF-8 strings, for counting the number of Unicode code points in them,
    // for advancing pointers through UTF-8 strings by a specified number of
    // Unicode code points, for counting the number of bytes a UTF-8 leading
    // substring occupies, for counting the number of bytes in a UTF-8
    // character, and for appending a Unicode character to a UTF-8 string.

    // PUBLIC TYPES
    typedef bsls::Types::size_type size_type;
        // Type used by this utility for string lengths expressed in bytes
        // (e.g., the 'length' argument of 'advanceIfValid').

    typedef bsls::Types::IntPtr    IntPtr;
        // Type used by this utility for counts of Unicode code points, both
        // as arguments (e.g., 'numCodePoints') and as return values.

    typedef bsls::Types::Uint64    Uint64;
        // Type used by this utility for line numbers, column numbers, and
        // byte offsets (see 'getLineAndColumnNumber').

    enum ErrorStatus {
        // Status codes describing why a byte sequence was rejected as invalid
        // UTF-8.  Methods of this utility report these codes either as their
        // return value or (possibly) through an out parameter.  Because some
        // methods of this 'struct' return a non-negative value on success and
        // one of these codes on failure, every enumerator must be negative so
        // that it can never be mistaken for a "success" value.

        k_END_OF_INPUT_TRUNCATION = -1,
            // The input ended partway through a multibyte UTF-8 sequence.

        k_UNEXPECTED_CONTINUATION_OCTET = -2,
            // A continuation byte appeared while not within a multibyte
            // sequence.

        k_NON_CONTINUATION_OCTET = -3,
            // A continuation byte was expected, but a non-continuation byte
            // was found instead.

        k_OVERLONG_ENCODING = -4,
            // The Unicode value was encoded using more bytes than the
            // shortest sequence capable of representing it.

        k_INVALID_INITIAL_OCTET = -5,
            // The first octet of a sequence had all of its 5 highest-order
            // bits set; such an octet is always invalid in UTF-8.

        k_VALUE_LARGER_THAN_0X10FFFF = -6,
            // The encoded value exceeds 0x10FFFF.

        k_SURROGATE = -7
            // A Unicode code point reserved for surrogate values in UTF-16
            // was encountered.  Note that all surrogate values are illegal as
            // Unicode code points.
    };

    // CLASS METHODS
    static IntPtr advanceIfValid(int         *status,
                                 const char **result,
                                 const char  *string,
                                 IntPtr       numCodePoints);
        // Advance past 0 or more consecutive *valid* Unicode code points at
        // the beginning of the specified null-terminated 'string', stopping
        // when the specified 'numCodePoints' code points have been
        // traversed, or when the terminating null byte or invalid UTF-8 is
        // encountered (whichever occurs first), and return the number of
        // Unicode code points traversed.  Set the specified '*status' to 0
        // if no invalid UTF-8 is encountered, and to a value from the
        // 'ErrorStatus' 'enum' otherwise.  Set the specified '*result' to
        // the address of the byte immediately following the last valid code
        // point traversed, or to 'string' if 'string' is empty or
        // 'numCodePoints' is 0.  'string' is necessarily null-terminated, so
        // it cannot contain embedded null bytes.  The behavior is undefined
        // unless '0 <= numCodePoints'.  Note that the value returned will be
        // in the range '[0 .. numCodePoints]'.  Also note that 'string' may
        // contain fewer than 'bsl::strlen(string)' Unicode code points.
    static IntPtr advanceIfValid(int         *status,
                                 const char **result,
                                 const char  *string,
                                 size_type    length,
                                 IntPtr       numCodePoints);
        // Advance past 0 or more consecutive *valid* Unicode code points at
        // the beginning of the specified 'string' having the specified
        // 'length' (in bytes), until either the specified 'numCodePoints'
        // code points or all 'length' bytes have been traversed, or invalid
        // UTF-8 is encountered (whichever occurs first), and return the
        // number of Unicode code points traversed.  Set the specified
        // '*status' to 0 if no invalid UTF-8 is encountered, and to a value
        // from the 'ErrorStatus' 'enum' otherwise.  Set the specified
        // '*result' to the address of the byte immediately following the
        // last valid code point traversed, or to 'string' if 'length' or
        // 'numCodePoints' is 0.  'string' need not be null-terminated and
        // can contain embedded null bytes, and 'string' may be null if
        // '0 == length' (see {Empty Input Strings}).  The behavior is
        // undefined unless '0 <= numCodePoints'.  Note that the value
        // returned will be in the range '[0 .. numCodePoints]'.  Also note
        // that 'string' may contain fewer than 'length' Unicode code points.

    static IntPtr advanceIfValid(int                      *status,
                                 const char              **result,
                                 const bsl::string_view&   string,
                                 IntPtr                    numCodePoints);
        // Advance past 0 or more consecutive *valid* Unicode code points at
        // the beginning of the specified 'string', until either the
        // specified 'numCodePoints' code points or the whole 'string' have
        // been traversed, or invalid UTF-8 is encountered (whichever occurs
        // first), and return the number of Unicode code points traversed.
        // Set the specified '*status' to 0 if no invalid UTF-8 is
        // encountered, and to a value from the 'ErrorStatus' 'enum'
        // otherwise.  Set the specified '*result' to the address of the
        // byte immediately following the last valid code point traversed, or
        // to the beginning of 'string' if its length or 'numCodePoints' is
        // 0.  'string' need not be null-terminated and can contain embedded
        // null bytes.  The behavior is undefined unless
        // '0 <= numCodePoints'.  Note that the value returned will be in the
        // range '[0 .. numCodePoints]'.  Also note that 'string' may contain
        // fewer than 'string.length()' Unicode code points.

    static IntPtr advanceRaw(const char **result,
                             const char  *string,
                             IntPtr       numCodePoints);
        // Advance past 0 or more consecutive Unicode code points at the
        // beginning of the specified null-terminated 'string', until either
        // the specified 'numCodePoints' code points have been traversed or
        // the terminating null byte is encountered (whichever occurs first),
        // and return the number of Unicode code points traversed.  Set the
        // specified '*result' to the address of the byte immediately
        // following the last code point traversed, or to 'string' if
        // 'string' is empty or 'numCodePoints' is 0.  'string' is
        // necessarily null-terminated, so it cannot contain embedded null
        // bytes.  The behavior is undefined unless 'string' contains valid
        // UTF-8 and '0 <= numCodePoints'.  Note that the value returned will
        // be in the range '[0 .. numCodePoints]'.  Also note that 'string'
        // may contain fewer than 'bsl::strlen(string)' Unicode code points.

    static IntPtr advanceRaw(const char **result,
                             const char  *string,
                             size_type    length,
                             IntPtr       numCodePoints);
        // Advance past 0 or more consecutive Unicode code points at the
        // beginning of the specified 'string' having the specified 'length'
        // (in bytes), until either the specified 'numCodePoints' code
        // points or all 'length' bytes have been traversed (whichever
        // occurs first), and return the number of Unicode code points
        // traversed.  Set the specified '*result' to the address of the
        // byte immediately following the last code point traversed, or to
        // 'string' if 'length' or 'numCodePoints' is 0.  'string' need not
        // be null-terminated and can contain embedded null bytes, and
        // 'string' may be null if '0 == length' (see {Empty Input
        // Strings}).  The behavior is undefined unless the initial 'length'
        // bytes of 'string' contain valid UTF-8 and '0 <= numCodePoints'.
        // Note that the value returned will be in the range
        // '[0 .. numCodePoints]'.  Also note that 'string' may contain
        // fewer than 'length' Unicode code points.

    static IntPtr advanceRaw(const char              **result,
                             const bsl::string_view&   string,
                             IntPtr                    numCodePoints);
        // Advance past 0 or more consecutive Unicode code points at the
        // beginning of the specified 'string', until either the specified
        // 'numCodePoints' code points or the whole 'string' have been
        // traversed (whichever occurs first), and return the number of
        // Unicode code points traversed.  Set the specified '*result' to
        // the address of the byte immediately following the last code point
        // traversed, or to the beginning of 'string' if its length or
        // 'numCodePoints' is 0.  'string' need not be null-terminated and
        // can contain embedded null bytes.  The behavior is undefined
        // unless 'string' contains only valid UTF-8 and
        // '0 <= numCodePoints'.  Note that the value returned will be in the
        // range '[0 .. numCodePoints]'.  Also note that 'string' may contain
        // fewer than 'string.length()' Unicode code points.

    static int appendUtf8Character(bsl::string  *output,
                                   unsigned int  codePoint);
        // !DEPRECATED!: Use 'appendUtf8CodePoint' instead.
        //
        // Append the UTF-8 encoding of the specified Unicode 'codePoint' to
        // the specified 'output' string.  Return 0 on success, and a non-zero
        // value otherwise.  Note that this function has the same parameters
        // and documented contract as the 'bsl::string' overload of
        // 'appendUtf8CodePoint'.

    static int appendUtf8CodePoint(bsl::string  *output,
                                   unsigned int  codePoint);
    static int appendUtf8CodePoint(std::string  *output,
                                   unsigned int  codePoint);
#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
    static int appendUtf8CodePoint(std::pmr::string  *output,
                                   unsigned int       codePoint);
#endif
        // Append the UTF-8 encoding of the specified Unicode 'codePoint' to
        // the specified 'output' string.  Return 0 on success, and a non-zero
        // value otherwise.  Note that this contract applies to every
        // overload above: those taking 'bsl::string' and 'std::string', and
        // the one taking 'std::pmr::string', which is declared only if
        // 'BSLS_LIBRARYFEATURES_HAS_CPP17_PMR' is defined.

    static int getByteSize(const char *codePoint);
        // !DEPRECATED!: Use 'numBytesInCodePoint' instead.
        //
        // Return the length (in bytes) of the UTF-8-encoded code point
        // beginning at the specified 'codePoint'.  The behavior is undefined
        // unless 'codePoint' addresses a code point of valid UTF-8.  Note
        // that the value returned will be in the range '[1 .. 4]'.  Also note
        // that 1 is returned if '0 == *codePoint' since '\0' is a valid 1-byte
        // encoding.

    static int numBytesInCodePoint(const char *codePoint);
        // Return the length (in bytes) of the UTF-8-encoded code point
        // beginning at the specified 'codePoint'.  The behavior is undefined
        // unless 'codePoint' addresses a code point of valid UTF-8.  Note
        // that the value returned will be in the range '[1 .. 4]'.  Also note
        // that 1 is returned if '0 == *codePoint' since '\0' is a valid 1-byte
        // encoding.

    static int getLineAndColumnNumber(Uint64         *lineNumber,
                                      Uint64         *utf8Column,
                                      Uint64         *startOfLineByteOffset,
                                      bsl::streambuf *input,
                                      Uint64          byteOffset);
    static int getLineAndColumnNumber(Uint64         *lineNumber,
                                      Uint64         *utf8Column,
                                      Uint64         *startOfLineByteOffset,
                                      bsl::streambuf *input,
                                      Uint64          byteOffset,
                                      char            lineDelimeter);
        // For the specified 'byteOffset' in the specified 'input', load the
        // offset's line number into the specified 'lineNumber', the column
        // number into the specified 'utf8Column', and the byte offset for the
        // start of the line into the specified 'startOfLineByteOffset'.
        // Optionally specify 'lineDelimeter', used to determine the line
        // separator.  If 'lineDelimeter' is not supplied, lines are delimited
        // using '\n'.  Return 0 on success, or a non-zero value if
        // 'byteOffset' cannot be found in 'input' or if 'input' contains
        // non-UTF-8 characters.  The 'utf8Column' is the number of UTF-8 code
        // points between 'startOfLineByteOffset' and 'byteOffset'.

    static bool isValid(const char *string);
        // Return 'true' if the specified 'string' contains valid UTF-8, and
        // 'false' otherwise.  'string' is necessarily null-terminated, so it
        // cannot contain embedded null bytes.

    static bool isValid(const char *string, size_type length);
        // Return 'true' if the specified 'string' having the specified
        // 'length' (in bytes) contains valid UTF-8, and 'false' otherwise.
        // 'string' need not be null-terminated and can contain embedded null
        // bytes, and 'string' may be null if '0 == length' (see {Empty Input
        // Strings}).

    static bool isValid(const bsl::string_view& string);
        // Return 'true' if the specified 'string' contains valid UTF-8, and
        // 'false' otherwise.  'string' need not be null-terminated and can
        // contain embedded null bytes.

    static bool isValid(const char **invalidString, const char *string);
        // Return 'true' if the specified 'string' contains valid UTF-8, and
        // 'false' otherwise.  If 'string' contains invalid UTF-8, load into
        // the specified 'invalidString' the address of the beginning of the
        // first invalid UTF-8 sequence encountered; 'invalidString' is
        // unaffected if 'string' contains only valid UTF-8.  'string' is
        // necessarily null-terminated, so it cannot contain embedded null
        // bytes.

    static bool isValid(const char **invalidString,
                        const char  *string,
                        size_type    length);
        // Return 'true' if the specified 'string' having the specified
        // 'length' (in bytes) contains valid UTF-8, and 'false' otherwise.  If
        // 'string' contains invalid UTF-8, load into the specified
        // 'invalidString' the address of the byte after the last valid code
        // point traversed; 'invalidString' is unaffected if 'string' contains
        // only valid UTF-8.  'string' need not be null-terminated and can
        // contain embedded null bytes, and 'string' may be null if
        // '0 == length' (see {Empty Input Strings}).

    static bool isValid(const char              **invalidString,
                        const bsl::string_view&   string);
        // Return 'true' if the specified 'string' contains only valid UTF-8
        // characters, and 'false' otherwise.  If 'string' contains invalid
        // UTF-8, load into the specified 'invalidString' the address of the
        // byte after the last valid code point traversed; 'invalidString' is
        // unaffected if 'string' contains only valid UTF-8.  'string' need not
        // be null-terminated and can contain embedded null bytes.

    static bool isValidCodePoint(int        *status,
                                 const char *codePoint,
                                 size_type   numBytes);
        // If the specified 'codePoint' (having at least the specified
        // 'numBytes') refers to a valid UTF-8 code point then return 'true'
        // and load the specified 'status' with the number of bytes in the
        // code-point; otherwise, if 'codePoint' is not a valid code-point,
        // return 'false' and load 'status' with one of the (negative)
        // 'ErrorStatus' constants.  The behavior is undefined unless
        // 'numBytes > 0'.

    static IntPtr numBytesIfValid(const bsl::string_view& string,
                                  IntPtr                  numCodePoints);
        // !DEPRECATED!: Use 'numBytesRaw' instead.
        //
        // Return the length (in bytes) of the specified 'numCodePoints' UTF-8
        // encodings in the specified 'string', or a value less than 0 if
        // 'string' contains less than 'numCodePoints' encodings.  The behavior
        // is undefined unless 'string' refers to valid UTF-8.  Note that
        // 'string' may contain more than 'numCodePoints' encodings in which
        // case the trailing ones are ignored.

    static IntPtr numBytesRaw(const bsl::string_view& string,
                              IntPtr                  numCodePoints);
        // Return the length (in bytes) of the specified 'numCodePoints' UTF-8
        // encodings in the specified 'string', or a value less than 0 if
        // 'string' contains less than 'numCodePoints' encodings.  The behavior
        // is undefined unless 'string' refers to valid UTF-8.  Note that
        // 'string' may contain more than 'numCodePoints' encodings in which
        // case the trailing ones are ignored.

    static IntPtr numCharacters(const char *string);
        // !DEPRECATED!: Use 'numCodePointsRaw' instead.
        //
        // Return the number of Unicode code points in the specified 'string'.
        // 'string' is necessarily null-terminated, so it cannot contain
        // embedded null bytes.  The behavior is undefined unless 'string'
        // contains valid UTF-8.  Note that 'string' may contain less than
        // 'bsl::strlen(string)' Unicode code points.

    static IntPtr numCharacters(const char *string, size_type length);
        // !DEPRECATED!: Use 'numCodePointsRaw' instead.
        //
        // Return the number of Unicode code points in the specified 'string'
        // having the specified 'length' (in bytes).  'string' need not be
        // null-terminated and can contain embedded null bytes, and 'string'
        // may be null if '0 == length' (see {Empty Input Strings}).  The
        // behavior is undefined unless 'string' contains valid UTF-8.  Note
        // that 'string' may contain less than 'length' Unicode code points.

    static IntPtr numCharactersIfValid(const char **invalidString,
                                       const char  *string);
        // !DEPRECATED!: Use 'numCodePointsIfValid' instead.
        //
        // Return the number of Unicode code points in the specified 'string'
        // if it contains valid UTF-8, with no effect on the specified
        // 'invalidString'.  Otherwise, return a negative value and load into
        // 'invalidString' the address of the byte after the last valid Unicode
        // code point traversed.  'string' is necessarily null-terminated, so
        // it cannot contain embedded null bytes.  Note that 'string' may
        // contain less than 'bsl::strlen(string)' Unicode code points.

    static IntPtr numCharactersIfValid(const char **invalidString,
                                       const char  *string,
                                       size_type    length);
        // !DEPRECATED!: Use 'numCodePointsIfValid' instead.
        //
        // Return the number of Unicode code points in the specified 'string'
        // having the specified 'length' (in bytes) if 'string' contains valid
        // UTF-8, with no effect on the specified 'invalidString'.  Otherwise,
        // return a negative value and load into 'invalidString' the address of
        // the byte after the last valid Unicode code point traversed.
        // 'string' need not be null-terminated and may contain embedded null
        // bytes, and 'string' may be null if '0 == length' (see {Empty Input
        // Strings}).  Note that 'string' may contain less than 'length'
        // Unicode code points.

    static IntPtr numCharactersRaw(const char *string);
        // !DEPRECATED!: Use 'numCodePointsRaw' instead.
        //
        // Return the number of Unicode code points in the specified 'string'.
        // 'string' is necessarily null-terminated, so it cannot contain
        // embedded null bytes.  The behavior is undefined unless 'string'
        // contains valid UTF-8.  Note that 'string' may contain less than
        // 'bsl::strlen(string)' Unicode code points.

    static IntPtr numCharactersRaw(const char *string, size_type length);
        // !DEPRECATED!: Use 'numCodePointsRaw' instead.
        //
        // Return the number of Unicode code points in the specified 'string'
        // having the specified 'length' (in bytes).  'string' need not be
        // null-terminated and can contain embedded null bytes, and 'string'
        // may be null if '0 == length' (see {Empty Input Strings}).  The
        // behavior is undefined unless 'string' contains valid UTF-8.  Note
        // that 'string' may contain less than 'length' Unicode code points.

    static IntPtr numCodePointsIfValid(const char **invalidString,
                                       const char  *string);
        // Return the number of Unicode code points in the specified 'string'
        // if it contains valid UTF-8, with no effect on the specified
        // 'invalidString'.  Otherwise, return a value from the 'ErrorStatus'
        // 'enum' (which are all negative) and load into 'invalidString' the
        // address of the byte after the last valid Unicode code point
        // traversed.  'string' is necessarily null-terminated, so it cannot
        // contain embedded null bytes.  Note that 'string' may contain less
        // than 'bsl::strlen(string)' Unicode code points.

    static IntPtr numCodePointsIfValid(const char **invalidString,
                                       const char  *string,
                                       size_type    length);
        // Return the number of Unicode code points in the specified 'string'
        // having the specified 'length' (in bytes) if 'string' contains valid
        // UTF-8, with no effect on the specified 'invalidString'.  Otherwise,
        // return a value from the 'ErrorStatus' 'enum' (which are all
        // negative) and load into 'invalidString' the address of the byte
        // after the last valid Unicode code point traversed.  'string' need
        // not be null-terminated and may contain embedded null bytes, and
        // 'string' may be null if '0 == length' (see {Empty Input Strings}).
        // Note that 'string' may contain less than 'length' Unicode code
        // points.

    static IntPtr numCodePointsIfValid(const char              **invalidString,
                                       const bsl::string_view&   string);
        // Return the number of Unicode code points in the specified 'string'
        // if 'string' contains valid UTF-8, with no effect on the specified
        // 'invalidString'.  Otherwise, return a value from the 'ErrorStatus'
        // 'enum' (which are all negative) and load into 'invalidString' the
        // address of the byte after the last valid Unicode code point
        // traversed.  'string' need not be null-terminated and may contain
        // embedded null bytes.

    static IntPtr numCodePointsRaw(const char *string);
        // Return the number of Unicode code points in the specified 'string'.
        // 'string' is necessarily null-terminated, so it cannot contain
        // embedded null bytes.  The behavior is undefined unless 'string'
        // contains valid UTF-8.  Note that 'string' may contain less than
        // 'bsl::strlen(string)' Unicode code points.

    static IntPtr numCodePointsRaw(const char *string, size_type length);
        // Return the number of Unicode code points in the specified 'string'
        // having the specified 'length' (in bytes).  'string' need not be
        // null-terminated and can contain embedded null bytes, and 'string'
        // may be null if '0 == length' (see {Empty Input Strings}).  The
        // behavior is undefined unless 'string' contains valid UTF-8.  Note
        // that 'string' may contain less than 'length' Unicode code points.

    static IntPtr numCodePointsRaw(const bsl::string_view& string);
        // Return the number of Unicode code points in the specified 'string'.
        // 'string' need not be null-terminated and can contain embedded null
        // bytes.  The behavior is undefined unless 'string' contains valid
        // UTF-8.

    static size_type readIfValid(int            *status,
                                 char           *outputBuffer,
                                 size_type       outputBufferLength,
                                 bsl::streambuf *input);
        // Read from the specified 'input' and copy *valid* UTF-8 (only) to the
        // specified 'outputBuffer' having the specified 'outputBufferLength'
        // (in bytes).  Load the specified 'status' with:
        //: o 0 if 'input' reached 'eof' without encountering any invalid UTF-8
        //:   or prematurely exhausting 'outputBuffer'.
        //:
        //: o A positive value if 'input' was not completely read due to
        //:   'outputBuffer' being filled (or nearly filled) without
        //:   encountering any invalid UTF-8.
        //:
        //: o A negative value from 'ErrorStatus' if invalid UTF-8 was
        //:   encountered (without having written the invalid sequence to
        //:   'outputBuffer').
        // Return the number of bytes of valid UTF-8 written to 'outputBuffer'.
        // If no invalid UTF-8 is encountered, or if 'input' supports
        // 'sputbackc' with a putback buffer capacity of at least 4 bytes,
        // 'input' will be left positioned at the end of the valid UTF-8 read,
        // otherwise, 'input' will be left in an unspecified state.  The
        // behavior is undefined unless '4 <= outputBufferLength'.  Note that
        // this function will stop reading 'input' when less than 4 bytes of
        // space remain in 'outputBuffer' to prevent the possibility of a
        // 4-byte UTF-8 sequence being truncated partway through.

    static const char *toAscii(IntPtr value);
        // Return the non-modifiable string representation of the 'ErrorStatus'
        // enumerator matching the specified 'value', if it exists, and "(*
        // unrecognized value *)" otherwise.  The string representation of an
        // enumerator that matches 'value' is the enumerator name with the "k_"
        // prefix elided.  Note that this method may be used to aid in
        // interpreting status values that are returned from some methods in
        // this utility.  See 'ErrorStatus'.
};

                          // =======================
                          // struct Utf8Util_ImpUtil
                          // =======================

struct Utf8Util_ImpUtil {
    // [!PRIVATE!] This struct provides a namespace for static methods used to
    // implement 'Utf8Util'.  Note that the functions are not typically useful
    // for clients, and are primarily exposed to allow for more thorough
    // testing.

    // TYPES
    typedef bsls::Types::Uint64 Uint64;

    // CLASS METHODS
    static int getLineAndColumnNumber(
                                  Uint64         *lineNumber,
                                  Uint64         *utf8Column,
                                  Uint64         *startOfLineByteOffset,
                                  bsl::streambuf *input,
                                  Uint64          byteOffset,
                                  char            lineDelimeter,
                                  char           *temporaryReadBuffer,
                                  int             temporaryReadBufferNumBytes);
        // For the specified 'byteOffset' in the specified 'input', load the
        // byte offset's line number into the specified 'lineNumber', the
        // column number into the specified 'utf8Column', and the byte offset
        // for the start of the line into the specified
        // 'startOfLineByteOffset', using the specified 'lineDelimeter' as the
        // line separator, and using the specified 'temporaryReadBuffer' (of
        // the specified length 'temporaryReadBufferNumBytes') as a temporary
        // buffer for reading.  Return 0 on success, or a non-zero value if
        // 'byteOffset' cannot be found in 'input' or if 'input' contains
        // non-UTF-8 characters.  The 'utf8Column' is the number of UTF-8 code
        // points between 'startOfLineByteOffset' and 'byteOffset'.  The
        // behavior is undefined unless 'temporaryReadBuffer' refers to a valid
        // buffer of at least 'temporaryReadBufferNumBytes' bytes, and
        // 'temporaryReadBufferNumBytes' is greater than or equal to 4.  Note
        // that the six-argument 'Utf8Util::getLineAndColumnNumber' forwards
        // to this function, supplying a 2048-byte buffer with automatic
        // storage duration.
};

// ============================================================================
//                            INLINE DEFINITIONS
// ============================================================================

                              // ---------------
                              // struct Utf8Util
                              // ---------------

// CLASS METHODS
inline
Utf8Util::IntPtr Utf8Util::advanceIfValid(
                                       int                      *status,
                                       const char              **result,
                                       const bsl::string_view&   string,
                                       IntPtr                    numCodePoints)
{
    // Unpack the view into its first byte and byte count, then defer to the
    // '(pointer, length)' overload, which does the actual traversal.

    const char *const firstByte = string.data();

    return advanceIfValid(status,
                          result,
                          firstByte,
                          string.length(),
                          numCodePoints);
}

inline
Utf8Util::IntPtr Utf8Util::advanceRaw(const char              **result,
                                      const bsl::string_view&   string,
                                      IntPtr                    numCodePoints)
{
    // Unpack the view and defer to the '(pointer, length)' overload.

    const char *const firstByte = string.data();

    return advanceRaw(result,
                      firstByte,
                      string.length(),
                      numCodePoints);
}

inline
int Utf8Util::appendUtf8Character(bsl::string  *output,
                                  unsigned int  codePoint)
{
    // Deprecated spelling; the work is done by 'appendUtf8CodePoint'.

    const int rc = appendUtf8CodePoint(output, codePoint);

    return rc;
}

inline
int Utf8Util::getByteSize(const char *codePoint)
{
    // Deprecated spelling; the work is done by 'numBytesInCodePoint'.

    const int numBytes = numBytesInCodePoint(codePoint);

    return numBytes;
}

inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset)
{
    // No delimiter was supplied, so lines are separated by '\n'.

    const char defaultLineDelimiter = '\n';

    return getLineAndColumnNumber(lineNumber,
                                  utf8Column,
                                  startOfLineByteOffset,
                                  input,
                                  byteOffset,
                                  defaultLineDelimiter);
}

inline
int Utf8Util::getLineAndColumnNumber(Uint64         *lineNumber,
                                     Uint64         *utf8Column,
                                     Uint64         *startOfLineByteOffset,
                                     bsl::streambuf *input,
                                     Uint64          byteOffset,
                                     char            lineDelimeter)
{
    // The implementation utility reads 'input' through a caller-supplied
    // scratch buffer; provide one with automatic storage duration here.

    enum { k_SCRATCH_NUM_BYTES = 2048 };

    char scratch[k_SCRATCH_NUM_BYTES];

    const int rc = Utf8Util_ImpUtil::getLineAndColumnNumber(
                                                         lineNumber,
                                                         utf8Column,
                                                         startOfLineByteOffset,
                                                         input,
                                                         byteOffset,
                                                         lineDelimeter,
                                                         scratch,
                                                         k_SCRATCH_NUM_BYTES);
    return rc;
}

inline
bool Utf8Util::isValid(const char *string)
{
    BSLS_ASSERT(string);

    // The caller does not want the position of the first invalid sequence,
    // so give the reporting overload a throw-away location to write it to.

    const char *ignoredPosition = 0;

    const bool valid = isValid(&ignoredPosition, string);

    return valid;
}

inline
bool Utf8Util::isValid(const char *string, size_type length)
{
    BSLS_ASSERT(string || 0 == length);

    // The caller does not want the position of the first invalid sequence,
    // so give the reporting overload a throw-away location to write it to.

    const char *ignoredPosition = 0;

    const bool valid = isValid(&ignoredPosition, string, length);

    return valid;
}

inline
bool Utf8Util::isValid(const bsl::string_view& string)
{
    // The caller does not want the position of the first invalid sequence,
    // so give the reporting overload a throw-away location to write it to.

    const char *ignoredPosition = 0;

    const bool valid = isValid(&ignoredPosition, string);

    return valid;
}

inline
Utf8Util::IntPtr Utf8Util::numBytesIfValid(
                                         const bsl::string_view& string,
                                         IntPtr                  numCodePoints)
{
    // Deprecated spelling; the work is done by 'numBytesRaw'.

    const IntPtr numBytes = numBytesRaw(string, numCodePoints);

    return numBytes;
}

inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string)
{
    // Deprecated spelling; the work is done by 'numCodePointsRaw'.

    const IntPtr count = numCodePointsRaw(string);

    return count;
}

inline
Utf8Util::IntPtr Utf8Util::numCharacters(const char *string, size_type length)
{
    // Deprecated spelling; the work is done by 'numCodePointsRaw'.

    const IntPtr count = numCodePointsRaw(string, length);

    return count;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string)
{
    // Deprecated spelling; the work is done by 'numCodePointsIfValid'.

    const IntPtr countOrStatus = numCodePointsIfValid(invalidString, string);

    return countOrStatus;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersIfValid(const char **invalidString,
                                                const char  *string,
                                                size_type    length)
{
    // Deprecated spelling; the work is done by 'numCodePointsIfValid'.

    const IntPtr countOrStatus = numCodePointsIfValid(invalidString,
                                                      string,
                                                      length);
    return countOrStatus;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string)
{
    // Deprecated spelling; the work is done by 'numCodePointsRaw'.

    const IntPtr count = numCodePointsRaw(string);

    return count;
}

inline
Utf8Util::IntPtr Utf8Util::numCharactersRaw(const char *string,
                                            size_type   length)
{
    // Deprecated spelling; the work is done by 'numCodePointsRaw'.

    const IntPtr count = numCodePointsRaw(string, length);

    return count;
}

inline
Utf8Util::IntPtr Utf8Util::numCodePointsRaw(const bsl::string_view& string)
{
    // Unpack the view and defer to the '(pointer, length)' overload.

    const char *const firstByte = string.data();

    return numCodePointsRaw(firstByte, string.length());
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2015 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------