BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlde::Utf8Util Struct Reference

#include <bdlde_utf8util.h>

Public Types

enum  ErrorStatus {
  k_END_OF_INPUT_TRUNCATION = -1 , k_UNEXPECTED_CONTINUATION_OCTET = -2 , k_NON_CONTINUATION_OCTET = -3 , k_OVERLONG_ENCODING = -4 ,
  k_INVALID_INITIAL_OCTET = -5 , k_VALUE_LARGER_THAN_0X10FFFF = -6 , k_SURROGATE = -7
}
 
typedef bsls::Types::size_type size_type
 
typedef bsls::Types::IntPtr IntPtr
 
typedef bsls::Types::Uint64 Uint64
 

Static Public Member Functions

static IntPtr advanceIfValid (int *status, const char **result, const char *string, IntPtr numCodePoints)
 
static IntPtr advanceIfValid (int *status, const char **result, const char *string, size_type length, IntPtr numCodePoints)
 
static IntPtr advanceIfValid (int *status, const char **result, const bsl::string_view &string, IntPtr numCodePoints)
 
static IntPtr advanceRaw (const char **result, const char *string, IntPtr numCodePoints)
 
static IntPtr advanceRaw (const char **result, const char *string, size_type length, IntPtr numCodePoints)
 
static IntPtr advanceRaw (const char **result, const bsl::string_view &string, IntPtr numCodePoints)
 
static int appendUtf8Character (bsl::string *output, unsigned int codePoint)
 
static int appendUtf8CodePoint (bsl::string *output, unsigned int codePoint)
 
static int appendUtf8CodePoint (std::string *output, unsigned int codePoint)
 
static int codePointValue (const char *codePoint)
 
static int getByteSize (const char *codePoint)
 
static int numBytesInCodePoint (const char *codePoint)
 
static int getLineAndColumnNumber (Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset)
 
static int getLineAndColumnNumber (Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset, char lineDelimeter)
 
static bool isValid (const char *string)
 
static bool isValid (const char *string, size_type length)
 
static bool isValid (const bsl::string_view &string)
 
static bool isValid (const char **invalidString, const char *string)
 
static bool isValid (const char **invalidString, const char *string, size_type length)
 
static bool isValid (const char **invalidString, const bsl::string_view &string)
 
static bool isValidCodePoint (int *status, const char *codePoint, size_type numBytes)
 
static IntPtr numBytesIfValid (const bsl::string_view &string, IntPtr numCodePoints)
 
static IntPtr numBytesRaw (const bsl::string_view &string, IntPtr numCodePoints)
 
static IntPtr numCharacters (const char *string)
 
static IntPtr numCharacters (const char *string, size_type length)
 
static IntPtr numCharactersIfValid (const char **invalidString, const char *string)
 
static IntPtr numCharactersIfValid (const char **invalidString, const char *string, size_type length)
 
static IntPtr numCharactersRaw (const char *string)
 
static IntPtr numCharactersRaw (const char *string, size_type length)
 
static IntPtr numCodePointsIfValid (const char **invalidString, const char *string)
 
static IntPtr numCodePointsIfValid (const char **invalidString, const char *string, size_type length)
 
static IntPtr numCodePointsIfValid (const char **invalidString, const bsl::string_view &string)
 
static IntPtr numCodePointsRaw (const char *string)
 
static IntPtr numCodePointsRaw (const char *string, size_type length)
 
static IntPtr numCodePointsRaw (const bsl::string_view &string)
 
static size_type readIfValid (int *status, char *outputBuffer, size_type outputBufferLength, bsl::streambuf *input)
 
static const char * toAscii (IntPtr value)
 

Detailed Description

This struct provides a namespace for static methods used for validating UTF-8 strings, for counting the number of Unicode code points in them, for advancing pointers through UTF-8 strings by a specified number of Unicode code points, for counting the number of bytes a UTF-8 leading substring occupies, for counting the number of bytes in a UTF-8 character, and for appending a Unicode character to a UTF-8 string.

Member Typedef Documentation

◆ IntPtr

◆ size_type

◆ Uint64

Member Enumeration Documentation

◆ ErrorStatus

Enumerate the error status values that are returned (possibly through an out parameter) from some methods in this utility. Note that some of the functions in this struct have a return value that is non-negative on success, and one of these values when an error occurs, so all of these values must be negative to distinguish them from a "success" value.

Enumerator
k_END_OF_INPUT_TRUNCATION 
k_UNEXPECTED_CONTINUATION_OCTET 
k_NON_CONTINUATION_OCTET 
k_OVERLONG_ENCODING 
k_INVALID_INITIAL_OCTET 
k_VALUE_LARGER_THAN_0X10FFFF 
k_SURROGATE 

Member Function Documentation

◆ advanceIfValid() [1/3]

Utf8Util::IntPtr bdlde::Utf8Util::advanceIfValid ( int *  status,
const char **  result,
const bsl::string_view &  string,
IntPtr  numCodePoints 
)
inlinestatic

Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string, until either the specified numCodePoints or the whole string have been traversed, or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus enum otherwise. Set the specified *result to the address of the byte immediately following the last valid code point traversed, or to string if its length or numCodePoints is 0. string need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than string.length() Unicode code points.

◆ advanceIfValid() [2/3]

static IntPtr bdlde::Utf8Util::advanceIfValid ( int *  status,
const char **  result,
const char *  string,
IntPtr  numCodePoints 
)
static

Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string, until either the specified numCodePoints have been traversed, or the terminating null byte or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus enum otherwise. Set the specified *result to the address of the byte immediately following the last valid code point traversed, or to string if string is empty or numCodePoints is 0. string is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than bsl::strlen(string) Unicode code points.

◆ advanceIfValid() [3/3]

static IntPtr bdlde::Utf8Util::advanceIfValid ( int *  status,
const char **  result,
const char *  string,
size_type  length,
IntPtr  numCodePoints 
)
static

Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string having the specified length (in bytes), until either the specified numCodePoints or length bytes have been traversed, or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus enum otherwise. Set the specified *result to the address of the byte immediately following the last valid code point traversed, or to string if length or numCodePoints is 0. string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). The behavior is undefined unless 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than length Unicode code points.

◆ advanceRaw() [1/3]

Utf8Util::IntPtr bdlde::Utf8Util::advanceRaw ( const char **  result,
const bsl::string_view &  string,
IntPtr  numCodePoints 
)
inlinestatic

Advance past 0 or more consecutive Unicode code points at the beginning of the specified string, until either the specified numCodePoints or the whole string have been traversed (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result to the address of the byte immediately following the last code point traversed, or to string if its length or numCodePoints is 0. string need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless string contains only valid UTF-8 characters and 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than string.length() Unicode code points.

◆ advanceRaw() [2/3]

static IntPtr bdlde::Utf8Util::advanceRaw ( const char **  result,
const char *  string,
IntPtr  numCodePoints 
)
static

Advance past 0 or more consecutive Unicode code points at the beginning of the specified string, until either the specified numCodePoints have been traversed or the terminating null byte is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result to the address of the byte immediately following the last code point traversed, or to string if string is empty or numCodePoints is 0. string is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string contains valid UTF-8 and 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than bsl::strlen(string) Unicode code points.

◆ advanceRaw() [3/3]

static IntPtr bdlde::Utf8Util::advanceRaw ( const char **  result,
const char *  string,
size_type  length,
IntPtr  numCodePoints 
)
static

Advance past 0 or more consecutive Unicode code points at the beginning of the specified string having the specified length (in bytes), until either the specified numCodePoints or length bytes have been traversed (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result to the address of the byte immediately following the last code point traversed, or to string if length or numCodePoints is 0. string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). The behavior is undefined unless the initial length bytes of string contain valid UTF-8 and 0 <= numCodePoints. Note that the value returned will be in the range [0 .. numCodePoints]. Also note that string may contain less than length Unicode code points.

◆ appendUtf8Character()

int bdlde::Utf8Util::appendUtf8Character ( bsl::string *  output,
unsigned int  codePoint 
)
inlinestatic
Deprecated:
Use appendUtf8CodePoint instead.

Append the UTF-8 encoding of the specified Unicode codePoint to the specified output string. Return 0 on success, and a non-zero value otherwise.

◆ appendUtf8CodePoint() [1/2]

static int bdlde::Utf8Util::appendUtf8CodePoint ( bsl::string *  output,
unsigned int  codePoint 
)
static

◆ appendUtf8CodePoint() [2/2]

static int bdlde::Utf8Util::appendUtf8CodePoint ( std::string *  output,
unsigned int  codePoint 
)
static

Append the UTF-8 encoding of the specified Unicode codePoint to the specified output string. Return 0 on success, and a non-zero value otherwise.

◆ codePointValue()

static int bdlde::Utf8Util::codePointValue ( const char *  codePoint)
static

Return the numeric value of the UTF-8-encoded code point beginning at the specified codePoint. The behavior is undefined unless codePoint is the address of the first byte of a valid UTF-8 encoded character.

◆ getByteSize()

int bdlde::Utf8Util::getByteSize ( const char *  codePoint)
inlinestatic
Deprecated:
Use numBytesInCodePoint instead.

Return the length (in bytes) of the UTF-8-encoded code point beginning at the specified codePoint. The behavior is undefined unless codePoint is the address of the first byte of a valid UTF-8 encoded character. Note that the value returned will be in the range [1 .. 4]. Also note that 1 is returned if 0 == *codePoint since '\0' is a valid 1-byte encoding.

◆ getLineAndColumnNumber() [1/2]

int bdlde::Utf8Util::getLineAndColumnNumber ( Uint64 *  lineNumber,
Uint64 *  utf8Column,
Uint64 *  startOfLineByteOffset,
bsl::streambuf *  input,
Uint64  byteOffset 
)
inlinestatic

◆ getLineAndColumnNumber() [2/2]

int bdlde::Utf8Util::getLineAndColumnNumber ( Uint64 *  lineNumber,
Uint64 *  utf8Column,
Uint64 *  startOfLineByteOffset,
bsl::streambuf *  input,
Uint64  byteOffset,
char  lineDelimeter 
)
inlinestatic

For the specified byteOffset in the specified input, load the offset's line number into the specified lineNumber, the column number into the specified utf8Column, and the byte offset for the start of the line into the specified startOfLineByteOffset. Optionally specify lineDelimeter, used to determine the line separator. If lineDelimeter is not supplied, lines are delimited using '\n'. Return 0 on success, or a non-zero value if the location cannot be found in input or if input contains non-UTF-8 characters. The utf8Column is the number of UTF-8 code points between startOfLineByteOffset and byteOffset.

◆ isValid() [1/6]

bool bdlde::Utf8Util::isValid ( const bsl::string_view &  string)
inlinestatic

Return true if the specified string contains valid UTF-8, and false otherwise. string need not be null-terminated and can contain embedded null bytes.

◆ isValid() [2/6]

static bool bdlde::Utf8Util::isValid ( const char **  invalidString,
const bsl::string_view &  string 
)
static

Return true if the specified string contains only valid UTF-8 characters, and false otherwise. If string contains invalid UTF-8, load into the specified invalidString the address of the byte after the last valid code point traversed; invalidString is unaffected if string contains only valid UTF-8. string need not be null-terminated and can contain embedded null bytes.

◆ isValid() [3/6]

static bool bdlde::Utf8Util::isValid ( const char **  invalidString,
const char *  string 
)
static

Return true if the specified string contains valid UTF-8, and false otherwise. If string contains invalid UTF-8, load into the specified invalidString the address of the beginning of the first invalid UTF-8 sequence encountered; invalidString is unaffected if string contains only valid UTF-8. string is necessarily null-terminated, so it cannot contain embedded null bytes.

◆ isValid() [4/6]

static bool bdlde::Utf8Util::isValid ( const char **  invalidString,
const char *  string,
size_type  length 
)
static

Return true if the specified string having the specified length (in bytes) contains valid UTF-8, and false otherwise. If string contains invalid UTF-8, load into the specified invalidString the address of the byte after the last valid code point traversed; invalidString is unaffected if string contains only valid UTF-8. string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}).

◆ isValid() [5/6]

bool bdlde::Utf8Util::isValid ( const char *  string)
inlinestatic

Return true if the specified string contains valid UTF-8, and false otherwise. string is necessarily null-terminated, so it cannot contain embedded null bytes.

◆ isValid() [6/6]

bool bdlde::Utf8Util::isValid ( const char *  string,
size_type  length 
)
inlinestatic

Return true if the specified string having the specified length (in bytes) contains valid UTF-8, and false otherwise. string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}).

◆ isValidCodePoint()

static bool bdlde::Utf8Util::isValidCodePoint ( int *  status,
const char *  codePoint,
size_type  numBytes 
)
static

If the specified codePoint (having at least the specified numBytes) refers to a valid UTF-8 code point then return true and load the specified status with the number of bytes in the code-point; otherwise, if codePoint is not a valid code-point, return false and load status with one of the (negative) ErrorStatus constants. The behavior is undefined unless numBytes > 0.

◆ numBytesIfValid()

Utf8Util::IntPtr bdlde::Utf8Util::numBytesIfValid ( const bsl::string_view &  string,
IntPtr  numCodePoints 
)
inlinestatic
Deprecated:
Use numBytesRaw instead.

Return the length (in bytes) of the specified numCodePoints UTF-8 encodings in the specified string, or a value less than 0 if string contains less than numCodePoints encodings. The behavior is undefined unless string refers to valid UTF-8. Note that string may contain more than numCodePoints encodings in which case the trailing ones are ignored.

◆ numBytesInCodePoint()

static int bdlde::Utf8Util::numBytesInCodePoint ( const char *  codePoint)
static

Return the length (in bytes) of the UTF-8-encoded code point beginning at the specified codePoint. The behavior is undefined unless codePoint is the address of the first byte of a valid UTF-8 encoded character. Note that the value returned will be in the range [1 .. 4]. Also note that 1 is returned if 0 == *codePoint since '\0' is a valid 1-byte encoding.

◆ numBytesRaw()

static IntPtr bdlde::Utf8Util::numBytesRaw ( const bsl::string_view &  string,
IntPtr  numCodePoints 
)
static

Return the length (in bytes) of the specified numCodePoints UTF-8 encodings in the specified string, or a value less than 0 if string contains less than numCodePoints encodings. The behavior is undefined unless string refers to valid UTF-8. Note that string may contain more than numCodePoints encodings in which case the trailing ones are ignored.

◆ numCharacters() [1/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharacters ( const char *  string)
inlinestatic
Deprecated:
Use numCodePointsRaw instead.

Return the number of Unicode code points in the specified string. string is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than bsl::strlen(string) Unicode code points.

◆ numCharacters() [2/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharacters ( const char *  string,
size_type  length 
)
inlinestatic
Deprecated:
Use numCodePointsRaw instead.

Return the number of Unicode code points in the specified string having the specified length (in bytes). string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than length Unicode code points.

◆ numCharactersIfValid() [1/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharactersIfValid ( const char **  invalidString,
const char *  string 
)
inlinestatic
Deprecated:
Use numCodePointsIfValid instead.

Return the number of Unicode code points in the specified string if it contains valid UTF-8, with no effect on the specified invalidString. Otherwise, return a negative value and load into invalidString the address of the byte after the last valid Unicode code point traversed. string is necessarily null-terminated, so it cannot contain embedded null bytes. Note that string may contain less than bsl::strlen(string) Unicode code points.

◆ numCharactersIfValid() [2/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharactersIfValid ( const char **  invalidString,
const char *  string,
size_type  length 
)
inlinestatic
Deprecated:
Use numCodePointsIfValid instead.

Return the number of Unicode code points in the specified string having the specified length (in bytes) if string contains valid UTF-8, with no effect on the specified invalidString. Otherwise, return a negative value and load into invalidString the address of the byte after the last valid Unicode code point traversed. string need not be null-terminated and may contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). Note that string may contain less than length Unicode code points.

◆ numCharactersRaw() [1/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharactersRaw ( const char *  string)
inlinestatic
Deprecated:
Use numCodePointsRaw instead.

Return the number of Unicode code points in the specified string. string is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than bsl::strlen(string) Unicode code points.

◆ numCharactersRaw() [2/2]

Utf8Util::IntPtr bdlde::Utf8Util::numCharactersRaw ( const char *  string,
size_type  length 
)
inlinestatic
Deprecated:
Use numCodePointsRaw instead.

Return the number of Unicode code points in the specified string having the specified length (in bytes). string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than length Unicode code points.

◆ numCodePointsIfValid() [1/3]

static IntPtr bdlde::Utf8Util::numCodePointsIfValid ( const char **  invalidString,
const bsl::string_view &  string 
)
static

Return the number of Unicode code points in the specified string if string contains valid UTF-8, with no effect on the specified invalidString. Otherwise, return a value from the ErrorStatus enum (which are all negative) and load into invalidString the address of the byte after the last valid Unicode code point traversed. string need not be null-terminated and may contain embedded null bytes.

◆ numCodePointsIfValid() [2/3]

static IntPtr bdlde::Utf8Util::numCodePointsIfValid ( const char **  invalidString,
const char *  string 
)
static

Return the number of Unicode code points in the specified string if it contains valid UTF-8, with no effect on the specified invalidString. Otherwise, return a value from the ErrorStatus enum (which are all negative) and load into invalidString the address of the byte after the last valid Unicode code point traversed. string is necessarily null-terminated, so it cannot contain embedded null bytes. Note that string may contain less than bsl::strlen(string) Unicode code points.

◆ numCodePointsIfValid() [3/3]

static IntPtr bdlde::Utf8Util::numCodePointsIfValid ( const char **  invalidString,
const char *  string,
size_type  length 
)
static

Return the number of Unicode code points in the specified string having the specified length (in bytes) if string contains valid UTF-8, with no effect on the specified invalidString. Otherwise, return a value from the ErrorStatus enum (which are all negative) and load into invalidString the address of the byte after the last valid Unicode code point traversed. string need not be null-terminated and may contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). Note that string may contain less than length Unicode code points.

◆ numCodePointsRaw() [1/3]

Utf8Util::IntPtr bdlde::Utf8Util::numCodePointsRaw ( const bsl::string_view &  string)
inlinestatic

Return the number of Unicode code points in the specified string. string need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless string contains valid UTF-8.

◆ numCodePointsRaw() [2/3]

static IntPtr bdlde::Utf8Util::numCodePointsRaw ( const char *  string)
static

Return the number of Unicode code points in the specified string. string is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than bsl::strlen(string) Unicode code points.

◆ numCodePointsRaw() [3/3]

static IntPtr bdlde::Utf8Util::numCodePointsRaw ( const char *  string,
size_type  length 
)
static

Return the number of Unicode code points in the specified string having the specified length (in bytes). string need not be null-terminated and can contain embedded null bytes, and string may be null if 0 == length (see {Empty Input Strings}). The behavior is undefined unless string contains valid UTF-8. Note that string may contain less than length Unicode code points.

◆ readIfValid()

static size_type bdlde::Utf8Util::readIfValid ( int *  status,
char *  outputBuffer,
size_type  outputBufferLength,
bsl::streambuf *  input 
)
static

Read from the specified input and copy valid UTF-8 (only) to the specified outputBuffer having the specified outputBufferLength (in bytes). Load the specified status with:

  • 0 if input reached eof without encountering any invalid UTF-8 or prematurely exhausting outputBuffer.
  • A positive value if input was not completely read due to outputBuffer being filled (or nearly filled) without encountering any invalid UTF-8.
  • A negative value from ErrorStatus if invalid UTF-8 was encountered (without having written the invalid sequence to outputBuffer).

Return the number of bytes of valid UTF-8 written to outputBuffer. If no invalid UTF-8 is encountered, or if input supports sputbackc with a putback buffer capacity of at least 4 bytes, input will be left positioned at the end of the valid UTF-8 read; otherwise, input will be left in an unspecified state. The behavior is undefined unless 4 <= outputBufferLength. Note that this function will stop reading input when less than 4 bytes of space remain in outputBuffer to prevent the possibility of a 4-byte UTF-8 sequence being truncated partway through.

◆ toAscii()

static const char * bdlde::Utf8Util::toAscii ( IntPtr  value)
static

Return the non-modifiable string representation of the ErrorStatus enumerator matching the specified value, if it exists, and "(* unrecognized value *)" otherwise. The string representation of an enumerator that matches value is the enumerator name with the "k_" prefix elided. Note that this method may be used to aid in interpreting status values that are returned from some methods in this utility. See ErrorStatus.


The documentation for this struct was generated from the following file: