BDE 4.14.0 Production release
|
#include <bdlde_utf8util.h>
Public Types | |
enum | ErrorStatus { k_END_OF_INPUT_TRUNCATION = -1 , k_UNEXPECTED_CONTINUATION_OCTET = -2 , k_NON_CONTINUATION_OCTET = -3 , k_OVERLONG_ENCODING = -4 , k_INVALID_INITIAL_OCTET = -5 , k_VALUE_LARGER_THAN_0X10FFFF = -6 , k_SURROGATE = -7 } |
typedef bsls::Types::size_type | size_type |
typedef bsls::Types::IntPtr | IntPtr |
typedef bsls::Types::Uint64 | Uint64 |
Static Public Member Functions | |
static IntPtr | advanceIfValid (int *status, const char **result, const char *string, IntPtr numCodePoints) |
static IntPtr | advanceIfValid (int *status, const char **result, const char *string, size_type length, IntPtr numCodePoints) |
static IntPtr | advanceIfValid (int *status, const char **result, const bsl::string_view &string, IntPtr numCodePoints) |
static IntPtr | advanceRaw (const char **result, const char *string, IntPtr numCodePoints) |
static IntPtr | advanceRaw (const char **result, const char *string, size_type length, IntPtr numCodePoints) |
static IntPtr | advanceRaw (const char **result, const bsl::string_view &string, IntPtr numCodePoints) |
static int | appendUtf8Character (bsl::string *output, unsigned int codePoint) |
static int | appendUtf8CodePoint (bsl::string *output, unsigned int codePoint) |
static int | appendUtf8CodePoint (std::string *output, unsigned int codePoint) |
static int | codePointValue (const char *codePoint) |
static int | getByteSize (const char *codePoint) |
static int | numBytesInCodePoint (const char *codePoint) |
static int | getLineAndColumnNumber (Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset) |
static int | getLineAndColumnNumber (Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset, char lineDelimeter) |
static bool | isValid (const char *string) |
static bool | isValid (const char *string, size_type length) |
static bool | isValid (const bsl::string_view &string) |
static bool | isValid (const char **invalidString, const char *string) |
static bool | isValid (const char **invalidString, const char *string, size_type length) |
static bool | isValid (const char **invalidString, const bsl::string_view &string) |
static bool | isValidCodePoint (int *status, const char *codePoint, size_type numBytes) |
static IntPtr | numBytesIfValid (const bsl::string_view &string, IntPtr numCodePoints) |
static IntPtr | numBytesRaw (const bsl::string_view &string, IntPtr numCodePoints) |
static IntPtr | numCharacters (const char *string) |
static IntPtr | numCharacters (const char *string, size_type length) |
static IntPtr | numCharactersIfValid (const char **invalidString, const char *string) |
static IntPtr | numCharactersIfValid (const char **invalidString, const char *string, size_type length) |
static IntPtr | numCharactersRaw (const char *string) |
static IntPtr | numCharactersRaw (const char *string, size_type length) |
static IntPtr | numCodePointsIfValid (const char **invalidString, const char *string) |
static IntPtr | numCodePointsIfValid (const char **invalidString, const char *string, size_type length) |
static IntPtr | numCodePointsIfValid (const char **invalidString, const bsl::string_view &string) |
static IntPtr | numCodePointsRaw (const char *string) |
static IntPtr | numCodePointsRaw (const char *string, size_type length) |
static IntPtr | numCodePointsRaw (const bsl::string_view &string) |
static size_type | readIfValid (int *status, char *outputBuffer, size_type outputBufferLength, bsl::streambuf *input) |
static const char * | toAscii (IntPtr value) |
This struct provides a namespace for static methods used for validating UTF-8 strings, for counting the number of Unicode code points in them, for advancing pointers through UTF-8 strings by a specified number of Unicode code points, for counting the number of bytes a UTF-8 leading substring occupies, for counting the number of bytes in a UTF-8 character, and for appending a Unicode character to a UTF-8 string.
Enumerate the error status values that are returned (possibly through an out parameter) from some methods in this utility. Note that some of the functions in this struct
have a return value that is non-negative on success, and one of these values when an error occurs, so all of these values must be negative to distinguish them from a "success" value.
Enumerator | |
---|---|
k_END_OF_INPUT_TRUNCATION | |
k_UNEXPECTED_CONTINUATION_OCTET | |
k_NON_CONTINUATION_OCTET | |
k_OVERLONG_ENCODING | |
k_INVALID_INITIAL_OCTET | |
k_VALUE_LARGER_THAN_0X10FFFF | |
k_SURROGATE |
|
inlinestatic |
Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string
, until either the specified numCodePoints
or the whole string
have been traversed, or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status
to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus
enum
otherwise. Set the specified *result
to the address of the byte immediately following the last valid code point traversed, or to string
if its length or numCodePoints
is 0. string
need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than string.length()
Unicode code points.
|
static |
Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string
, until either the specified numCodePoints
have been traversed, or the terminating null byte or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status
to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus
enum
otherwise. Set the specified *result
to the address of the byte immediately following the last valid code point traversed, or to string
if string
is empty or numCodePoints
is 0. string
is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
static |
Advance past 0 or more consecutive valid Unicode code points at the beginning of the specified string
having the specified length
(in bytes), until either the specified numCodePoints
or length
bytes have been traversed, or invalid UTF-8 is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *status
to 0 if no invalid UTF-8 is encountered, and to a value from the ErrorStatus
enum
otherwise. Set the specified *result
to the address of the byte immediately following the last valid code point traversed, or to string
if length
or numCodePoints
is 0. string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). The behavior is undefined unless 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than length
Unicode code points.
|
inlinestatic |
Advance past 0 or more consecutive Unicode code points at the beginning of the specified string
, until either the specified numCodePoints
or the whole string have been traversed (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result
to the address of the byte immediately following the last code point traversed, or to string
if its length
or numCodePoints
is 0. string
need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless string
contains only valid UTF-8 characters and 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than string.length()
Unicode code points.
|
static |
Advance past 0 or more consecutive Unicode code points at the beginning of the specified string
, until either the specified numCodePoints
have been traversed or the terminating null byte is encountered (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result
to the address of the byte immediately following the last code point traversed, or to string
if string
is empty or numCodePoints
is 0. string
is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string
contains valid UTF-8 and 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
static |
Advance past 0 or more consecutive Unicode code points at the beginning of the specified string
having the specified length
(in bytes), until either the specified numCodePoints
or length
bytes have been traversed (whichever occurs first), and return the number of Unicode code points traversed. Set the specified *result
to the address of the byte immediately following the last code point traversed, or to string
if length
or numCodePoints
is 0. string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). The behavior is undefined unless the initial length
bytes of string
contain valid UTF-8 and 0 <= numCodePoints
. Note that the value returned will be in the range [0 .. numCodePoints]
. Also note that string
may contain less than length
Unicode code points.
|
inlinestatic |
Append the UTF-8 encoding of the specified Unicode codePoint
to the specified output
string. Return 0 on success, and a non-zero value otherwise.
|
static |
|
static |
Append the UTF-8 encoding of the specified Unicode codePoint
to the specified output
string. Return 0 on success, and a non-zero value otherwise.
|
static |
Return the numeric value of the UTF-8-encoded code point beginning at the specified codePoint
. The behavior is undefined unless codePoint
is the address of the first byte of a valid UTF-8 encoded character.
|
inlinestatic |
Return the length (in bytes) of the UTF-8-encoded code point beginning at the specified codePoint
. The behavior is undefined unless codePoint
is the address of the first byte of a valid UTF-8 encoded character. Note that the value returned will be in the range [1 .. 4]
. Also note that 1 is returned if 0 == *codePoint
since '\0' is a valid 1-byte encoding.
|
inlinestatic |
|
inlinestatic |
For the specified byteOffset
in the specified input
, load the offset's line number into the specified lineNumber
, the column number into the specified utf8Column
, and the byte offset for the start of the line into startOfLineByteOffset
. Optionally specify lineDelimeter
used to determine the line separator. If lineDelimeter
is not supplied, lines are delimited using '\n'. Return 0 on success, or a non-zero value if byteOffset
cannot be found in input
or if input
contains non-UTF-8 characters. The utf8Column
is the number of UTF-8 code points between startOfLineByteOffset
and byteOffset
.
|
inlinestatic |
Return true
if the specified string
contains valid UTF-8, and false
otherwise. string
need not be null-terminated and can contain embedded null bytes.
|
static |
Return true
if the specified string
contains only valid UTF-8 characters, and false
otherwise. If string
contains invalid UTF-8, load into the specified invalidString
the address of the byte after the last valid code point traversed; invalidString
is unaffected if string
contains only valid UTF-8. string
need not be null-terminated and can contain embedded null bytes.
|
static |
Return true
if the specified string
contains valid UTF-8, and false
otherwise. If string
contains invalid UTF-8, load into the specified invalidString
the address of the beginning of the first invalid UTF-8 sequence encountered; invalidString
is unaffected if string
contains only valid UTF-8. string
is necessarily null-terminated, so it cannot contain embedded null bytes.
|
static |
Return true
if the specified string
having the specified length
(in bytes) contains valid UTF-8, and false
otherwise. If string
contains invalid UTF-8, load into the specified invalidString
the address of the byte after the last valid code point traversed; invalidString
is unaffected if string
contains only valid UTF-8. string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}).
|
inlinestatic |
Return true
if the specified string
contains valid UTF-8, and false
otherwise. string
is necessarily null-terminated, so it cannot contain embedded null bytes.
|
inlinestatic |
Return true
if the specified string
having the specified length
(in bytes) contains valid UTF-8, and false
otherwise. string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}).
|
static |
If the specified codePoint
(having at least the specified numBytes
) refers to a valid UTF-8 code point then return true
and load the specified status
with the number of bytes in the code-point; otherwise, if codePoint
is not a valid code-point, return false
and load status
with one of the (negative) ErrorStatus
constants. The behavior is undefined unless numBytes > 0
.
|
inlinestatic |
Return the length (in bytes) of the specified numCodePoints
UTF-8 encodings in the specified string
, or a value less than 0 if string
contains less than numCodePoints
encodings. The behavior is undefined unless string
refers to valid UTF-8. Note that string
may contain more than numCodePoints
encodings in which case the trailing ones are ignored.
|
static |
Return the length (in bytes) of the UTF-8-encoded code point beginning at the specified codePoint
. The behavior is undefined unless codePoint
is the address of the first byte of a valid UTF-8 encoded character. Note that the value returned will be in the range [1 .. 4]
. Also note that 1 is returned if 0 == *codePoint
since '\0' is a valid 1-byte encoding.
|
static |
Return the length (in bytes) of the specified numCodePoints
UTF-8 encodings in the specified string
, or a value less than 0 if string
contains less than numCodePoints
encodings. The behavior is undefined unless string
refers to valid UTF-8. Note that string
may contain more than numCodePoints
encodings in which case the trailing ones are ignored.
|
inlinestatic |
Return the number of Unicode code points in the specified string
. string
is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
having the specified length
(in bytes). string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than length
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
if it contains valid UTF-8, with no effect on the specified invalidString
. Otherwise, return a negative value and load into invalidString
the address of the byte after the last valid Unicode code point traversed. string
is necessarily null-terminated, so it cannot contain embedded null bytes. Note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
having the specified length
(in bytes) if string
contains valid UTF-8, with no effect on the specified invalidString
. Otherwise, return a negative value and load into invalidString
the address of the byte after the last valid Unicode code point traversed. string
need not be null-terminated and may contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). Note that string
may contain less than length
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
. string
is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
having the specified length
(in bytes). string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than length
Unicode code points.
|
static |
Return the number of Unicode code points in the specified string
if string
contains valid UTF-8, with no effect on the specified invalidString
. Otherwise, return a value from the ErrorStatus
enum
(which are all negative) and load into invalidString
the address of the byte after the last valid Unicode code point traversed. string
need not be null-terminated and may contain embedded null bytes.
|
static |
Return the number of Unicode code points in the specified string
if it contains valid UTF-8, with no effect on the specified invalidString
. Otherwise, return a value from the ErrorStatus
enum
(which are all negative) and load into invalidString
the address of the byte after the last valid Unicode code point traversed. string
is necessarily null-terminated, so it cannot contain embedded null bytes. Note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
static |
Return the number of Unicode code points in the specified string
having the specified length
(in bytes) if string
contains valid UTF-8, with no effect on the specified invalidString
. Otherwise, return a value from the ErrorStatus
enum
(which are all negative) and load into invalidString
the address of the byte after the last valid Unicode code point traversed. string
need not be null-terminated and may contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). Note that string
may contain less than length
Unicode code points.
|
inlinestatic |
Return the number of Unicode code points in the specified string
. string
need not be null-terminated and can contain embedded null bytes. The behavior is undefined unless string
contains valid UTF-8.
|
static |
Return the number of Unicode code points in the specified string
. string
is necessarily null-terminated, so it cannot contain embedded null bytes. The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than bsl::strlen(string)
Unicode code points.
|
static |
Return the number of Unicode code points in the specified string
having the specified length
(in bytes). string
need not be null-terminated and can contain embedded null bytes, and string
may be null if 0 == length
(see {Empty Input Strings}). The behavior is undefined unless string
contains valid UTF-8. Note that string
may contain less than length
Unicode code points.
|
static |
Read from the specified input
and copy valid UTF-8 (only) to the specified outputBuffer
having the specified outputBufferLength
(in bytes). Load the specified status
with:
- 0 if input reached eof without encountering any invalid UTF-8 or prematurely exhausting outputBuffer.
- a positive value if input was not completely read due to outputBuffer being filled (or nearly filled) without encountering any invalid UTF-8.
- a negative value from ErrorStatus if invalid UTF-8 was encountered (without having written the invalid sequence to outputBuffer).
Return the number of bytes of valid UTF-8 written to outputBuffer. If no invalid UTF-8 is encountered, or if input
supports sputbackc
with a putback buffer capacity of at least 4 bytes, input
will be left positioned at the end of the valid UTF-8 read, otherwise, input
will be left in an unspecified state. The behavior is undefined unless 4 <= outputBufferLength
. Note that this function will stop reading input
when less than 4 bytes of space remain in outputBuffer
to prevent the possibility of a 4-byte UTF-8 sequence being truncated partway through.
|
static |
Return the non-modifiable string representation of the ErrorStatus
enumerator matching the specified value
, if it exists, and "(* unrecognized value *)" otherwise. The string representation of an enumerator that matches value
is the enumerator name with the "k_" prefix elided. Note that this method may be used to aid in interpreting status values that are returned from some methods in this utility. See ErrorStatus
.