BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlde_utf8util.h
Go to the documentation of this file.
1/// @file bdlde_utf8util.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlde_utf8util.h -*-C++-*-
8#ifndef INCLUDED_BDLDE_UTF8UTIL
9#define INCLUDED_BDLDE_UTF8UTIL
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdlde_utf8util bdlde_utf8util
15/// @brief Provide basic utilities for UTF-8 encodings.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdlde
19/// @{
20/// @addtogroup bdlde_utf8util
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdlde_utf8util-purpose"> Purpose</a>
25/// * <a href="#bdlde_utf8util-classes"> Classes </a>
26/// * <a href="#bdlde_utf8util-description"> Description </a>
27/// * <a href="#bdlde_utf8util-empty-input-strings"> Empty Input Strings </a>
28/// * <a href="#bdlde_utf8util-usage"> Usage </a>
29/// * <a href="#bdlde_utf8util-example-1-validating-strings-and-counting-unicode-code-points"> Example 1: Validating Strings and Counting Unicode Code Points </a>
30/// * <a href="#bdlde_utf8util-example-2-advancing-over-a-given-number-of-code-points"> Example 2: Advancing Over a Given Number of Code Points </a>
31/// * <a href="#bdlde_utf8util-example-3-validating-utf-8-read-from-a-bsl-streambuf"> Example 3: Validating UTF-8 Read from a bsl::streambuf </a>
32///
33/// # Purpose {#bdlde_utf8util-purpose}
34/// Provide basic utilities for UTF-8 encodings.
35///
36/// # Classes {#bdlde_utf8util-classes}
37///
38/// - bdlde::Utf8Util: namespace for utilities for UTF-8 encodings
39///
40/// # Description {#bdlde_utf8util-description}
41/// This component provides, within the `bdlde::Utf8Util` `struct`,
42/// a suite of static functions supporting UTF-8 encoded strings. Two
43/// interfaces are provided for each function, one where the length of the
44/// string (in *bytes*) is passed as a separate argument, and one where the
45/// string is passed as a null-terminated C-style string.
46///
47/// A string is deemed to contain valid UTF-8 if it is compliant with RFC 3629,
48/// meaning that only 1-, 2-, 3-, and 4-byte sequences are allowed. Values
49/// above `U+10ffff` are also not allowed.
50///
51/// Seven types of functions are provided:
52///
53/// * `isValid`, which checks for validity, per RFC 3629, of a (candidate)
54/// UTF-8 string. "Overlong values", that is, values encoded in more bytes
55/// than necessary, are not tolerated; nor are "surrogate values", which are
56/// values in the range `[U+d800 .. U+dfff]`.
57/// * `advanceIfValid` and `advanceRaw`, which advance some number of Unicode
58/// code points, each of which may be encoded in multiple bytes in a UTF-8
59/// string. `advanceRaw` assumes the string is valid UTF-8, while
60/// `advanceIfValid` checks the input for validity and stops advancing if a
61/// sequence is encountered that is not valid UTF-8.
62/// * `numCodePointsIfValid` and `numCodePointsRaw`, which return the number of
63/// Unicode code points in a UTF-8 string. Note that `numCodePointsIfValid`
64/// both validates a (candidate) UTF-8 string and counts the number of
65/// Unicode code points that it contains.
66/// * `numBytesRaw`, which returns the number of bytes a specified number
67/// of Unicode code points occupy in a UTF-8 string.
68/// * `numBytesInCodePoint`, which returns the length of a single UTF-8 encoded
69/// character.
70/// * `codePointValue`, which returns the integral value of a single UTF-8
71/// encoded character.
72/// * `appendUtf8CodePoint`, which appends a single Unicode code point to a
73/// UTF-8 string.
74///
75/// Embedded null bytes are allowed in strings that are accompanied by an
76/// explicit length argument. Naturally, null-terminated C-style strings cannot
77/// contain embedded null code points.
78///
79/// The UTF-8 format is described in the RFC 3629 document at:
80/// @code
81/// http://tools.ietf.org/html/rfc3629
82/// @endcode
83/// and in Wikipedia at:
84/// @code
85/// http://en.wikipedia.org/wiki/Utf-8
86/// @endcode
87///
88/// ## Empty Input Strings {#bdlde_utf8util-empty-input-strings}
89///
90///
91/// The utility functions provided by this component consider the empty string
92/// to be valid UTF-8. For those functions that take input as a
93/// `(pointer, length)` pair, if `0 == pointer` and `0 == length`, then the
94/// input is interpreted as a valid, empty string. However, if `0 == pointer`
95/// and `0 != length`, the behavior is undefined. All such functions have a
96/// counterpart that takes a lone pointer to a null-terminated (C-style) string.
97/// The behavior is always undefined if 0 is supplied for that lone pointer.
98///
99/// ## Usage {#bdlde_utf8util-usage}
100///
101///
102/// This section illustrates intended use of this component.
103///
104/// ### Example 1: Validating Strings and Counting Unicode Code Points {#bdlde_utf8util-example-1-validating-strings-and-counting-unicode-code-points}
105///
106///
107/// In this usage example, we will encode some Unicode code points in UTF-8
108/// strings and demonstrate those that are valid and those that are not.
109///
110/// First, we build an unquestionably valid UTF-8 string:
111/// @code
112/// bsl::string string;
113/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00);
114/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x856);
115/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a');
116/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa);
117/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xfff);
118/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w');
119/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd);
120/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '.');
121/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n');
122/// @endcode
123/// Then, we check its validity and measure its length:
124/// @code
125/// assert(true == bdlde::Utf8Util::isValid(string.data(), string.length()));
126/// assert(true == bdlde::Utf8Util::isValid(string.c_str()));
127///
128/// assert( 9 == bdlde::Utf8Util::numCodePointsRaw(string.data(),
129/// string.length()));
130/// assert( 9 == bdlde::Utf8Util::numCodePointsRaw(string.c_str()));
131/// @endcode
132/// Next, we append a lone surrogate value, `0xd8ab`, written as the raw
133/// 3-byte sequence "\xed\xa2\xab" to avoid validation:
134/// @code
135/// bsl::string stringWithSurrogate = string + "\xed\xa2\xab";
136///
137/// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.data(),
138/// stringWithSurrogate.length()));
139/// assert(false == bdlde::Utf8Util::isValid(stringWithSurrogate.c_str()));
140/// @endcode
141/// Then, we cannot use `numCodePointsRaw` to count the code points in
142/// `stringWithSurrogate`, since the behavior of that method is undefined unless
143/// the string is valid. Instead, the `numCodePointsIfValid` method can be used
144/// on strings whose validity we are uncertain of:
145/// @code
146/// const char *invalidPosition = 0;
147///
148/// bsls::Types::IntPtr rc;
149/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
150/// stringWithSurrogate.data(),
151/// stringWithSurrogate.length());
152/// assert(rc < 0);
153/// assert(bdlde::Utf8Util::k_SURROGATE == rc);
154/// assert(invalidPosition == stringWithSurrogate.data() + string.length());
155///
156/// invalidPosition = 0; // reset
157///
158/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
159/// stringWithSurrogate.c_str());
160/// assert(rc < 0);
161/// assert(bdlde::Utf8Util::k_SURROGATE == rc);
162/// assert(invalidPosition == stringWithSurrogate.data() + string.length());
163/// @endcode
164/// Now, we encode 0, which is allowed. However, note that we cannot use any
165/// interfaces that take a null-terminated string for this case:
166/// @code
167/// bsl::string stringWithNull = string;
168/// stringWithNull += '\0';
169///
170/// assert(true == bdlde::Utf8Util::isValid(stringWithNull.data(),
171/// stringWithNull.length()));
172///
173/// assert( 10 == bdlde::Utf8Util::numCodePointsRaw(stringWithNull.data(),
174/// stringWithNull.length()));
175/// @endcode
176/// Finally, we encode `0x3a` (`:`) as an overlong value using 2 bytes, which is
177/// not valid UTF-8 (since `:` can be "encoded" in 1 byte):
178/// @code
179/// bsl::string stringWithOverlong = string;
180/// stringWithOverlong += static_cast<char>(0xc0); // start of 2-byte
181/// // sequence
182/// stringWithOverlong += static_cast<char>(0x80 | ':'); // continuation byte
183///
184/// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.data(),
185/// stringWithOverlong.length()));
186/// assert(false == bdlde::Utf8Util::isValid(stringWithOverlong.c_str()));
187///
188/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
189/// stringWithOverlong.data(),
190/// stringWithOverlong.length());
191/// assert(rc < 0);
192/// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
193/// assert(invalidPosition == stringWithOverlong.data() + string.length());
194///
195/// rc = bdlde::Utf8Util::numCodePointsIfValid(&invalidPosition,
196/// stringWithOverlong.c_str());
197/// assert(rc < 0);
198/// assert(bdlde::Utf8Util::k_OVERLONG_ENCODING == rc);
199/// assert(invalidPosition == stringWithOverlong.data() + string.length());
200/// @endcode
201///
202/// ### Example 2: Advancing Over a Given Number of Code Points {#bdlde_utf8util-example-2-advancing-over-a-given-number-of-code-points}
203///
204///
205/// In this example, we will use the various `advance` functions to advance
206/// through a UTF-8 string.
207///
208/// First, build the string using `appendUtf8CodePoint`, keeping track of how
209/// many bytes are in each Unicode code point:
210/// @code
211/// bsl::string string;
212/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0xff00); // 3 bytes
213/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1ff); // 2 bytes
214/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'a'); // 1 byte
215/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1008aa); // 4 bytes
216/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 0x1abcd); // 4 bytes
217/// string += "\xe3\x8f\xfe"; // 3 bytes (invalid 3-byte sequence,
218/// // the first 2 bytes are valid but the
219/// // last continuation byte is invalid)
220/// bdlde::Utf8Util::appendUtf8CodePoint(&string, 'w'); // 1 byte
221/// bdlde::Utf8Util::appendUtf8CodePoint(&string, '\n'); // 1 byte
222/// @endcode
223/// Then, declare a few variables we'll need:
224/// @code
225/// bsls::Types::IntPtr rc;
226/// int status;
227/// const char *result;
228/// const char *const start = string.c_str();
229/// @endcode
230/// Next, try advancing 2 code points, then 3, then 4, observing that the value
231/// returned is the number of Unicode code points advanced. Note that since
232/// we're only advancing over valid UTF-8, we can use either `advanceRaw` or
233/// `advanceIfValid`:
234/// @code
235/// rc = bdlde::Utf8Util::advanceRaw( &result, start, 2);
236/// assert(2 == rc);
237/// assert(3 + 2 == result - start);
238///
239/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 2);
240/// assert(0 == status);
241/// assert(2 == rc);
242/// assert(3 + 2 == result - start);
243///
244/// rc = bdlde::Utf8Util::advanceRaw( &result, start, 3);
245/// assert(3 == rc);
246/// assert(3 + 2 + 1 == result - start);
247///
248/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 3);
249/// assert(0 == status);
250/// assert(3 == rc);
251/// assert(3 + 2 + 1 == result - start);
252///
253/// rc = bdlde::Utf8Util::advanceRaw( &result, start, 4);
254/// assert(4 == rc);
255/// assert(3 + 2 + 1 + 4 == result - start);
256///
257/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, 4);
258/// assert(0 == status);
259/// assert(4 == rc);
260/// assert(3 + 2 + 1 + 4 == result - start);
261/// @endcode
262/// Then, try advancing by more code points than are present using
263/// `advanceIfValid`, and wind up stopping when we encounter invalid input. The
264/// behavior of `advanceRaw` is undefined if it is used on invalid input, so we
265/// cannot use it here. Also note that we will stop at the beginning of the
266/// invalid Unicode code point, and not at the first incorrect byte, which is
267/// two bytes later:
268/// @code
269/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
270/// assert(0 != status);
271/// assert(5 == rc);
272/// assert(3 + 2 + 1 + 4 + 4 == result - start);
273/// assert(static_cast<int>(string.length()) > result - start);
274/// @endcode
275/// Now, doctor the string to replace the invalid code point with a valid one,
276/// so the string is entirely correct UTF-8:
277/// @code
278/// string[3 + 2 + 1 + 4 + 4 + 2] = static_cast<char>(0x8a);
279/// @endcode
280/// Finally, advance using both functions by more code points than are in the
281/// string and in both cases wind up at the end of the string. Note that
282/// `advanceIfValid` does not return an error (non-zero) value to `status` when
283/// it encounters the end of the string:
284/// @code
285/// rc = bdlde::Utf8Util::advanceRaw( &result, start, INT_MAX);
286/// assert(8 == rc);
287/// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1 == result - start);
288/// assert(static_cast<int>(string.length()) == result - start);
289///
290/// rc = bdlde::Utf8Util::advanceIfValid(&status, &result, start, INT_MAX);
291/// assert(0 == status);
292/// assert(8 == rc);
293/// assert(3 + 2 + 1 + 4 + 4 + 3 + 1 + 1 == result - start);
294/// assert(static_cast<int>(string.length()) == result - start);
295/// @endcode
296///
297/// ### Example 3: Validating UTF-8 Read from a bsl::streambuf {#bdlde_utf8util-example-3-validating-utf-8-read-from-a-bsl-streambuf}
298///
299///
300/// In this usage example, we will demonstrate reading and validating UTF-8
301/// from a stream.
302///
303/// We write a function to read valid UTF-8 to a `bsl::string`. We don't know
304/// how long the input will be, so we don't know how long to make the string
305/// before we start. We will grow the string in small, 32-byte increments.
306/// @code
307/// /// Read valid UTF-8 from the specified streambuf `sb` to the specified
308/// /// `output`. Return 0 if the input was exhausted without encountering
309/// /// any invalid UTF-8, and a non-zero value otherwise. If invalid UTF-8
310/// /// is encountered, log a message describing the problem after loading
311/// /// all the valid UTF-8 preceding it into `output`. Note that after the
312/// /// call, in no case will `output` contain any invalid UTF-8.
313/// int utf8StreambufToString(bsl::string *output,
314/// bsl::streambuf *sb)
315/// {
316/// enum { k_READ_LENGTH = 32 };
317///
318/// output->clear();
319/// while (true) {
320/// bsl::size_t len = output->length();
321/// output->resize(len + k_READ_LENGTH);
322/// int status;
323/// IntPtr numBytes = bdlde::Utf8Util::readIfValid(&status,
324/// &(*output)[len],
325/// k_READ_LENGTH,
326/// sb);
327/// BSLS_ASSERT(0 <= numBytes);
328/// BSLS_ASSERT(numBytes <= k_READ_LENGTH);
329///
330/// output->resize(len + numBytes);
331/// if (0 < status) {
332/// // Buffer was full before the end of input was encountered.
333/// // Note that `numBytes` may be up to 3 bytes less than
334/// // `k_READ_LENGTH`.
335///
336/// BSLS_ASSERT(k_READ_LENGTH - 4 < numBytes);
337///
338/// // Go on to grow the string and get more input.
339///
340/// continue;
341/// }
342/// else if (0 == status) {
343/// // Success! We've reached the end of input without
344/// // encountering any invalid UTF-8.
345///
346/// return 0; // RETURN
347/// }
348/// else {
349/// // Invalid UTF-8 encountered; the value of `status` indicates
350/// // the exact nature of the problem. `numBytes` returned from
351/// // the above call indicated the number of valid UTF-8 bytes
352/// // read before encountering the invalid UTF-8.
353///
354/// BSLS_LOG_ERROR("Invalid UTF-8 error %s at position %u.\n",
355/// bdlde::Utf8Util::toAscii(status),
356/// static_cast<unsigned>(output->length()));
357///
358/// return -1; // RETURN
359/// }
360/// }
361/// }
362/// @endcode
363/// @}
364/** @} */
365/** @} */
366
367/** @addtogroup bdl
368 * @{
369 */
370/** @addtogroup bdlde
371 * @{
372 */
373/** @addtogroup bdlde_utf8util
374 * @{
375 */
376
377#include <bdlscm_version.h>
378
379#include <bsls_assert.h>
380#include <bsls_libraryfeatures.h>
381#include <bsls_review.h>
382#include <bsls_types.h>
383
384#include <bsl_cstddef.h>
385#include <bsl_iosfwd.h>
386#include <bsl_streambuf.h>
387#include <bsl_string.h>
388
389#include <string>
390
391
392
393namespace bdlde {
394 // ===============
395 // struct Utf8Util
396 // ===============
397
398/// This struct provides a namespace for static methods used for validating
399/// UTF-8 strings, for counting the number of Unicode code points in them,
400/// for advancing pointers through UTF-8 strings by a specified number of
401/// Unicode code points, for counting the number of bytes a UTF-8 leading
402/// substring occupies, for counting the number of bytes in a UTF-8
403/// character, and for appending a Unicode character to a UTF-8 string.
404struct Utf8Util {
405
406 // PUBLIC TYPES
410
411 /// Enumerate the error status values that are returned (possibly
412 /// through an out parameter) from some methods in this utility. Note
413 /// that some of the functions in this `struct` have a return value
414 /// that is non-negative on success, and one of these values when an
415 /// error occurs, so all of these values must be negative to distinguish
416 /// them from a "success" value.
418
420 // The end of input was reached partway through a multibyte UTF-8
421 // sequence.
422
424 // A continuation byte was encountered when not within a multibyte
425 // sequence.
426
428 // A non-continuation byte was encountered where a continuation byte
429 // was expected.
430
432 // The encoded Unicode value could have been encoded in a sequence
433 // of fewer bytes.
434
436 // A sequence began with an octet with its 5 highest-order bits all
437 // set, which is always invalid in UTF-8.
438
440 // A value larger than 0x10FFFF was encoded.
441
442 k_SURROGATE = -7
443 // Illegal occurrence of Unicode code point reserved for surrogate
444 // values in UTF-16. Note that all surrogate values are illegal as
445 // Unicode code points.
446 };
447
448 // CLASS METHODS
449
450 /// Advance past 0 or more consecutive *valid* Unicode code points at
451 /// the beginning of the specified `string`, until either the specified
452 /// `numCodePoints` have been traversed, or the terminating null byte or
453 /// invalid UTF-8 is encountered (whichever occurs first), and return
454 /// the number of Unicode code points traversed. Set the specified
455 /// `*status` to 0 if no invalid UTF-8 is encountered, and to a value
456 /// from the `ErrorStatus` `enum` otherwise. Set the specified
457 /// `*result` to the address of the byte immediately following the last
458 /// valid code point traversed, or to `string` if `string` is empty or
459 /// `numCodePoints` is 0. `string` is necessarily null-terminated, so
460 /// it cannot contain embedded null bytes. The behavior is undefined
461 /// unless `0 <= numCodePoints`. Note that the value returned will be
462 /// in the range `[0 .. numCodePoints]`. Also note that `string` may
463 /// contain less than `bsl::strlen(string)` Unicode code points.
464 static IntPtr advanceIfValid(int *status,
465 const char **result,
466 const char *string,
467 IntPtr numCodePoints);
468
469 /// Advance past 0 or more consecutive *valid* Unicode code points at
470 /// the beginning of the specified `string` having the specified
471 /// `length` (in bytes), until either the specified `numCodePoints` or
472 /// `length` bytes have been traversed, or invalid UTF-8 is encountered
473 /// (whichever occurs first), and return the number of Unicode code
474 /// points traversed. Set the specified `*status` to 0 if no invalid
475 /// UTF-8 is encountered, and to a value from the `ErrorStatus` `enum`
476 /// otherwise. Set the specified `*result` to the address of the byte
477 /// immediately following the last valid code point traversed, or to
478 /// `string` if `length` or `numCodePoints` is 0. `string` need not be
479 /// null-terminated and can contain embedded null bytes, and `string`
480 /// may be null if `0 == length` (see {Empty Input Strings}). The
481 /// behavior is undefined unless `0 <= numCodePoints`. Note that the
482 /// value returned will be in the range `[0 .. numCodePoints]`. Also
483 /// note that `string` may contain less than `length` Unicode code
484 /// points.
485 static IntPtr advanceIfValid(int *status,
486 const char **result,
487 const char *string,
488 size_type length,
489 IntPtr numCodePoints);
490
491 /// Advance past 0 or more consecutive *valid* Unicode code points at
492 /// the beginning of the specified `string`, until either the specified
493 /// `numCodePoints` code points or the whole `string` have been traversed, or
494 /// invalid UTF-8 is encountered (whichever occurs first), and return
495 /// the number of Unicode code points traversed. Set the specified
496 /// `*status` to 0 if no invalid UTF-8 is encountered, and to a value
497 /// from the `ErrorStatus` `enum` otherwise. Set the specified
498 /// `*result` to the address of the byte immediately following the last
499 /// valid code point traversed, or to `string` if its length or
500 /// `numCodePoints` is 0. `string` need not be null-terminated and can
501 /// contain embedded null bytes. The behavior is undefined unless
502 /// `0 <= numCodePoints`. Note that the value returned will be in the
503 /// range `[0 .. numCodePoints]`. Also note that `string` may contain
504 /// less than `string.length()` Unicode code points.
505 static IntPtr advanceIfValid(int *status,
506 const char **result,
507 const bsl::string_view& string,
508 IntPtr numCodePoints);
509
510 /// Advance past 0 or more consecutive Unicode code points at the
511 /// beginning of the specified `string`, until either the specified
512 /// `numCodePoints` code points have been traversed or the terminating null
513 /// byte is encountered (whichever occurs first), and return the number
514 /// of Unicode code points traversed. Set the specified `*result` to
515 /// the address of the byte immediately following the last code point
516 /// traversed, or to `string` if `string` is empty or `numCodePoints` is
517 /// 0. `string` is necessarily null-terminated, so it cannot contain
518 /// embedded null bytes. The behavior is undefined unless `string`
519 /// contains valid UTF-8 and `0 <= numCodePoints`. Note that the value
520 /// returned will be in the range `[0 .. numCodePoints]`. Also note
521 /// that `string` may contain less than `bsl::strlen(string)` Unicode
522 /// code points.
523 static IntPtr advanceRaw(const char **result,
524 const char *string,
525 IntPtr numCodePoints);
526
527 /// Advance past 0 or more consecutive Unicode code points at the
528 /// beginning of the specified `string` having the specified `length`
529 /// (in bytes), until either the specified `numCodePoints` or `length`
530 /// bytes have been traversed (whichever occurs first), and return the
531 /// number of Unicode code points traversed. Set the specified
532 /// `*result` to the address of the byte immediately following the last
533 /// code point traversed, or to `string` if `length` or `numCodePoints`
534 /// is 0. `string` need not be null-terminated and can contain embedded
535 /// null bytes, and `string` may be null if `0 == length` (see {Empty
536 /// Input Strings}). The behavior is undefined unless the initial
537 /// `length` bytes of `string` contain valid UTF-8 and
538 /// `0 <= numCodePoints`. Note that the value returned will be in the
539 /// range `[0 .. numCodePoints]`. Also note that `string` may contain
540 /// less than `length` Unicode code points.
541 static IntPtr advanceRaw(const char **result,
542 const char *string,
543 size_type length,
544 IntPtr numCodePoints);
545
546 /// Advance past 0 or more consecutive Unicode code points at the
547 /// beginning of the specified `string`, until either the specified
548 /// `numCodePoints` code points or the whole string have been traversed
549 /// (whichever occurs first), and return the number of Unicode code
550 /// points traversed. Set the specified `*result` to the address of the
551 /// byte immediately following the last code point traversed, or to
552 /// `string` if its length or `numCodePoints` is 0. `string` need not be
553 /// null-terminated and can contain embedded null bytes. The behavior
554 /// is undefined unless `string` contains only valid UTF-8 characters
555 /// and `0 <= numCodePoints`. Note that the value returned will be in
556 /// the range `[0 .. numCodePoints]`. Also note that `string` may
557 /// contain less than `string.length()` Unicode code points.
558 static IntPtr advanceRaw(const char **result,
559 const bsl::string_view& string,
560 IntPtr numCodePoints);
561
562 /// @deprecated Use @ref appendUtf8CodePoint instead.
563 ///
564 /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
565 /// the specified `output` string. Return 0 on success, and a non-zero
566 /// value otherwise.
567 static int appendUtf8Character(bsl::string *output,
568 unsigned int codePoint);
569
571 unsigned int codePoint);
572 /// Append the UTF-8 encoding of the specified Unicode `codePoint` to
573 /// the specified `output` string. Return 0 on success, and a non-zero
574 /// value otherwise.
575 static int appendUtf8CodePoint(std::string *output,
576 unsigned int codePoint);
577#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
578 static int appendUtf8CodePoint(std::pmr::string *output,
579 unsigned int codePoint);
580#endif
581
582 /// Return the numeric value of the UTF-8-encoded code point beginning
583 /// at the specified `codePoint`. The behavior is undefined unless
584 /// `codePoint` is the address of the first byte of a valid UTF-8
585 /// encoded character.
586 static int codePointValue(const char *codePoint);
587
588 /// @deprecated Use @ref numBytesInCodePoint instead.
589 ///
590 /// Return the length (in bytes) of the UTF-8-encoded code point
591 /// beginning at the specified `codePoint`. The behavior is undefined
592 /// unless `codePoint` is the address of the first byte of a valid UTF-8
593 /// encoded character. Note that the value returned will be in the
594 /// range `[1 .. 4]`. Also note that 1 is returned if `0 == *codePoint`
595 /// since '\0' is a valid 1-byte encoding.
596 static int getByteSize(const char *codePoint);
597
598 /// Return the length (in bytes) of the UTF-8-encoded code point
599 /// beginning at the specified `codePoint`. The behavior is undefined
600 /// unless `codePoint` is the address of the first byte of a valid UTF-8
601 /// encoded character. Note that the value returned will be in the
602 /// range `[1 .. 4]`. Also note that 1 is returned if `0 == *codePoint`
603 /// since '\0' is a valid 1-byte encoding.
604 static int numBytesInCodePoint(const char *codePoint);
605
606 static int getLineAndColumnNumber(Uint64 *lineNumber,
607 Uint64 *utf8Column,
608 Uint64 *startOfLineByteOffset,
609 bsl::streambuf *input,
610 Uint64 byteOffset);
611 /// For the specified `byteOffset` in the specified `input`, load the
612 /// offset's line number into the specified `lineNumber`, the column
613 /// number into the specified `utf8Column`, and the byte offset for the
614 /// start of the line into `startOfLineByteOffset`. Optionally specify
615 /// `lineDelimeter` used to determine the line separator. If
616 /// `lineDelimeter` is not supplied, lines are delimited using '\n'.
617 /// Return 0 on success, or a non-zero value if `byteOffset` cannot be
618 /// found in `input` or if `input` contains non-UTF-8 characters. The
619 /// `utf8Column` is the number of UTF-8 code points between
620 /// `startOfLineByteOffset` and `byteOffset`.
621 static int getLineAndColumnNumber(Uint64 *lineNumber,
622 Uint64 *utf8Column,
623 Uint64 *startOfLineByteOffset,
624 bsl::streambuf *input,
625 Uint64 byteOffset,
626 char lineDelimeter);
627
628 /// Return `true` if the specified `string` contains valid UTF-8, and
629 /// `false` otherwise. `string` is necessarily null-terminated, so it
630 /// cannot contain embedded null bytes.
631 static bool isValid(const char *string);
632
633 /// Return `true` if the specified `string` having the specified
634 /// `length` (in bytes) contains valid UTF-8, and `false` otherwise.
635 /// `string` need not be null-terminated and can contain embedded null
636 /// bytes, and `string` may be null if `0 == length` (see {Empty Input
637 /// Strings}).
638 static bool isValid(const char *string, size_type length);
639
640 /// Return `true` if the specified `string` contains valid UTF-8, and
641 /// `false` otherwise. `string` need not be null-terminated and can
642 /// contain embedded null bytes.
643 static bool isValid(const bsl::string_view& string);
644
645 /// Return `true` if the specified `string` contains valid UTF-8, and
646 /// `false` otherwise. If `string` contains invalid UTF-8, load into
647 /// the specified `invalidString` the address of the beginning of the
648 /// first invalid UTF-8 sequence encountered; `invalidString` is
649 /// unaffected if `string` contains only valid UTF-8. `string` is
650 /// necessarily null-terminated, so it cannot contain embedded null
651 /// bytes.
652 static bool isValid(const char **invalidString, const char *string);
653
654 /// Return `true` if the specified `string` having the specified
655 /// `length` (in bytes) contains valid UTF-8, and `false` otherwise. If
656 /// `string` contains invalid UTF-8, load into the specified
657 /// `invalidString` the address of the byte after the last valid code
658 /// point traversed; `invalidString` is unaffected if `string` contains
659 /// only valid UTF-8. `string` need not be null-terminated and can
660 /// contain embedded null bytes, and `string` may be null if
661 /// `0 == length` (see {Empty Input Strings}).
662 static bool isValid(const char **invalidString,
663 const char *string,
664 size_type length);
665
666 /// Return `true` if the specified `string` contains only valid UTF-8
667 /// characters, and `false` otherwise. If `string` contains invalid
668 /// UTF-8, load into the specified `invalidString` the address of the
669 /// byte after the last valid code point traversed; `invalidString` is
670 /// unaffected if `string` contains only valid UTF-8. `string` need not
671 /// be null-terminated and can contain embedded null bytes.
672 static bool isValid(const char **invalidString,
673 const bsl::string_view& string);
674
675 /// If the specified `codePoint` (having at least the specified
676 /// `numBytes`) refers to a valid UTF-8 code point then return `true`
677 /// and load the specified `status` with the number of bytes in the
678 /// code-point; otherwise, if `codePoint` is not a valid code-point,
679 /// return `false` and load `status` with one of the (negative)
680 /// `ErrorStatus` constants. The behavior is undefined unless
681 /// `numBytes > 0`.
682 static bool isValidCodePoint(int *status,
683 const char *codePoint,
684 size_type numBytes);
685
686 /// @deprecated Use @ref numBytesRaw instead.
687 ///
688 /// Return the length (in bytes) of the specified `numCodePoints` UTF-8
689 /// encodings in the specified `string`, or a value less than 0 if
690 /// `string` contains less than `numCodePoints` encodings. The behavior
691 /// is undefined unless `string` refers to valid UTF-8. Note that
692 /// `string` may contain more than `numCodePoints` encodings in which
693 /// case the trailing ones are ignored.
694 static IntPtr numBytesIfValid(const bsl::string_view& string,
695 IntPtr numCodePoints);
696
697 /// Return the length (in bytes) of the specified `numCodePoints` UTF-8
698 /// encodings in the specified `string`, or a value less than 0 if
699 /// `string` contains less than `numCodePoints` encodings. The behavior
700 /// is undefined unless `string` refers to valid UTF-8. Note that
701 /// `string` may contain more than `numCodePoints` encodings in which
702 /// case the trailing ones are ignored.
703 static IntPtr numBytesRaw(const bsl::string_view& string,
704 IntPtr numCodePoints);
705
706 /// @deprecated Use @ref numCodePointsRaw instead.
707 ///
708 /// Return the number of Unicode code points in the specified `string`.
709 /// `string` is necessarily null-terminated, so it cannot contain
710 /// embedded null bytes. The behavior is undefined unless `string`
711 /// contains valid UTF-8. Note that `string` may contain less than
712 /// `bsl::strlen(string)` Unicode code points.
713 static IntPtr numCharacters(const char *string);
714
715 /// @deprecated Use @ref numCodePointsRaw instead.
716 ///
717 /// Return the number of Unicode code points in the specified `string`
718 /// having the specified `length` (in bytes). `string` need not be
719 /// null-terminated and can contain embedded null bytes, and `string`
720 /// may be null if `0 == length` (see {Empty Input Strings}). The
721 /// behavior is undefined unless `string` contains valid UTF-8. Note
722 /// that `string` may contain less than `length` Unicode code points.
723 static IntPtr numCharacters(const char *string, size_type length);
724
725 /// @deprecated Use @ref numCodePointsIfValid instead.
726 ///
727 /// Return the number of Unicode code points in the specified `string`
728 /// if it contains valid UTF-8, with no effect on the specified
729 /// `invalidString`. Otherwise, return a negative value and load into
730 /// `invalidString` the address of the byte after the last valid Unicode
731 /// code point traversed. `string` is necessarily null-terminated, so
732 /// it cannot contain embedded null bytes. Note that `string` may
733 /// contain less than `bsl::strlen(string)` Unicode code points.
734 static IntPtr numCharactersIfValid(const char **invalidString,
735 const char *string);
736
737 /// @deprecated Use @ref numCodePointsIfValid instead.
738 ///
739 /// Return the number of Unicode code points in the specified `string`
740 /// having the specified `length` (in bytes) if `string` contains valid
741 /// UTF-8, with no effect on the specified `invalidString`. Otherwise,
742 /// return a negative value and load into `invalidString` the address of
743 /// the byte after the last valid Unicode code point traversed.
744 /// `string` need not be null-terminated and may contain embedded null
745 /// bytes, and `string` may be null if `0 == length` (see {Empty Input
746 /// Strings}). Note that `string` may contain less than `length`
747 /// Unicode code points.
748 static IntPtr numCharactersIfValid(const char **invalidString,
749 const char *string,
750 size_type length);
751
752 /// @deprecated Use @ref numCodePointsRaw instead.
753 ///
754 /// Return the number of Unicode code points in the specified `string`.
755 /// `string` is necessarily null-terminated, so it cannot contain
756 /// embedded null bytes. The behavior is undefined unless `string`
757 /// contains valid UTF-8. Note that `string` may contain less than
758 /// `bsl::strlen(string)` Unicode code points.
759 static IntPtr numCharactersRaw(const char *string);
760
761 /// @deprecated Use @ref numCodePointsRaw instead.
762 ///
763 /// Return the number of Unicode code points in the specified `string`
764 /// having the specified `length` (in bytes). `string` need not be
765 /// null-terminated and can contain embedded null bytes, and `string`
766 /// may be null if `0 == length` (see {Empty Input Strings}). The
767 /// behavior is undefined unless `string` contains valid UTF-8. Note that
768 /// `string` may contain less than `length` Unicode code points.
769 static IntPtr numCharactersRaw(const char *string, size_type length);
770
771 /// Return the number of Unicode code points in the specified `string`
772 /// if it contains valid UTF-8, with no effect on the specified
773 /// `invalidString`. Otherwise, return a value from the `ErrorStatus`
774 /// `enum` (which are all negative) and load into `invalidString` the
775 /// address of the byte after the last valid Unicode code point
776 /// traversed. `string` is necessarily null-terminated, so it cannot
777 /// contain embedded null bytes. Note that `string` may contain less
778 /// than `bsl::strlen(string)` Unicode code points.
779 static IntPtr numCodePointsIfValid(const char **invalidString,
780 const char *string);
781
782 /// Return the number of Unicode code points in the specified `string`
783 /// having the specified `length` (in bytes) if `string` contains valid
784 /// UTF-8, with no effect on the specified `invalidString`. Otherwise,
785 /// return a value from the `ErrorStatus` `enum` (which are all
786 /// negative) and load into `invalidString` the address of the byte
787 /// after the last valid Unicode code point traversed. `string` need
788 /// not be null-terminated and may contain embedded null bytes, and
789 /// `string` may be null if `0 == length` (see {Empty Input Strings}).
790 /// Note that `string` may contain less than `length` Unicode code
791 /// points.
792 static IntPtr numCodePointsIfValid(const char **invalidString,
793 const char *string,
794 size_type length);
795
796 /// Return the number of Unicode code points in the specified `string`
797 /// if `string` contains valid UTF-8, with no effect on the specified
798 /// `invalidString`. Otherwise, return a value from the `ErrorStatus`
799 /// `enum` (which are all negative) and load into `invalidString` the
800 /// address of the byte after the last valid Unicode code point
801 /// traversed. `string` need not be null-terminated and may contain
802 /// embedded null bytes.
803 static IntPtr numCodePointsIfValid(const char **invalidString,
804 const bsl::string_view& string);
805
806 /// Return the number of Unicode code points in the specified `string`.
807 /// `string` is necessarily null-terminated, so it cannot contain
808 /// embedded null bytes. The behavior is undefined unless `string`
809 /// contains valid UTF-8. Note that `string` may contain less than
810 /// `bsl::strlen(string)` Unicode code points.
811 static IntPtr numCodePointsRaw(const char *string);
812
813 /// Return the number of Unicode code points in the specified `string`
814 /// having the specified `length` (in bytes). `string` need not be
815 /// null-terminated and can contain embedded null bytes, and `string`
816 /// may be null if `0 == length` (see {Empty Input Strings}). The
817 /// behavior is undefined unless `string` contains valid UTF-8. Note
818 /// that `string` may contain less than `length` Unicode code points.
819 static IntPtr numCodePointsRaw(const char *string, size_type length);
820
821 /// Return the number of Unicode code points in the specified `string`.
822 /// `string` need not be null-terminated and can contain embedded null
823 /// bytes. The behavior is undefined unless `string` contains valid
824 /// UTF-8.
825 static IntPtr numCodePointsRaw(const bsl::string_view& string);
826
827 /// Read from the specified `input` and copy *valid* UTF-8 (only) to the
828 /// specified `outputBuffer` having the specified `outputBufferLength`
829 /// (in bytes). Load the specified `status` with:
830 /// * 0 if `input` reached `eof` without encountering any invalid UTF-8
831 /// or prematurely exhausting `outputBuffer`.
832 /// * A positive value if `input` was not completely read due to
833 /// `outputBuffer` being filled (or nearly filled) without
834 /// encountering any invalid UTF-8.
835 /// * A negative value from `ErrorStatus` if invalid UTF-8 was
836 /// encountered (without having written the invalid sequence to
837 /// `outputBuffer`).
838 /// Return the number of bytes of valid UTF-8 written to `outputBuffer`.
839 /// If no invalid UTF-8 is encountered, or if `input` supports
840 /// `sputbackc` with a putback buffer capacity of at least 4 bytes,
841 /// `input` will be left positioned at the end of the valid UTF-8 read,
842 /// otherwise, `input` will be left in an unspecified state. The
843 /// behavior is undefined unless `4 <= outputBufferLength`. Note that
844 /// this function will stop reading `input` when less than 4 bytes of
845 /// space remain in `outputBuffer` to prevent the possibility of a
846 /// 4-byte UTF-8 sequence being truncated partway through.
847 static size_type readIfValid(int *status,
848 char *outputBuffer,
849 size_type outputBufferLength,
850 bsl::streambuf *input);
851
852 /// Return the non-modifiable string representation of the `ErrorStatus`
853 /// enumerator matching the specified `value`, if it exists, and "(*
854 /// unrecognized value *)" otherwise. The string representation of an
855 /// enumerator that matches `value` is the enumerator name with the "k_"
856 /// prefix elided. Note that this method may be used to aid in
857 /// interpreting status values that are returned from some methods in
858 /// this utility. See `ErrorStatus`.
859 static const char *toAscii(IntPtr value);
860};
861
862 // =======================
863 // struct Utf8Util_ImpUtil
864 // =======================
865
866/// [**PRIVATE**] This struct provides a namespace for static methods used to
867/// implement `Utf8Util`. Note that the functions are not typically useful
868/// for clients, and are primarily exposed to allow for more thorough
869/// testing.
871
872 // TYPES
874
875 // CLASS METHODS
876
877 /// For the specified `byteOffset` in the specified `input`, load the
878 /// byte offset's line number into the specified `lineNumber`, the
879 /// column number into the specified `utf8Column`, and the byte offset
880 /// for the start of the line into the specified
881 /// `startOfLineByteOffset`, using the specified `lineDelimeter` as the
882 /// line separator, and using the specified `temporaryReadBuffer` (of
883 /// the specified length `temporaryReadBufferNumBytes`) as a temporary
884 /// buffer for reading. Return 0 on success, or a non-zero value if
885 /// `byteOffset` cannot be found in `input` or if `input` contains
886 /// non-UTF-8 characters. The `utf8Column` is the number of UTF-8 code
887 /// points between `startOfLineByteOffset` and `byteOffset`. The
888 /// behavior is undefined unless `temporaryReadBuffer` refers to a valid
889 /// buffer of at least `temporaryReadBufferNumBytes` bytes, and
890 /// `temporaryReadBufferNumBytes` is greater than or equal to 4.
892 Uint64 *lineNumber,
893 Uint64 *utf8Column,
894 Uint64 *startOfLineByteOffset,
895 bsl::streambuf *input,
896 Uint64 byteOffset,
897 char lineDelimeter,
898 char *temporaryReadBuffer,
899 int temporaryReadBufferNumBytes);
900};
901
902// ============================================================================
903// INLINE DEFINITIONS
904// ============================================================================
905
906 // ---------------
907 // struct Utf8Util
908 // ---------------
909
910// CLASS METHODS
911inline
913 int *status,
914 const char **result,
915 const bsl::string_view& string,
916 IntPtr numCodePoints)
917{
918
919 return advanceIfValid(status,
920 result,
921 string.data(),
922 string.length(),
923 numCodePoints);
924}
925
926inline
928 const bsl::string_view& string,
929 IntPtr numCodePoints)
930{
931 return advanceRaw(result, string.data(), string.length(), numCodePoints);
932}
933
934inline
936 unsigned int codePoint)
937{
938 return appendUtf8CodePoint(output, codePoint);
939}
940
941inline
942int Utf8Util::getByteSize(const char *codePoint)  // Thin wrapper: yields the result of `numBytesInCodePoint` for the specified `codePoint`.
943{
944 return numBytesInCodePoint(codePoint);  // pure forwarding; no checks are performed in this wrapper
945}
946
947inline
949 Uint64 *utf8Column,
950 Uint64 *startOfLineByteOffset,
951 bsl::streambuf *input,
952 Uint64 byteOffset)
953{
954 return getLineAndColumnNumber(lineNumber,
955 utf8Column,
956 startOfLineByteOffset,
957 input,
958 byteOffset,
959 '\n');
960}
961
962inline
964 Uint64 *utf8Column,
965 Uint64 *startOfLineByteOffset,
966 bsl::streambuf *input,
967 Uint64 byteOffset,
968 char lineDelimeter)
969{
970 enum { k_BUFFER_SIZE = 2048 };
971 char buffer[k_BUFFER_SIZE];
973 utf8Column,
974 startOfLineByteOffset,
975 input,
976 byteOffset,
977 lineDelimeter,
978 buffer,
979 k_BUFFER_SIZE);
980}
981
982inline
983bool Utf8Util::isValid(const char *string)  // Convenience overload: reports validity only, without exposing where validation stopped.
984{
985 BSLS_ASSERT(string);  // a null `string` is rejected by this assertion
986
987 const char *dummy = 0;  // placeholder for the `invalidString` out-parameter; its value is never read here
988 return isValid(&dummy, string);  // delegate to the overload that takes `invalidString`
989}
990
991inline
992bool Utf8Util::isValid(const char *string, size_type length)  // Convenience overload for a (pointer, byte-length) pair: reports validity only.
993{
994 BSLS_ASSERT(string || 0 == length);  // a null `string` is accepted only when `length` is 0
995
996 const char *dummy = 0;  // placeholder for the `invalidString` out-parameter; its value is never read here
997 return isValid(&dummy, string, length);  // delegate to the overload that takes `invalidString`
998}
999
1000inline
1002{
1003 const char *dummy = 0;
1004 return isValid(&dummy, string);
1005}
1006
1007inline
1009 const bsl::string_view& string,
1010 IntPtr numCodePoints)
1011{
1012 return numBytesRaw(string, numCodePoints);
1013}
1014
1015inline
1017{
1018 return numCodePointsRaw(string);
1019}
1020
1021inline
1023{
1024 return numCodePointsRaw(string, length);
1025}
1026
1027inline
1029 const char *string)
1030{
1031 return numCodePointsIfValid(invalidString, string);
1032}
1033
1034inline
1036 const char *string,
1037 size_type length)
1038{
1039 return numCodePointsIfValid(invalidString, string, length);
1040}
1041
1042inline
1044{
1045 return numCodePointsRaw(string);
1046}
1047
1048inline
1050 size_type length)
1051{
1052 return numCodePointsRaw(string, length);
1053}
1054
1055inline
1057{
1058 return numCodePointsRaw(string.data(), string.length());
1059}
1060
1061} // close package namespace
1062
1063
1064#endif
1065
1066// ----------------------------------------------------------------------------
1067// Copyright 2015 Bloomberg Finance L.P.
1068//
1069// Licensed under the Apache License, Version 2.0 (the "License");
1070// you may not use this file except in compliance with the License.
1071// You may obtain a copy of the License at
1072//
1073// http://www.apache.org/licenses/LICENSE-2.0
1074//
1075// Unless required by applicable law or agreed to in writing, software
1076// distributed under the License is distributed on an "AS IS" BASIS,
1077// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1078// See the License for the specific language governing permissions and
1079// limitations under the License.
1080// ----------------------------- END-OF-FILE ----------------------------------
1081
1082/** @} */
1083/** @} */
1084/** @} */
Definition bslstl_stringview.h:441
Definition bslstl_string.h:1281
#define BSLS_ASSERT(X)
Definition bsls_assert.h:1804
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdlde_base64alphabet.h:118
Definition bdlde_utf8util.h:870
static int getLineAndColumnNumber(Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset, char lineDelimeter, char *temporaryReadBuffer, int temporaryReadBufferNumBytes)
bsls::Types::Uint64 Uint64
Definition bdlde_utf8util.h:873
Definition bdlde_utf8util.h:404
static size_type readIfValid(int *status, char *outputBuffer, size_type outputBufferLength, bsl::streambuf *input)
static IntPtr numCodePointsIfValid(const char **invalidString, const char *string)
static IntPtr advanceRaw(const char **result, const char *string, size_type length, IntPtr numCodePoints)
bsls::Types::Uint64 Uint64
Definition bdlde_utf8util.h:409
static const char * toAscii(IntPtr value)
static IntPtr advanceIfValid(int *status, const char **result, const char *string, size_type length, IntPtr numCodePoints)
static IntPtr numCharactersRaw(const char *string)
Definition bdlde_utf8util.h:1043
static int numBytesInCodePoint(const char *codePoint)
static bool isValid(const char *string)
Definition bdlde_utf8util.h:983
static int codePointValue(const char *codePoint)
static int appendUtf8CodePoint(bsl::string *output, unsigned int codePoint)
static IntPtr numCodePointsRaw(const char *string, size_type length)
static IntPtr numCharactersIfValid(const char **invalidString, const char *string)
Definition bdlde_utf8util.h:1028
static IntPtr advanceRaw(const char **result, const char *string, IntPtr numCodePoints)
static bool isValid(const char **invalidString, const char *string, size_type length)
static IntPtr numCodePointsRaw(const char *string)
bsls::Types::IntPtr IntPtr
Definition bdlde_utf8util.h:408
static bool isValid(const char **invalidString, const char *string)
static bool isValidCodePoint(int *status, const char *codePoint, size_type numBytes)
static int appendUtf8Character(bsl::string *output, unsigned int codePoint)
Definition bdlde_utf8util.h:935
static IntPtr numBytesIfValid(const bsl::string_view &string, IntPtr numCodePoints)
Definition bdlde_utf8util.h:1008
static bool isValid(const char **invalidString, const bsl::string_view &string)
static IntPtr numBytesRaw(const bsl::string_view &string, IntPtr numCodePoints)
static IntPtr numCodePointsIfValid(const char **invalidString, const bsl::string_view &string)
static int getLineAndColumnNumber(Uint64 *lineNumber, Uint64 *utf8Column, Uint64 *startOfLineByteOffset, bsl::streambuf *input, Uint64 byteOffset)
Definition bdlde_utf8util.h:948
static IntPtr advanceIfValid(int *status, const char **result, const char *string, IntPtr numCodePoints)
static int getByteSize(const char *codePoint)
Definition bdlde_utf8util.h:942
static IntPtr numCodePointsIfValid(const char **invalidString, const char *string, size_type length)
static int appendUtf8CodePoint(std::string *output, unsigned int codePoint)
ErrorStatus
Definition bdlde_utf8util.h:417
@ k_UNEXPECTED_CONTINUATION_OCTET
Definition bdlde_utf8util.h:423
@ k_NON_CONTINUATION_OCTET
Definition bdlde_utf8util.h:427
@ k_OVERLONG_ENCODING
Definition bdlde_utf8util.h:431
@ k_SURROGATE
Definition bdlde_utf8util.h:442
@ k_VALUE_LARGER_THAN_0X10FFFF
Definition bdlde_utf8util.h:439
@ k_END_OF_INPUT_TRUNCATION
Definition bdlde_utf8util.h:419
@ k_INVALID_INITIAL_OCTET
Definition bdlde_utf8util.h:435
static IntPtr numCharacters(const char *string)
Definition bdlde_utf8util.h:1016
bsls::Types::size_type size_type
Definition bdlde_utf8util.h:407
std::size_t size_type
Definition bsls_types.h:124
unsigned long long Uint64
Definition bsls_types.h:137
std::ptrdiff_t IntPtr
Definition bsls_types.h:130