BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlde_charconvertutf32.h
Go to the documentation of this file.
1/// @file bdlde_charconvertutf32.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlde_charconvertutf32.h -*-C++-*-
8#ifndef INCLUDED_BDLDE_CHARCONVERTUTF32
9#define INCLUDED_BDLDE_CHARCONVERTUTF32
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdlde_charconvertutf32 bdlde_charconvertutf32
15/// @brief Provide fast, safe conversion between UTF-8 encoding and UTF-32.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdlde
19/// @{
20/// @addtogroup bdlde_charconvertutf32
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdlde_charconvertutf32-purpose"> Purpose</a>
25/// * <a href="#bdlde_charconvertutf32-classes"> Classes </a>
26/// * <a href="#bdlde_charconvertutf32-description"> Description </a>
27/// * <a href="#bdlde_charconvertutf32-history-and-motivation"> History and Motivation </a>
28/// * <a href="#bdlde_charconvertutf32-usage"> Usage </a>
29/// * <a href="#bdlde_charconvertutf32-example-1-round-trip-multi-lingual-conversion"> Example 1: Round-Trip Multi-Lingual Conversion </a>
30///
31/// # Purpose {#bdlde_charconvertutf32-purpose}
32/// Provide fast, safe conversion between UTF-8 encoding and UTF-32.
33///
34/// # Classes {#bdlde_charconvertutf32-classes}
35///
36/// - bdlde::CharConvertUtf32: namespace for conversion between UTF-8 and UTF-32
37///
38/// # Description {#bdlde_charconvertutf32-description}
39/// This component provides a `struct`, `bdlde::CharConvertUtf32`,
40/// that provides a suite of static functions supporting the *fast* conversion
41/// of UTF-8 data to UTF-32, and vice versa. UTF-8 input can take the form of
42/// null-terminated "C" strings or `bsl::string_view`s, while UTF-32 input can
43/// be a null-terminated or explicit-length buffer of `unsigned int`. Output can
44/// be to STL vectors, `bsl::string`s (in the case of UTF-8), and fixed-length
45/// buffers. Invalid byte sequences and code points forbidden by either
46/// encoding are removed and (optionally) replaced by an error byte or word
47/// provided by the caller. The byte order of the UTF-32 input or output can be
48/// specified via the optional `byteOrder` argument, which is assumed to be host
49/// byte order if not specified. The byte or word count and code point count
50/// that are optionally returned through pointer arguments include the
51/// terminating null byte or word.
52///
53/// ## History and Motivation {#bdlde_charconvertutf32-history-and-motivation}
54///
55///
56/// UTF-8 is a Unicode encoding that allows 32-bit Unicode to be represented
57/// using null-terminated (8-bit) byte strings, while allowing "standard ASCII"
58/// strings to be used "as-is". Note that UTF-8 is described in detail in RFC
59/// 3629 (http://www.ietf.org/rfc/rfc3629.txt).
60///
61/// UTF-32 is simply a name for storing raw Unicode values as sequential
62/// `unsigned int` values in memory.
63///
64/// Valid Unicode values are in the ranges `[ 1 .. 0xd7ff ]` and
65/// `[ 0xe000 .. 0x10ffff ]`. The value `0` is used to terminate sequences.
66///
67/// The functions here that translate to fixed buffers make a single pass
68/// through the data. The functions that translate to `bsl::string`s and
69/// `bsl::vector`s, however, like the `glib` conversion routines, make two
70/// passes: a size estimation pass, after which the output container is sized
71/// appropriately, and then the translation pass.
72///
73/// The methods that output to a `vector` or `string` will all grow the output
74/// object as necessary to fit the data, and in the end will exactly resize the
75/// object to the output (including the terminating 0 for `vector`, not
76/// including it for `string`). The resizing will not affect the capacity.
77///
78/// Non-minimal UTF-8 encodings of code points are reported as errors. Octets
79/// and post-conversion code points in the forbidden ranges are treated as
80/// errors and removed if 0 is specified as `errorWord`, or replaced with
81/// `errorWord` otherwise.
82///
83/// ## Usage {#bdlde_charconvertutf32-usage}
84///
85///
86/// This section illustrates intended use of this component.
87///
88/// ### Example 1: Round-Trip Multi-Lingual Conversion {#bdlde_charconvertutf32-example-1-round-trip-multi-lingual-conversion}
89///
90///
91/// The following snippets of code illustrate a typical use of the
92/// `bdlde::CharConvertUtf32` struct's utility functions, first converting from
93/// UTF-8 to UTF-32, and then converting back to make sure the round trip
94/// returns the same value.
95///
96/// First, we declare a string of UTF-8 containing single-, double-, triple-,
97/// and quadruple-octet code points:
98/// @code
99/// const char utf8MultiLang[] = {
100/// "Hello" // -- ASCII
101/// "\xce\x97" "\xce\x95" "\xce\xbb" // -- Greek
102/// "\xe4\xb8\xad" "\xe5\x8d\x8e" // -- Chinese
103/// "\xe0\xa4\xad" "\xe0\xa4\xbe" // -- Hindi
104/// "\xf2\x94\xb4\xa5" "\xf3\xb8\xac\x83" }; // -- Quad octets
105/// @endcode
106/// Then, we declare an `enum` summarizing the counts of code points in the
107/// string and verify that the counts add up to the length of the string:
108/// @code
109/// enum { NUM_ASCII_CODE_POINTS = 5,
110/// NUM_GREEK_CODE_POINTS = 3,
111/// NUM_CHINESE_CODE_POINTS = 2,
112/// NUM_HINDI_CODE_POINTS = 2,
113/// NUM_QUAD_CODE_POINTS = 2 };
114///
115/// assert(1 * NUM_ASCII_CODE_POINTS +
116/// 2 * NUM_GREEK_CODE_POINTS +
117/// 3 * NUM_CHINESE_CODE_POINTS +
118/// 3 * NUM_HINDI_CODE_POINTS +
119/// 4 * NUM_QUAD_CODE_POINTS == bsl::strlen(utf8MultiLang));
120/// @endcode
121/// Next, we declare the vector where our UTF-32 output will go. Note that it
122/// is not necessary to also create a `utf32CodePointsWritten` variable to
123/// receive the number of code points written, since that number will be the
124/// size of the vector when we are done.
125/// @code
126/// bsl::vector<unsigned int> v32;
127/// @endcode
128/// Note that it is a waste of time to `v32.reserve(sizeof(utf8MultiLang))`; it
129/// is entirely redundant -- `v32` will automatically be grown to the correct
130/// size. Also note that if `v32` were not empty, that would not be a problem
131/// -- any contents will be discarded.
132///
133/// Then, we do the translation to `UTF-32`:
134/// @code
135/// int retVal = bdlde::CharConvertUtf32::utf8ToUtf32(&v32,
136/// utf8MultiLang);
137///
138/// assert(0 == retVal); // verify success
139/// assert(0 == v32.back()); // verify null terminated
140/// @endcode
141/// Next, we verify that the number of code points that was returned is correct.
142/// Note that in UTF-32, the number of Unicode code points written is the same
143/// as the number of 32-bit words written:
144/// @code
145/// enum { EXPECTED_CODE_POINTS_WRITTEN =
146/// NUM_ASCII_CODE_POINTS +
147/// NUM_GREEK_CODE_POINTS +
148/// NUM_CHINESE_CODE_POINTS +
149/// NUM_HINDI_CODE_POINTS +
150/// NUM_QUAD_CODE_POINTS + 1 };
151/// assert(EXPECTED_CODE_POINTS_WRITTEN == v32.size());
152/// @endcode
153/// Next, we calculate and confirm the difference between the number of UTF-32
154/// words output and the number of bytes input. The ASCII bytes will take 1
155/// 32-bit word apiece, the Greek code points are double octets that will become
156/// single `unsigned int` values, the Chinese code points are encoded as UTF-8
157/// triple octets that will turn into single 32-bit words, the same for the
158/// Hindi code points, and the quad code points are quadruple octets that will
159/// turn into single `unsigned int` words:
160/// @code
161/// enum { SHRINKAGE =
162/// NUM_ASCII_CODE_POINTS * (1-1) +
163/// NUM_GREEK_CODE_POINTS * (2-1) +
164/// NUM_CHINESE_CODE_POINTS * (3-1) +
165/// NUM_HINDI_CODE_POINTS * (3-1) +
166/// NUM_QUAD_CODE_POINTS * (4-1) };
167///
168/// assert(v32.size() == sizeof(utf8MultiLang) - SHRINKAGE);
169/// @endcode
170/// Then, we go on to do the reverse `utf32ToUtf8` transform to turn it back
171/// into UTF-8, and we should get a result identical to our original input.
172/// Declare a `bsl::string` for our output, and a variable to count the number
173/// of code points translated:
174/// @code
175/// bsl::string s;
176/// bsl::size_t codePointsWritten;
177/// @endcode
178/// Again, note that it would be a waste of time for the caller to `resize` or
179/// `reserve` `s`; it will be automatically `resize`d by the translator to the
180/// right length.
181///
182/// Now, we do the reverse transform:
183/// @code
184/// retVal = bdlde::CharConvertUtf32::utf32ToUtf8(&s,
185/// v32.begin(),
186/// &codePointsWritten);
187/// @endcode
188/// Finally, we verify that a successful status was returned, that the output of
189/// the reverse transform was identical to the original input, and that the
190/// number of code points translated was as expected:
191/// @code
192/// assert(0 == retVal);
193/// assert(utf8MultiLang == s);
194/// assert(s.length() + 1 == sizeof(utf8MultiLang));
195///
196/// assert(EXPECTED_CODE_POINTS_WRITTEN == codePointsWritten);
197/// assert(v32.size() == codePointsWritten);
198/// @endcode
199/// @}
200/** @} */
201/** @} */
202
203/** @addtogroup bdl
204 * @{
205 */
206/** @addtogroup bdlde
207 * @{
208 */
209/** @addtogroup bdlde_charconvertutf32
210 * @{
211 */
212
213#include <bdlscm_version.h>
214
215#include <bdlde_byteorder.h>
217
218#include <bsl_cstddef.h> // 'bsl::size_t'
219#include <bsl_string.h>
220#include <bsl_string_view.h>
221#include <bsl_vector.h>
222
223#include <bsls_libraryfeatures.h>
224
225#include <string> // 'std::string', 'std::pmr::string'
226#include <vector> // 'std::vector', 'std::pmr::vector'
227
228
229
230namespace bdlde {
231 // =======================
232 // struct CharConvertUtf32
233 // =======================
234
235/// This `struct` provides a namespace for a suite of static functions to
236/// convert buffers between UTF-8 and UTF-32. Byte Order Mark (BOM) code
237/// points are neither generated nor recognized as special, and thus may be
238/// incorrect for the actual byte order of output. If a BOM is present in
239/// the input, it will be translated, whether correct (`0xfeff`) or
240/// incorrect (`0xfffe`), into the output without any special handling.
242 // CLASS METHODS
243
244 // UTF-8 to UTF-32 Methods
245
247 const char *srcString,
248 unsigned int errorWord = '?',
249 ByteOrder::Enum byteOrder =
251 /// Load into the specified `dstVector` the result of converting the
252 /// specified UTF-8 `srcString` to its UTF-32 equivalent. Optionally
253 /// specify `errorWord` to be substituted, if not 0, for invalid
254 /// encodings in the input string. Optionally specify `byteOrder` to
255 /// indicate the byte order of the UTF-32 output; if `byteOrder` is not
256 /// specified, the output is assumed to be in host byte order. Return 0
257 /// on success and `CharConvertStatus::k_INVALID_INPUT_BIT` otherwise.
258 /// Invalid encodings are multi-byte encoding parts out of sequence,
259 /// non-minimal UTF-8 encodings, UTF-8 encodings more than four bytes in
260 /// length, or code points outside the ranges that UTF-32 can validly
261 /// encode (i.e., `[ 1 .. 0xd7ff ]` and `[ 0xe000 .. 0x10ffff ]`). If
262 /// `errorWord` is 0, invalid input sequences are ignored (i.e., produce
263 /// no corresponding output). Any previous contents of the destination
264 /// are discarded. The behavior is undefined unless `srcString` is
265 /// null-terminated when specified as a `const char *`, and unless
266 /// `errorWord` is either 0 or a valid Unicode code point. Note that
267 /// one code point always occupies one 32-bit *word* of output; there is
268 /// no `numCodePointsWritten` argument since, after the call,
269 /// `dstVector->size()` will equal the number of code points written.
270 /// Also note that when the input is a `bsl::string_view`, it may
271 /// contain embedded nulls, which are translated to zeroes in the
272 /// output. Also note that `errorWord` is assumed to be in host byte
273 /// order.
274 static int utf8ToUtf32(std::vector<unsigned int> *dstVector,
275 const char *srcString,
276 unsigned int errorWord = '?',
277 ByteOrder::Enum byteOrder =
279#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
280 static int utf8ToUtf32(
281 std::pmr::vector<unsigned int> *dstVector,
282 const char *srcString,
283 unsigned int errorWord = '?',
285#endif
287 const bsl::string_view& srcString,
288 unsigned int errorWord = '?',
289 ByteOrder::Enum byteOrder =
291 static int utf8ToUtf32(std::vector<unsigned int> *dstVector,
292 const bsl::string_view& srcString,
293 unsigned int errorWord = '?',
294 ByteOrder::Enum byteOrder =
296#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
297 static int utf8ToUtf32(
298 std::pmr::vector<unsigned int> *dstVector,
299 const bsl::string_view& srcString,
300 unsigned int errorWord = '?',
302#endif
303
304 static int utf8ToUtf32(
305 unsigned int *dstBuffer,
306 bsl::size_t dstCapacity,
307 const char *srcString,
308 bsl::size_t *numCodePointsWritten = 0,
309 unsigned int errorWord = '?',
310 ByteOrder::Enum byteOrder =
312 /// Load into the specified `dstBuffer` of the specified `dstCapacity`,
313 /// the result of converting the specified UTF-8 `srcString` to its
314 /// UTF-32 equivalent. Optionally specify `numCodePointsWritten`, which
315 /// (if not 0) indicates the location of the variable into which the
316 /// number of Unicode code points (including the null terminator)
317 /// written is to be loaded. Optionally specify `errorWord` to be
318 /// substituted (if not 0) for invalid encodings in the input string.
319 /// Invalid encodings are multi-byte encoding parts out of sequence,
320 /// non-minimal UTF-8 encodings, UTF-8 encodings more than four bytes in
321 /// length, or code points outside the ranges that UTF-32 can validly
322 /// encode (i.e., `[ 1 .. 0xd7ff ]` and `[ 0xe000 .. 0x10ffff ]`). If
323 /// `errorWord` is 0, invalid input code points are ignored (i.e.,
324 /// produce no corresponding output). Optionally specify `byteOrder` to
325 /// indicate the byte order of the UTF-32 output; if `byteOrder` is not
326 /// specified, the output is assumed to be in host byte order. Return 0
327 /// on success and a bit-wise OR of the masks defined by
328 /// `CharConvertStatus::Enum` otherwise, where
329 /// `CharConvertStatus::k_INVALID_INPUT_BIT` will be set if one or more
330 /// invalid sequences were encountered in the input, and
331 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` will be set if the
332 /// output space was exhausted before conversion was complete. If
333 /// `dstCapacity > 0` yet `dstCapacity` specifies a buffer too small to
334 /// hold the output, the maximal null-terminated prefix of the properly
335 /// converted result string is loaded into `dstBuffer`. The behavior is
336 /// undefined unless `dstBuffer` refers to an array of at least
337 /// `dstCapacity` elements, `srcString`, if specified as a
338 /// `const char *`, is null-terminated, and `errorWord` is either 0 or a
339 /// valid UTF-32 code point (in the range `[ 1 .. 0xd7ff ]` or
340 /// `[ 0xe000 .. 0x10ffff ]`). Note that if `dstCapacity` is 0,
341 /// `*dstBuffer` is not modified and this function returns a value with
342 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` set and 0 is written
343 /// into `*numCodePointsWritten` (if that pointer is not 0), since there
344 /// is insufficient space for even a null terminator alone. Also note
345 /// that one Unicode code point always occupies one 32-bit *word* in
346 /// UTF-32, but may occupy more than one *byte* of UTF-8, so that
347 /// `*numCodePointsWritten` equals the number of *words* written. Also
348 /// note that `errorWord` is assumed to be in host byte order.
349 static int utf8ToUtf32(
350 unsigned int *dstBuffer,
351 bsl::size_t dstCapacity,
352 const bsl::string_view& srcString,
353 bsl::size_t *numCodePointsWritten = 0,
354 unsigned int errorWord = '?',
356
357 // UTF-32 to UTF-8 Methods
358
359 static int utf32ToUtf8(bsl::string *dstString,
360 const unsigned int *srcString,
361 bsl::size_t *numCodePointsWritten = 0,
362 unsigned char errorByte = '?',
363 ByteOrder::Enum byteOrder =
365 /// Load into the specified `dstString` the result of converting the
366 /// specified `srcString` of `UTF-32` values to `UTF-8` and return 0 on
367 /// success or `CharConvertStatus::k_INVALID_INPUT_BIT` if invalid
368 /// `UTF-32` values (in the range `[0xD800 .. 0xDFFF]` or above
369 /// 0x10FFFF) are encountered. Optionally specify `srcStringLength` as
370 /// the number of `UTF-32` values to be converted. If `srcStringLength`
371 /// is specified, convert that many UTF-32 values from `srcString`
372 /// (including zero values), otherwise convert values up to but not
373 /// including a terminating zero value. Optionally specify
374 /// `numCodePointsWritten` to receive the number of `UTF-8` code points
375 /// written to `dstString`, including the null-terminator. Optionally
376 /// specify `errorByte` as the character to be written to `dstString` as
377 /// the translation of invalid `UTF-32` values; if not specified, `?` is
378 /// used, and if given as 0, no character is written at all. Optionally
379 /// specify `byteOrder` to determine how `UTF-32` values in `srcString`
380 /// are interpreted; if not given, host byte order is used. The
381 /// behavior is undefined if `errorByte` is 0x80 or above. Note that if
382 /// you are passing the `bsl::vector<unsigned int>` obtained from a call
383 /// to `utf8ToUtf32` and using `srcStringLength`, you must take care to
384 /// pass `vector.size() - 1` to `srcStringLength` to avoid embedding the
385 /// terminating 0.
386 static int utf32ToUtf8(std::string *dstString,
387 const unsigned int *srcString,
388 bsl::size_t *numCodePointsWritten = 0,
389 unsigned char errorByte = '?',
390 ByteOrder::Enum byteOrder =
392#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
393 static int utf32ToUtf8(std::pmr::string *dstString,
394 const unsigned int *srcString,
395 bsl::size_t *numCodePointsWritten = 0,
396 unsigned char errorByte = '?',
398#endif
399 static int utf32ToUtf8(bsl::string *dstString,
400 const unsigned int *srcString,
401 bsl::size_t srcStringLength,
402 bsl::size_t *numCodePointsWritten = 0,
403 unsigned char errorByte = '?',
404 ByteOrder::Enum byteOrder =
406 static int utf32ToUtf8(std::string *dstString,
407 const unsigned int *srcString,
408 bsl::size_t srcStringLength,
409 bsl::size_t *numCodePointsWritten = 0,
410 unsigned char errorByte = '?',
411 ByteOrder::Enum byteOrder =
413#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
414 static int utf32ToUtf8(std::pmr::string *dstString,
415 const unsigned int *srcString,
416 bsl::size_t srcStringLength,
417 bsl::size_t *numCodePointsWritten = 0,
418 unsigned char errorByte = '?',
420#endif
421
422 static int utf32ToUtf8(bsl::vector<char> *dstVector,
423 const unsigned int *srcString,
424 bsl::size_t *numCodePointsWritten = 0,
425 unsigned char errorByte = '?',
426 ByteOrder::Enum byteOrder =
428 /// Load into the specified `dstVector` the result of converting the
429 /// specified `srcString` of `UTF-32` values to `UTF-8`, always followed by
430 /// a null character, and return 0 on success or
431 /// `CharConvertStatus::k_INVALID_INPUT_BIT` if invalid `UTF-32` values (in
432 /// the range `[0xD800 .. 0xDFFF]` or above 0x10FFFF) are seen. Optionally
433 /// specify `srcStringLength` as the number of `UTF-32` values to be
434 /// converted. If `srcStringLength` is specified, convert that many UTF-32
435 /// values from `srcString` (including zero values), otherwise convert
436 /// values up to but not including a terminating zero value. Optionally
437 /// specify `numCodePointsWritten` to receive the number of `UTF-8` code
438 /// points written to `dstVector`. Optionally specify `errorByte` as the
439 /// character to be written to `dstVector` as the translation of invalid
440 /// `UTF-32` values; if not specified, `?` is used, and if given as 0, no
441 /// character is written at all. Optionally specify `byteOrder` to
442 /// determine how `UTF-32` values in `srcString` are interpreted; if not
443 /// given, host byte order is used. The behavior is undefined if
444 /// `errorByte` is 0x80 or above. Note that if you are passing the
445 /// `bsl::vector<unsigned int>` obtained from a call to `utf8ToUtf32` and
446 /// using `srcStringLength`, you must take care to pass `vector.size() - 1`
447 /// to `srcStringLength` to avoid embedding the terminating 0.
448 static int utf32ToUtf8(std::vector<char> *dstVector,
449 const unsigned int *srcString,
450 bsl::size_t *numCodePointsWritten = 0,
451 unsigned char errorByte = '?',
452 ByteOrder::Enum byteOrder =
454#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
455 static int utf32ToUtf8(
456 std::pmr::vector<char> *dstVector,
457 const unsigned int *srcString,
458 bsl::size_t *numCodePointsWritten = 0,
459 unsigned char errorByte = '?',
461#endif
462 static int utf32ToUtf8(bsl::vector<char> *dstVector,
463 const unsigned int *srcString,
464 bsl::size_t srcStringLength,
465 bsl::size_t *numCodePointsWritten = 0,
466 unsigned char errorByte = '?',
467 ByteOrder::Enum byteOrder =
469 static int utf32ToUtf8(std::vector<char> *dstVector,
470 const unsigned int *srcString,
471 bsl::size_t srcStringLength,
472 bsl::size_t *numCodePointsWritten = 0,
473 unsigned char errorByte = '?',
474 ByteOrder::Enum byteOrder =
476#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
477 static int utf32ToUtf8(
478 std::pmr::vector<char> *dstVector,
479 const unsigned int *srcString,
480 bsl::size_t srcStringLength,
481 bsl::size_t *numCodePointsWritten = 0,
482 unsigned char errorByte = '?',
484#endif
485
486 static int utf32ToUtf8(char *dstBuffer,
487 bsl::size_t dstCapacity,
488 const unsigned int *srcString,
489 bsl::size_t *numCodePointsWritten = 0,
490 bsl::size_t *numBytesWritten = 0,
491 unsigned char errorByte = '?',
492 ByteOrder::Enum byteOrder =
494 /// Unless `dstCapacity == 0`, load into the specified `dstBuffer` all
495 /// or as many complete `UTF-8` sequences converted from the specified
496 /// `srcString` of UTF-32 as will fit, along with an always-present
497 /// terminating null byte, into the specified `dstCapacity` bytes, and
498 /// return 0 on success or a bit-wise OR of
499 /// `CharConvertStatus::k_INVALID_INPUT_BIT` if invalid `UTF-32` values
500 /// (in the range `[0xD800 .. 0xDFFF]` or above 0x10FFFF) are seen and
501 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` if there is insufficient
502 /// room for the entire result to be written. If `dstCapacity == 0`
503 /// return `CharConvertStatus::k_OUT_OF_SPACE_BIT` without
504 /// modifying `dstBuffer`. Optionally specify `srcStringLength` as the
505 /// number of `UTF-32` values to be converted. If `srcStringLength` is
506 /// specified, convert that many UTF-32 values from `srcString`
507 /// (including zero values), otherwise convert values up to but not
508 /// including a terminating zero value. Optionally specify
509 /// `numCodePointsWritten` to receive the number of `UTF-8` code points
510 /// written to `dstBuffer`. Optionally specify `numBytesWritten` to
511 /// receive the number of bytes written to `dstBuffer`. Optionally
512 /// specify `errorByte` as the character to be written to `dstBuffer` as
513 /// the translation of invalid `UTF-32` values; if not specified, `?` is
514 /// used, and if given as 0, no character is written at all. Optionally
515 /// specify `byteOrder` to determine how `UTF-32` values in `srcString`
516 /// are interpreted; if not given, host byte order is used. The
517 /// behavior is undefined if `errorByte` is 0x80 or above. Note that if
518 /// you are passing the `bsl::vector<unsigned int>` obtained from a call
519 /// to `utf8ToUtf32` and using `srcStringLength`, you must take care to
520 /// pass `vector.size() - 1` to `srcStringLength` to avoid embedding the
521 /// terminating 0.
522 static int utf32ToUtf8(char *dstBuffer,
523 bsl::size_t dstCapacity,
524 const unsigned int *srcString,
525 bsl::size_t srcStringLength,
526 bsl::size_t *numCodePointsWritten = 0,
527 bsl::size_t *numBytesWritten = 0,
528 unsigned char errorByte = '?',
529 ByteOrder::Enum byteOrder =
531};
532
533} // close package namespace
534
535
536#endif
537
538// ----------------------------------------------------------------------------
539// Copyright 2015 Bloomberg Finance L.P.
540//
541// Licensed under the Apache License, Version 2.0 (the "License");
542// you may not use this file except in compliance with the License.
543// You may obtain a copy of the License at
544//
545// http://www.apache.org/licenses/LICENSE-2.0
546//
547// Unless required by applicable law or agreed to in writing, software
548// distributed under the License is distributed on an "AS IS" BASIS,
549// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
550// See the License for the specific language governing permissions and
551// limitations under the License.
552// ----------------------------- END-OF-FILE ----------------------------------
553
554/** @} */
555/** @} */
556/** @} */
Definition bslstl_stringview.h:441
Definition bslstl_string.h:1281
Definition bslstl_vector.h:1025
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdlde_base64alphabet.h:118
Enum
Definition bdlde_byteorder.h:134
@ e_HOST
Definition bdlde_byteorder.h:144
Definition bdlde_charconvertutf32.h:241
static int utf8ToUtf32(bsl::vector< unsigned int > *dstVector, const char *srcString, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(std::vector< char > *dstVector, const unsigned int *srcString, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(std::string *dstString, const unsigned int *srcString, bsl::size_t srcStringLength, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf32(bsl::vector< unsigned int > *dstVector, const bsl::string_view &srcString, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned int *srcString, bsl::size_t srcStringLength, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf32(std::vector< unsigned int > *dstVector, const bsl::string_view &srcString, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(bsl::vector< char > *dstVector, const unsigned int *srcString, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf32(unsigned int *dstBuffer, bsl::size_t dstCapacity, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf32(unsigned int *dstBuffer, bsl::size_t dstCapacity, const char *srcString, bsl::size_t *numCodePointsWritten=0, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(std::string *dstString, const unsigned int *srcString, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(bsl::string *dstString, const unsigned int *srcString, bsl::size_t srcStringLength, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(bsl::string *dstString, const unsigned int *srcString, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(std::vector< char > *dstVector, const unsigned int *srcString, bsl::size_t srcStringLength, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf32(std::vector< unsigned int > *dstVector, const char *srcString, unsigned int errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(bsl::vector< char > *dstVector, const unsigned int *srcString, bsl::size_t srcStringLength, bsl::size_t *numCodePointsWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf32ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned int *srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, unsigned char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)