BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlde_charconvertutf16.h
Go to the documentation of this file.
1/// @file bdlde_charconvertutf16.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlde_charconvertutf16.h -*-C++-*-
8#ifndef INCLUDED_BDLDE_CHARCONVERTUTF16
9#define INCLUDED_BDLDE_CHARCONVERTUTF16
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdlde_charconvertutf16 bdlde_charconvertutf16
15/// @brief Provide fast, safe conversion between UTF-8 and UTF-16 encodings.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdlde
19/// @{
20/// @addtogroup bdlde_charconvertutf16
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdlde_charconvertutf16-purpose"> Purpose</a>
25/// * <a href="#bdlde_charconvertutf16-classes"> Classes </a>
26/// * <a href="#bdlde_charconvertutf16-description"> Description </a>
27/// * <a href="#bdlde_charconvertutf16-history-and-motivation"> History and Motivation </a>
28/// * <a href="#bdlde_charconvertutf16-wstrings-and-utf-16"> WSTRINGS and UTF-16 </a>
29/// * <a href="#bdlde_charconvertutf16-usage"> Usage </a>
30/// * <a href="#bdlde_charconvertutf16-example-1-translation-to-fixed-length-buffers"> Example 1: Translation to Fixed-Length Buffers </a>
31/// * <a href="#bdlde_charconvertutf16-example-2-translation-to-stl-containers"> Example 2: Translation to STL Containers </a>
32///
33/// # Purpose {#bdlde_charconvertutf16-purpose}
34/// Provide fast, safe conversion between UTF-8 and UTF-16 encodings.
35///
36/// # Classes {#bdlde_charconvertutf16-classes}
37///
38/// - bdlde::CharConvertUtf16: namespace for conversions between UTF-8 and UTF-16
39///
40/// # Description {#bdlde_charconvertutf16-description}
41/// This component provides a suite of static functions supporting
42/// the *fast* conversion of *valid* UTF-8 encoded strings to *valid* UTF-16
43/// 16-bit word arrays, wstrings, and vectors, and conversion of *valid* UTF-16
44/// encoded word sequences to *valid* UTF-8 byte arrays, strings, and byte
45/// vectors. Invalid byte sequences and code points forbidden by either
46/// encoding are removed and (optionally) replaced by a single word or byte
47/// provided by the caller. In UTF-16 -> UTF-8 conversion, the replacement
48/// must be a single non-zero byte; in the other direction, it must be a single,
49/// non-zero word. The byte or word count and code point count that are
50/// optionally returned through pointer arguments include the terminating null
51/// code point in their count. The byte order of the UTF-16 input or output can
52/// be specified via the optional `byteOrder` argument, which is assumed to be
53/// host byte order if not specified. In functions taking UTF-8, input is in
54/// the form of a `bsl::string_view` or a null-terminated `const char *`. In
55/// functions taking UTF-16, input is either in the form of a
56/// `bslstl::StringRefWide` or a pointer to a null-terminated array of
57/// `unsigned short` or `wchar_t`.
58///
59/// ## History and Motivation {#bdlde_charconvertutf16-history-and-motivation}
60///
61///
62/// UTF-8 is an encoding that allows 32-bit character sets like Unicode
63/// to be represented using (8-bit) byte strings, while allowing "standard
64/// ASCII" strings to be used "as-is". Note that UTF-8 is described in detail
65/// in RFC 3629 (http://www.ietf.org/rfc/rfc3629.txt).
66///
67/// UTF-16 is a 16-bit encoding that allows Unicode code points up to 0x10ffff
68/// to be encoded using one or two 16-bit values. Note that UTF-16 is described
69/// in detail in RFC 2781 (http://www.ietf.org/rfc/rfc2781.txt).
70///
71/// The functions here that translate to fixed buffers make a single pass
72/// through the data. The functions that translate to `bsl::string`s and STL
73/// containers, however, like the `glib` conversion routines, make two passes: a
74/// size estimation pass, after which the output container is sized
75/// appropriately, and then the translation pass.
76///
77/// The methods that output to a `vector`, `string`, or `wstring` will all grow
78/// the output object as necessary to fit the data, and in the end will exactly
79/// resize the object to the output (including the terminating 0 for `vector`,
80/// which is not included for `string` or `wstring`). Note that in the case of
81/// `string` or `wstring`, the terminating 0 code point is still included in the
82/// code point count.
83///
84/// Non-minimal UTF-8 encodings of code points are reported as errors. Octets
85/// and post-conversion code points in the forbidden ranges are treated as
86/// errors and removed (or replaced, if a replacement word is provided).
87///
88/// ## WSTRINGS and UTF-16 {#bdlde_charconvertutf16-wstrings-and-utf-16}
89///
90///
91/// UTF-16 (or UTF-8, for that matter) can be stored in `wstring`s, but note
92/// that the size of a `wstring::value_type`, also known as a `wchar_t` word,
93/// varies across different platforms -- it is 4 bytes on Solaris, Linux, and
94/// Darwin, and 2 bytes on AIX and Windows. So a file of `wchar_t` words
95/// written by one platform may not be readable by another. Byte order is also
96/// a consideration, and a non-host byte order can be handled by using the
97/// optional `byteOrder` argument of these functions. Another factor is that,
98/// since UTF-16 words all fit in 2 bytes, using `wchar_t` to store UTF-16 is
99/// very wasteful of space on many platforms.
100///
101/// ## Usage {#bdlde_charconvertutf16-usage}
102///
103///
104/// This section illustrates intended use of this component.
105///
106/// ### Example 1: Translation to Fixed-Length Buffers {#bdlde_charconvertutf16-example-1-translation-to-fixed-length-buffers}
107///
108///
109/// In this example, we will translate a string containing a non-ASCII code
110/// point from UTF-16 to UTF-8 and back using fixed-length buffers.
111///
112/// First, we create a UTF-16 string spelling `ecole` in French, which begins
113/// with `0xc9`, a non-ASCII `e` with an accent over it:
114/// @code
115/// unsigned short utf16String[] = { 0xc9, 'c', 'o', 'l', 'e', 0 };
116/// @endcode
117/// Then, we create a byte buffer to store the UTF-8 result of the translation
118/// in, and variables to monitor counts of code points and bytes translated:
119/// @code
120/// char utf8String[7];
121/// bsl::size_t numCodePoints, numBytes;
122/// numCodePoints = numBytes = -1; // garbage
123/// @endcode
124/// Next, we call `utf16ToUtf8` to do the translation:
125/// @code
126/// int rc = bdlde::CharConvertUtf16::utf16ToUtf8(utf8String,
127/// sizeof(utf8String),
128/// utf16String,
129/// &numCodePoints,
130/// &numBytes);
131/// @endcode
132/// Then, we observe that no errors or warnings occurred, and that the numbers
133/// of code points and bytes were as expected. Note that both `numCodePoints`
134/// and `numBytes` include the terminating 0:
135/// @code
136/// assert(0 == rc);
137/// assert(6 == numCodePoints);
138/// assert(7 == numBytes);
139/// @endcode
140/// Next, we examine the length of the translated string:
141/// @code
142/// assert(numBytes - 1 == bsl::strlen(utf8String));
143/// @endcode
144/// Then, we examine the individual bytes of the translated UTF-8:
145/// @code
146/// assert((char)0xc3 == utf8String[0]);
147/// assert((char)0x89 == utf8String[1]);
148/// assert('c' == utf8String[2]);
149/// assert('o' == utf8String[3]);
150/// assert('l' == utf8String[4]);
151/// assert('e' == utf8String[5]);
152/// assert(0 == utf8String[6]);
153/// @endcode
154/// Next, in preparation for translation back to UTF-16, we create a buffer of
155/// `short` values and the variable `numWords` to track the number of UTF-16
156/// words occupied by the result:
157/// @code
158/// unsigned short secondUtf16String[6];
159/// bsl::size_t numWords;
160/// numCodePoints = numWords = -1; // garbage
161/// @endcode
162/// Then, we do the reverse translation:
163/// @code
164/// rc = bdlde::CharConvertUtf16::utf8ToUtf16(secondUtf16String,
165/// 6,
166/// utf8String,
167/// &numCodePoints,
168/// &numWords);
169/// @endcode
170/// Next, we observe that no errors or warnings were reported, and that the
171/// number of code points and words were as expected. Note that `numCodePoints`
172/// and `numWords` both include the terminating 0:
173/// @code
174/// assert(0 == rc);
175/// assert(6 == numCodePoints);
176/// assert(6 == numWords);
177/// @endcode
178/// Now, we observe that our output is identical to the original UTF-16 string:
179/// @code
180/// assert(0 == bsl::memcmp(utf16String,
181/// secondUtf16String,
182/// sizeof(utf16String)));
183/// @endcode
184/// Finally, we examine the individual words of the reverse translation:
185/// @code
186/// assert(0xc9 == secondUtf16String[0]);
187/// assert('c' == secondUtf16String[1]);
188/// assert('o' == secondUtf16String[2]);
189/// assert('l' == secondUtf16String[3]);
190/// assert('e' == secondUtf16String[4]);
191/// assert(0 == secondUtf16String[5]);
192/// @endcode
193///
194/// ### Example 2: Translation to STL Containers {#bdlde_charconvertutf16-example-2-translation-to-stl-containers}
195///
196///
197/// The following snippets of code illustrate a typical use of the
198/// `bdlde::CharConvertUtf16` struct's utility functions, first converting from
199/// UTF-8 to UTF-16, and then converting back to make sure the round trip
200/// returns the same value, translating to STL containers in both directions.
201///
202/// First, we declare a string of UTF-8 containing single-, double-, triple-,
203/// and quadruple-octet code points:
204/// @code
205/// const char utf8MultiLang[] = {
206/// "Hello" // -- ASCII
207/// "\xce\x97" "\xce\x95" "\xce\xbb" // -- Greek
208/// "\xe4\xb8\xad" "\xe5\x8d\x8e" // -- Chinese
209/// "\xe0\xa4\xad" "\xe0\xa4\xbe" // -- Hindi
210/// "\xf2\x94\xb4\xa5" "\xf3\xb8\xac\x83" }; // -- Quad octets
211/// @endcode
212/// Then, we declare an `enum` summarizing the counts of code points in the
213/// string and verify that the counts add up to the length of the string:
214/// @code
215/// enum { NUM_ASCII_CODE_POINTS = 5,
216/// NUM_GREEK_CODE_POINTS = 3,
217/// NUM_CHINESE_CODE_POINTS = 2,
218/// NUM_HINDI_CODE_POINTS = 2,
219/// NUM_QUAD_CODE_POINTS = 2 };
220///
221/// assert(1 * NUM_ASCII_CODE_POINTS +
222/// 2 * NUM_GREEK_CODE_POINTS +
223/// 3 * NUM_CHINESE_CODE_POINTS +
224/// 3 * NUM_HINDI_CODE_POINTS +
225/// 4 * NUM_QUAD_CODE_POINTS == bsl::strlen(utf8MultiLang));
226/// @endcode
227/// Next, we declare the vector where our UTF-16 output will go, and a variable
228/// into which the number of code points (not bytes or words) written will be
229/// stored. It is not necessary to initialize `utf16CodePointsWritten`:
230/// @code
231/// bsl::vector<unsigned short> v16;
232/// bsl::size_t utf16CodePointsWritten;
233/// @endcode
234/// Note that for performance, we should `v16.reserve(sizeof(utf8MultiLang))`,
235/// but it's not strictly necessary -- the vector will automatically be grown to
236/// the correct size. Also note that if `v16` were not empty, that wouldn't be
237/// a problem -- any contents will be discarded.
238///
239/// Then, we do the translation to UTF-16:
240/// @code
241/// int retVal = bdlde::CharConvertUtf16::utf8ToUtf16(&v16,
242/// utf8MultiLang,
243/// &utf16CodePointsWritten);
244///
245/// assert(0 == retVal); // verify success
246/// assert(0 == v16.back()); // verify null terminated
247/// @endcode
248/// Next, we verify that the number of code points (not bytes or words) that was
249/// returned is correct:
250/// @code
251/// enum { EXPECTED_CODE_POINTS_WRITTEN =
252/// NUM_ASCII_CODE_POINTS + NUM_GREEK_CODE_POINTS +
253/// NUM_CHINESE_CODE_POINTS + NUM_HINDI_CODE_POINTS +
254/// NUM_QUAD_CODE_POINTS + 1 };
255///
256/// assert(EXPECTED_CODE_POINTS_WRITTEN == utf16CodePointsWritten);
257/// @endcode
258/// Then, we verify that the number of 16-bit words written was correct. The
259/// quad octet code points each require 2 `short` words of output:
260/// @code
261/// enum { EXPECTED_UTF16_WORDS_WRITTEN =
262/// NUM_ASCII_CODE_POINTS + NUM_GREEK_CODE_POINTS +
263/// NUM_CHINESE_CODE_POINTS + NUM_HINDI_CODE_POINTS +
264/// NUM_QUAD_CODE_POINTS * 2 + 1 };
265///
266/// assert(EXPECTED_UTF16_WORDS_WRITTEN == v16.size());
267/// @endcode
268/// Next, we calculate and confirm the difference between the number of UTF-16
269/// words output and the number of bytes input. The ASCII code points will take
270/// 1 16-bit word apiece, the Greek code points are double octets that will
271/// become single `short` values, the Chinese code points are encoded as UTF-8
272/// triple octets that will turn into single 16-bit words, the same for the
273/// Hindi code points, and the quad code points are quadruple octets that will
274/// turn into double `short` values:
275/// @code
276/// enum { SHRINKAGE = NUM_ASCII_CODE_POINTS * (1-1) +
277/// NUM_GREEK_CODE_POINTS * (2-1) +
278/// NUM_CHINESE_CODE_POINTS * (3-1) +
279/// NUM_HINDI_CODE_POINTS * (3-1) +
280/// NUM_QUAD_CODE_POINTS * (4-2) };
281///
282/// assert(v16.size() == sizeof(utf8MultiLang) - SHRINKAGE);
283/// @endcode
284/// Then, we go on to do the reverse `utf16ToUtf8` transform to turn it back
285/// into UTF-8, and we should get a result identical to our original input. We
286/// declare a `bsl::string` for our output, and a variable to count the number
287/// of code points (not bytes or words) translated:
288/// @code
289/// bsl::string s;
290/// bsl::size_t uf8CodePointsWritten;
291/// @endcode
292/// Again, note that for performance, we should ideally
293/// `s.reserve(3 * v16.size())` but it's not really necessary.
294///
295/// Now, we do the reverse transform:
296/// @code
297/// retVal = bdlde::CharConvertUtf16::utf16ToUtf8(&s,
298/// v16.begin(),
299/// &uf8CodePointsWritten);
300/// @endcode
301/// Finally, we verify that a successful status was returned, that the output of
302/// the reverse transform was identical to the original input, and that the
303/// number of code points translated was as expected:
304/// @code
305/// assert(0 == retVal);
306/// assert(utf8MultiLang == s);
307/// assert(s.length() + 1 == sizeof(utf8MultiLang));
308///
309/// assert(EXPECTED_CODE_POINTS_WRITTEN == uf8CodePointsWritten);
310/// assert(utf16CodePointsWritten == uf8CodePointsWritten);
311/// @endcode
312/// @}
313/** @} */
314/** @} */
315
316/** @addtogroup bdl
317 * @{
318 */
319/** @addtogroup bdlde
320 * @{
321 */
322/** @addtogroup bdlde_charconvertutf16
323 * @{
324 */
325
326#include <bdlscm_version.h>
327
328#include <bdlde_byteorder.h>
330
331#include <bsls_libraryfeatures.h>
332
333#include <bsl_cstddef.h> // 'bsl::size_t'
334#include <bsl_string.h>
335#include <bsl_string_view.h>
336#include <bsl_vector.h>
337
338#include <string>
339#include <vector>
340
341
342
343namespace bdlde {
344 // =======================
345 // struct CharConvertUtf16
346 // =======================
347
348/// This `struct` provides a namespace for a suite of static functions to
349/// convert buffers or containers between UTF-8 and UTF-16. Note that Byte
350/// Order Mark (BOM) sequences are neither generated nor recognized as
351/// special. If a BOM is present in the input, it will be translated,
352/// whether correct (`0xfeff`) or incorrect (`0xfffe`), into the output
353/// without any special handling.
355
356 // CLASS METHODS
357
358 // -- UTF-8 to UTF-16 Methods
359
360 /// Return the number of words required to store the translation of the
361 /// specified UTF-8 string `srcBuffer` into a 0 terminated UTF-16 string
362 /// (including the 0 terminating word in the returned count).
363 /// Optionally specify `endPtr`, referring to one past the last input
364 /// character. If `endPtr` is not supplied, or is 0, treat `srcBuffer`
365 /// as 0 terminated. Note that this function will return the size
366 /// `utf8ToUtf16` will require, assuming the `errorWord` argument to
367 /// `utf8ToUtf16` is non-zero.
368 static bsl::size_t computeRequiredUtf16Words(const char *srcBuffer,
369 const char *endPtr = 0);
370
371 /// Load into the specified `dstString` the result of converting the
372 /// specified UTF-8 `srcString` to its UTF-16 equivalent. Optionally
373 /// specify `numCodePointsWritten`, which, if not 0, indicates the
374 /// location of the modifiable variable into which the number of Unicode
375 /// code points written, including the terminating null character, is to
376 /// be loaded. Optionally specify an `errorChar` to be substituted, if
377 /// not 0, for invalid encodings in the input string. Optionally
378 /// specify `byteOrder` to indicate the byte order of the UTF-16 output;
379 /// if `byteOrder` is not specified, the output is assumed to be in host
380 /// byte order. Return 0 on success and
381 /// `CharConvertStatus::k_INVALID_INPUT_BIT` otherwise. Invalid
382 /// encodings are multi-byte encoding parts out of sequence, non-minimal
383 /// UTF-8 encodings of code points, or code points outside the ranges
384 static int utf8ToUtf16(
385 bsl::wstring *dstString,
386 const bsl::string_view& srcString,
387 bsl::size_t *numCodePointsWritten = 0,
388 wchar_t errorWord = '?',
389 ByteOrder::Enum byteOrder =
391 /// that UTF-16 can validly encode (in the range `[ 1 .. 0xd7ff ]` or
392 /// `[ 0xe000 .. 0x10ffff ]`). If `errorChar` is 0, invalid input code
393 /// points are ignored (i.e., produce no corresponding output). The
394 /// behavior is undefined unless `srcString` is null-terminated when
395 /// specified as a `const char *`. Note that one code point can occupy
396 /// multiple UTF-16 words, and that if `srcString` is a
397 /// `bsl::string_view`, it may contain embedded null bytes that will be
398 /// translated to null words embedded in the output.
399 static int utf8ToUtf16(
400 std::wstring *dstString,
401 const bsl::string_view& srcString,
402 bsl::size_t *numCodePointsWritten = 0,
403 wchar_t errorWord = '?',
404 ByteOrder::Enum byteOrder =
406#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
407 static int utf8ToUtf16(
408 std::pmr::wstring *dstString,
409 const bsl::string_view& srcString,
410 bsl::size_t *numCodePointsWritten = 0,
411 wchar_t errorWord = '?',
413#endif
414 static int utf8ToUtf16(
415 bsl::wstring *dstString,
416 const char *srcString,
417 bsl::size_t *numCodePointsWritten = 0,
418 wchar_t errorWord = '?',
419 ByteOrder::Enum byteOrder =
421 static int utf8ToUtf16(
422 std::wstring *dstString,
423 const char *srcString,
424 bsl::size_t *numCodePointsWritten = 0,
425 wchar_t errorWord = '?',
426 ByteOrder::Enum byteOrder =
428#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
429 static int utf8ToUtf16(std::pmr::wstring *dstString,
430 const char *srcString,
431 bsl::size_t *numCodePointsWritten = 0,
432 wchar_t errorWord = '?',
434#endif
435#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
436 static int utf8ToUtf16(
437 bsl::u16string *dstString,
438 const bsl::string_view& srcString,
439 bsl::size_t *numCodePointsWritten = 0,
440 char16_t errorChar = '?',
441 ByteOrder::Enum byteOrder =
443 static int utf8ToUtf16(
444 std::u16string *dstString,
445 const bsl::string_view& srcString,
446 bsl::size_t *numCodePointsWritten = 0,
447 char16_t errorChar = '?',
448 ByteOrder::Enum byteOrder =
450# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
451 static int utf8ToUtf16(
452 std::pmr::u16string *dstString,
453 const bsl::string_view& srcString,
454 bsl::size_t *numCodePointsWritten = 0,
455 char16_t errorChar = '?',
456 ByteOrder::Enum byteOrder =
458# endif
459 static int utf8ToUtf16(
460 bsl::u16string *dstString,
461 const char *srcString,
462 bsl::size_t *numCodePointsWritten = 0,
463 char16_t errorChar = '?',
464 ByteOrder::Enum byteOrder =
466 static int utf8ToUtf16(
467 std::u16string *dstString,
468 const char *srcString,
469 bsl::size_t *numCodePointsWritten = 0,
470 char16_t errorChar = '?',
471 ByteOrder::Enum byteOrder =
473# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
474 static int utf8ToUtf16(
475 std::pmr::u16string *dstString,
476 const char *srcString,
477 bsl::size_t *numCodePointsWritten = 0,
478 char16_t errorChar = '?',
479 ByteOrder::Enum byteOrder =
481# endif
482#endif
483
484 static int utf8ToUtf16(
486 const bsl::string_view& srcString,
487 bsl::size_t *numCodePointsWritten = 0,
488 unsigned short errorWord = '?',
489 ByteOrder::Enum byteOrder =
491 /// Load into the specified `dstVector` the result of converting the
492 /// specified UTF-8 `srcString` to its UTF-16 equivalent. Optionally
493 /// specify `numCodePointsWritten`, which (if not 0) indicates the
494 /// location of the modifiable variable into which the number of UTF-16
495 /// code points (including the null terminator) written is to be loaded.
496 /// Optionally specify an `errorWord` to be substituted (if not 0) for
497 /// invalid encodings in the input string. Invalid encodings are
498 /// multi-byte encoding parts out of sequence, non-minimal UTF-8
499 /// encodings, or code points outside the ranges that UTF-16 can validly
500 /// encode (in the range `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0x10ffff ]`).
501 /// If `errorWord` is 0, invalid input is ignored (i.e., produces no
502 /// corresponding output). Optionally specify `byteOrder` to indicate
503 /// the byte order of the UTF-16 output; if `byteOrder` is not
504 /// specified, the output is assumed to be in host byte order. Any
505 /// previous contents of the destination are discarded. Return 0 on
506 /// success and `CharConvertStatus::k_INVALID_INPUT_BIT` otherwise. The
507 /// behavior is undefined unless `errorWord` is either 0 or a valid
508 /// single-word encoded UTF-16 code point (in the range
509 /// `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0xffff ]`) and `srcString` is
510 /// null-terminated when specified as a `const char *`. Note that one
511 /// code point can occupy multiple 16-bit words. Also note that the
512 /// size of the result vector is always fitted to the null-terminated
513 /// result, including the terminating 0. Also note that if `srcString`
514 /// is a `bsl::string_view`, it may contain embedded null bytes that
515 /// will be translated to null words embedded in the output.
516 static int utf8ToUtf16(
517 std::vector<unsigned short> *dstVector,
518 const bsl::string_view& srcString,
519 bsl::size_t *numCodePointsWritten = 0,
520 unsigned short errorWord = '?',
521 ByteOrder::Enum byteOrder =
523#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
524 static int utf8ToUtf16(
525 std::pmr::vector<unsigned short> *dstVector,
526 const bsl::string_view& srcString,
527 bsl::size_t *numCodePointsWritten = 0,
528 unsigned short errorWord = '?',
530#endif
531 static int utf8ToUtf16(
533 const char *srcString,
534 bsl::size_t *numCodePointsWritten = 0,
535 unsigned short errorWord = '?',
536 ByteOrder::Enum byteOrder =
538 static int utf8ToUtf16(
539 std::vector<unsigned short> *dstVector,
540 const char *srcString,
541 bsl::size_t *numCodePointsWritten = 0,
542 unsigned short errorWord = '?',
543 ByteOrder::Enum byteOrder =
545#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
546 static int utf8ToUtf16(
547 std::pmr::vector<unsigned short> *dstVector,
548 const char *srcString,
549 bsl::size_t *numCodePointsWritten = 0,
550 unsigned short errorWord = '?',
552#endif
553
554 static int utf8ToUtf16(
555 unsigned short *dstBuffer,
556 bsl::size_t dstCapacity,
557 const bsl::string_view& srcString,
558 bsl::size_t *numCodePointsWritten = 0,
559 bsl::size_t *numWordsWritten = 0,
560 unsigned short errorWord = '?',
561 ByteOrder::Enum byteOrder =
563 static int utf8ToUtf16(
564 unsigned short *dstBuffer,
565 bsl::size_t dstCapacity,
566 const char *srcString,
567 bsl::size_t *numCodePointsWritten = 0,
568 bsl::size_t *numWordsWritten = 0,
569 unsigned short errorWord = '?',
570 ByteOrder::Enum byteOrder =
572
573 static int utf8ToUtf16(
574 wchar_t *dstBuffer,
575 bsl::size_t dstCapacity,
576 const bsl::string_view& srcString,
577 bsl::size_t *numCodePointsWritten = 0,
578 bsl::size_t *numWordsWritten = 0,
579 wchar_t errorWord = '?',
580 ByteOrder::Enum byteOrder =
582 /// Load into the specified `dstBuffer` of the specified `dstCapacity`,
583 /// the result of converting the specified UTF-8 `srcString` to its
584 /// UTF-16 equivalent. Optionally specify `numCodePointsWritten`, which
585 /// (if not 0) indicates the location of the variable into which the
586 /// number of UTF-16 code points (including the null terminator) written
587 /// is to be loaded. Optionally specify `numWordsWritten`, which (if
588 /// not 0) indicates the location of the modifiable variable into which
589 /// the number of `short` *memory words* written (including the null
590 /// terminator) is to be loaded. Optionally specify an `errorWord` to
591 /// be substituted (if not 0) for invalid encodings in the input string.
592 /// Invalid encodings are multi-byte encoding parts out of sequence,
593 /// non-minimal UTF-8 encodings of code points, or code points outside
594 /// the ranges that UTF-16 can validly encode (in the range
595 /// `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0x10ffff ]`). If `errorWord` is
596 /// 0, invalid input sequences are ignored (i.e., produce no
597 /// corresponding output). Optionally specify `byteOrder` to indicate
598 /// the byte order of the UTF-16 output; if `byteOrder` is not
599 /// specified, the output is assumed to be in host byte order. Return 0
600 /// on success and a bit-wise or of the bits specified by
601 /// `CharConvertStatus::Enum` otherwise to indicate that there were
602 /// invalid input sequences or if `dstCapacity` was inadequate to store
603 /// the output. If `dstCapacity > 0` yet `dstCapacity` specifies a
604 /// buffer too small to hold the output, the maximal null-terminated
605 /// prefix of the properly converted result string is loaded into
606 /// `dstBuffer`. The behavior is undefined unless `dstBuffer` refers to
607 /// an array of at least `dstCapacity` elements, `errorWord` is either 0
608 /// or a valid single-word encoded UTF-16 code point (in the range
609 /// `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0xffff ]`), and `srcString` is
610 /// null-terminated when supplied as a `const char *`. Note that if
611 /// `dstCapacity` is 0, `*dstBuffer` is not modified and this function
612 /// returns a value with `CharConvertStatus::k_OUT_OF_SPACE_BIT` set and
613 /// 0 is written into `*numCodePointsWritten` and `*numWordsWritten` (if
614 /// those pointers are non-null), since there is insufficient space for
615 /// even a null terminator alone. Also note that one code point can
616 /// occupy multiple 16-bit *words*, so that `*numWordsWritten` may be
617 /// greater than `*numCodePointsWritten`, and therefore that an input
618 /// `srcString` of `dstCapacity` code points may not fit into
619 /// `dstBuffer`. However, an input `srcString` of `dstCapacity` bytes
620 /// (including null terminator, if present) will always fit (since the
621 /// UTF-8 encoding of a code point requires at least as many bytes as
622 /// the UTF-16 encoding requires words). Also note that if `srcString`
623 /// is a `bsl::string_view`, it may contain embedded null bytes that
624 /// will be translated to null words embedded in the output.
625 static int utf8ToUtf16(
626 wchar_t *dstBuffer,
627 bsl::size_t dstCapacity,
628 const char *srcString,
629 bsl::size_t *numCodePointsWritten = 0,
630 bsl::size_t *numWordsWritten = 0,
631 wchar_t errorWord = '?',
632 ByteOrder::Enum byteOrder =
634#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
635 static int utf8ToUtf16(
636 char16_t *dstBuffer,
637 bsl::size_t dstCapacity,
638 const bsl::string_view& srcString,
639 bsl::size_t *numCodePointsWritten = 0,
640 bsl::size_t *numWordsWritten = 0,
641 char16_t errorChar = '?',
642 ByteOrder::Enum byteOrder =
644 /// Load into the specified `dstBuffer` of the specified `dstCapacity`,
645 /// the result of converting the specified UTF-8 `srcString` to its
646 /// UTF-16 equivalent. Optionally specify `numCodePointsWritten`, which
647 /// (if not 0) indicates the location of the variable into which the
648 /// number of UTF-16 code points (including the terminating 0) written
649 /// is to be loaded. Optionally specify `numWordsWritten`, which (if
650 /// not 0) indicates the location of the modifiable variable into which
651 /// the number of `short` *memory words* written (including the null
652 /// terminator) is to be loaded. Optionally specify an `errorChar` to
653 /// be substituted (if not 0) for invalid encodings in the input string.
654 /// Invalid encodings are multi-byte encoding parts out of sequence,
655 /// non-minimal UTF-8 encodings of code points, or code points outside
656 /// the ranges that UTF-16 can validly encode (in the range
657 /// `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0x10ffff ]`). Optionally specify
658 /// `byteOrder` to indicate the byte order of the UTF-16 output; if
659 /// `byteOrder` is not specified, the output is assumed to be in host
660 /// byte order. If `errorChar` is 0, invalid input sequences are
661 /// ignored (i.e., produce no corresponding output). Return 0 on
662 /// success and a bit-wise or of the bits specified by
663 /// `CharConvertStatus::Enum` otherwise to indicate that there were
664 /// invalid sequences or if `dstCapacity` was inadequate to store the
665 /// output. If `dstCapacity > 0` yet `dstCapacity` specifies a buffer
666 /// too small to hold the output, the maximal null-terminated prefix of
667 /// the properly converted result string is loaded into `dstBuffer`.
668 /// The behavior is undefined unless `dstBuffer`, if specified, refers
669 /// to an array of at least `dstCapacity` elements, `errorChar` is
670 /// either 0 or a valid single-word encoded UTF-16 code point (in the
671 /// range `[ 1 .. 0xd7ff ]` or `[ 0xe000 .. 0xffff ]`), and `srcString`
672 /// is null-terminated if supplied as a `const char *`. Note that if
673 /// `dstCapacity` is 0, `*dstBuffer` is not modified and this function
674 /// returns a value with `CharConvertStatus::k_OUT_OF_SPACE_BIT` set and
675 /// 0 is written into `*numCodePointsWritten` and `*numWordsWritten` (if
676 /// those pointers are non-null), since there is insufficient space for
677 /// even a null terminator alone. Also note that one code point can
678 /// occupy multiple 16-bit words, so that `*numWordsWritten` may be
679 /// greater than `*numCodePointsWritten`, and therefore that an input
680 /// `srcString` of `dstCapacity` code points may not fit into
681 /// `dstBuffer`. However, an input `srcString` of `dstCapacity` bytes
682 /// (including terminating 0, if present) will always fit (since the
683 /// UTF-8 encoding of a code point requires at least as many bytes as
684 /// the UTF-16 encoding requires words). Also note that if `srcString`
685 /// is a `bsl::string_view`, it may contain embedded null bytes that
686 /// will be translated to null words embedded in the output.
687 static int utf8ToUtf16(
688 char16_t *dstBuffer,
689 bsl::size_t dstCapacity,
690 const char *srcString,
691 bsl::size_t *numCodePointsWritten = 0,
692 bsl::size_t *numWordsWritten = 0,
693 char16_t errorChar = '?',
694 ByteOrder::Enum byteOrder =
696#endif
697
698 // -- UTF-16 to UTF-8 Methods
699
700 /// Return the length needed, in bytes, for a buffer to hold the
701 /// null-terminated UTF-8 string translated from the specified UTF-16
702 /// string `srcBuffer` (including the terminating '\0' in the returned
703 /// count). Optionally specify `endPtr`, referring to one past the last
704 /// input character. If `endPtr` is not supplied, or is 0, treat
705 /// `srcBuffer` as 0 terminated. Optionally specify `byteOrder`
706 /// indicating the byte order of `srcBuffer`; if `byteOrder` is not
707 /// supplied, the host byte order is used. Note that this function will
708 /// return the size `utf16ToUtf8` will require, assuming the `errorByte`
709 /// argument to `utf16ToUtf8` is non-zero.
710 static bsl::size_t computeRequiredUtf8Bytes(
711 const unsigned short *srcBuffer,
712 const unsigned short *endPtr = 0,
713 ByteOrder::Enum byteOrder =
715 static bsl::size_t computeRequiredUtf8Bytes(
716 const wchar_t *srcBuffer,
717 const wchar_t *endPtr = 0,
718 ByteOrder::Enum byteOrder =
720#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
721 static bsl::size_t computeRequiredUtf8Bytes(
722 const char16_t *srcBuffer,
723 const char16_t *endPtr = 0,
724 ByteOrder::Enum byteOrder =
726#endif
727
728
729 static int utf16ToUtf8(bsl::string *dstString,
730 const unsigned short *srcString,
731 bsl::size_t *numCodePointsWritten = 0,
732 char errorByte = '?',
733 ByteOrder::Enum byteOrder =
735 /// Load into the specified `dstString` the result of converting the
736 /// specified UTF-16 `srcString` to its UTF-8 equivalent. Optionally
737 /// specify `numCodePointsWritten`, which (if not 0) indicates the
738 /// location of the modifiable variable into which the number of Unicode
739 /// code points written, including the null terminator, is to be loaded,
740 /// where one code point may occupy multiple bytes. Optionally specify
741 /// an `errorByte` to be substituted (if not 0) for invalid encodings in
742 /// the input string. Invalid encodings are incomplete multi-word
743 /// encodings or parts of a two-word encoding out of their proper
744 /// sequence. If `errorByte` is 0, invalid input sequences are ignored
745 /// (i.e., produce no corresponding output). Any previous contents of
746 /// the destination are discarded. Optionally specify `byteOrder` to
747 /// indicate the byte order of the UTF-16 input; if `byteOrder` is not
748 /// specified, the input is assumed to be in host byte order. Return 0
749 /// on success and `CharConvertStatus::k_INVALID_INPUT_BIT` if one or
750 /// more invalid sequences were encountered in the input. The behavior
751 /// is undefined unless `errorByte` is either 0 or a valid single-byte
752 /// Unicode code point (`0 < errorByte < 0x80`) and `srcString` is
753 /// null-terminated if supplied as a pointer without a length. Note
754 /// that if `srcString` is a string view, it may contain embedded 0
755 /// words that will be translated to null bytes embedded in the output.
756 static int utf16ToUtf8(std::string *dstString,
757 const unsigned short *srcString,
758 bsl::size_t *numCodePointsWritten = 0,
759 char errorByte = '?',
760 ByteOrder::Enum byteOrder =
762#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
763 static int utf16ToUtf8(
764 std::pmr::string *dstString,
765 const unsigned short *srcString,
766 bsl::size_t *numCodePointsWritten = 0,
767 char errorByte = '?',
769#endif
770 static int utf16ToUtf8(
771 bsl::string *dstString,
772 const unsigned short *srcString,
773 bsl::size_t srcLengthInWords,
774 bsl::size_t *numCodePointsWritten = 0,
775 char errorByte = '?',
777 static int utf16ToUtf8(
778 std::string *dstString,
779 const unsigned short *srcString,
780 bsl::size_t srcLengthInWords,
781 bsl::size_t *numCodePointsWritten = 0,
782 char errorByte = '?',
784#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
785 static int utf16ToUtf8(
786 std::pmr::string *dstString,
787 const unsigned short *srcString,
788 bsl::size_t srcLengthInWords,
789 bsl::size_t *numCodePointsWritten = 0,
790 char errorByte = '?',
792#endif
793 static int utf16ToUtf8(
794 bsl::string *dstString,
795 const bsl::wstring_view& srcString,
796 bsl::size_t *numCodePointsWritten = 0,
797 char errorByte = '?',
798 ByteOrder::Enum byteOrder =
800 static int utf16ToUtf8(
801 std::string *dstString,
802 const bsl::wstring_view& srcString,
803 bsl::size_t *numCodePointsWritten = 0,
804 char errorByte = '?',
805 ByteOrder::Enum byteOrder =
807#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
808 static int utf16ToUtf8(
809 std::pmr::string *dstString,
810 const bsl::wstring_view& srcString,
811 bsl::size_t *numCodePointsWritten = 0,
812 char errorByte = '?',
813 ByteOrder::Enum byteOrder =
815#endif
816 static int utf16ToUtf8(bsl::string *dstString,
817 const wchar_t *srcString,
818 bsl::size_t *numCodePointsWritten = 0,
819 char errorByte = '?',
820 ByteOrder::Enum byteOrder =
822 static int utf16ToUtf8(std::string *dstString,
823 const wchar_t *srcString,
824 bsl::size_t *numCodePointsWritten = 0,
825 char errorByte = '?',
826 ByteOrder::Enum byteOrder =
828#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
829 static int utf16ToUtf8(std::pmr::string *dstString,
830 const wchar_t *srcString,
831 bsl::size_t *numCodePointsWritten = 0,
832 char errorByte = '?',
833 ByteOrder::Enum byteOrder =
835#endif
836#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
837 static int utf16ToUtf8(
838 bsl::string *dstString,
839 const bsl::u16string_view& srcString,
840 bsl::size_t *numCodePointsWritten = 0,
841 char errorByte = '?',
842 ByteOrder::Enum byteOrder =
844 static int utf16ToUtf8(
845 std::string *dstString,
846 const bsl::u16string_view& srcString,
847 bsl::size_t *numCodePointsWritten = 0,
848 char errorByte = '?',
849 ByteOrder::Enum byteOrder =
851# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
852 static int utf16ToUtf8(
853 std::pmr::string *dstString,
854 const bsl::u16string_view& srcString,
855 bsl::size_t *numCodePointsWritten = 0,
856 char errorByte = '?',
857 ByteOrder::Enum byteOrder =
859# endif
860 static int utf16ToUtf8(bsl::string *dstString,
861 const char16_t *srcString,
862 bsl::size_t *numCodePointsWritten = 0,
863 char errorByte = '?',
864 ByteOrder::Enum byteOrder =
866 static int utf16ToUtf8(std::string *dstString,
867 const char16_t *srcString,
868 bsl::size_t *numCodePointsWritten = 0,
869 char errorByte = '?',
870 ByteOrder::Enum byteOrder =
872# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
873 static int utf16ToUtf8(std::pmr::string *dstString,
874 const char16_t *srcString,
875 bsl::size_t *numCodePointsWritten = 0,
876 char errorByte = '?',
877 ByteOrder::Enum byteOrder =
879# endif
880#endif
881
882 static int utf16ToUtf8(bsl::vector<char> *dstVector,
883 const unsigned short *srcString,
884 bsl::size_t *numCodePointsWritten = 0,
885 char errorByte = '?',
886 ByteOrder::Enum byteOrder =
888 /// Load into the specified `dstVector` the null-terminated result of
889 /// converting the specified UTF-16 `*srcString` to its UTF-8
890 /// equivalent. Optionally specify `srcLengthInWords`, the number of
891 /// `unsigned short`s of input. If `srcLengthInWords` is not specified,
892 /// the input must be terminated by a null word. Optionally specify
893 /// `numCodePointsWritten`, which (if not 0) indicates the location of
894 /// the modifiable variable into which the number of Unicode code points
895 /// written, including the null terminator, is to be loaded, where one
896 /// code point may occupy multiple bytes. Optionally specify an
897 /// `errorByte` to be substituted (if not 0) for invalid encodings in
898 /// the input string. Invalid encodings are incomplete multi-word
899 /// encodings or parts of a two-word encoding out of their proper
900 /// sequence. If `errorByte` is 0, invalid input sequences are ignored
901 /// (i.e., produce no corresponding output). Optionally specify
902 /// `byteOrder` to indicate the byte order of the UTF-16 input; if
903 /// `byteOrder` is not specified, the input is assumed to be in host
904 /// byte order. Any previous contents of the destination are discarded.
905 /// Return 0 on success and `CharConvertStatus::k_INVALID_INPUT_BIT` if
906 /// one or more invalid sequences were encountered in the input. The
907 /// behavior is undefined unless either `srcLengthInWords` is passed or
908 /// `srcString` is null-terminated, and `errorByte` is either 0 or a
909 /// valid single-byte Unicode code point (`0 < errorByte < 0x80`).
910 static int utf16ToUtf8(std::vector<char> *dstVector,
911 const unsigned short *srcString,
912 bsl::size_t *numCodePointsWritten = 0,
913 char errorByte = '?',
914 ByteOrder::Enum byteOrder =
916#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
917 static int utf16ToUtf8(std::pmr::vector<char> *dstVector,
918 const unsigned short *srcString,
919 bsl::size_t *numCodePointsWritten = 0,
920 char errorByte = '?',
921 ByteOrder::Enum byteOrder =
923#endif
924 static int utf16ToUtf8(bsl::vector<char> *dstVector,
925 const unsigned short *srcString,
926 bsl::size_t srcLengthInWords,
927 bsl::size_t *numCodePointsWritten = 0,
928 char errorByte = '?',
929 ByteOrder::Enum byteOrder =
931 static int utf16ToUtf8(std::vector<char> *dstVector,
932 const unsigned short *srcString,
933 bsl::size_t srcLengthInWords,
934 bsl::size_t *numCodePointsWritten = 0,
935 char errorByte = '?',
936 ByteOrder::Enum byteOrder =
938#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
939 static int utf16ToUtf8(std::pmr::vector<char> *dstVector,
940 const unsigned short *srcString,
941 bsl::size_t srcLengthInWords,
942 bsl::size_t *numCodePointsWritten = 0,
943 char errorByte = '?',
944 ByteOrder::Enum byteOrder =
946#endif
947
948 static int utf16ToUtf8(
949 bsl::vector<char> *dstVector,
950 const bsl::wstring_view& srcString,
951 bsl::size_t *numCodePointsWritten = 0,
952 char errorByte = '?',
953 ByteOrder::Enum byteOrder =
955 static int utf16ToUtf8(
956 std::vector<char> *dstVector,
957 const bsl::wstring_view& srcString,
958 bsl::size_t *numCodePointsWritten = 0,
959 char errorByte = '?',
960 ByteOrder::Enum byteOrder =
962#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
963 static int utf16ToUtf8(
964 std::pmr::vector<char> *dstVector,
965 const bsl::wstring_view& srcString,
966 bsl::size_t *numCodePointsWritten = 0,
967 char errorByte = '?',
968 ByteOrder::Enum byteOrder =
970#endif
971 static int utf16ToUtf8(
972 bsl::vector<char> *dstVector,
973 const wchar_t *srcString,
974 bsl::size_t *numCodePointsWritten = 0,
975 char errorByte = '?',
976 ByteOrder::Enum byteOrder =
978 static int utf16ToUtf8(
979 std::vector<char> *dstVector,
980 const wchar_t *srcString,
981 bsl::size_t *numCodePointsWritten = 0,
982 char errorByte = '?',
983 ByteOrder::Enum byteOrder =
985#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
986 static int utf16ToUtf8(
987 std::pmr::vector<char> *dstVector,
988 const wchar_t *srcString,
989 bsl::size_t *numCodePointsWritten = 0,
990 char errorByte = '?',
991 ByteOrder::Enum byteOrder =
993#endif
994
995#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
996 static int utf16ToUtf8(
997 bsl::vector<char> *dstVector,
998 const bsl::u16string_view& srcString,
999 bsl::size_t *numCodePointsWritten = 0,
1000 char errorByte = '?',
1001 ByteOrder::Enum byteOrder =
1003 /// Load into the specified `dstVector` the null-terminated result of
1004 /// converting the specified UTF-16 `srcString` to its UTF-8 equivalent.
1005 /// Optionally specify `numCodePointsWritten`, which (if not 0)
1006 /// indicates the location of the modifiable variable into which the
1007 /// number of Unicode code points written, including the null
1008 /// terminator, is to be loaded, where one code point may occupy
1009 /// multiple bytes. Optionally specify an `errorByte` to be substituted
1010 /// (if not 0) for invalid encodings in the input string. Invalid
1011 /// encodings are incomplete multi-word encodings or parts of a two-word
1012 /// encoding out of their proper sequence. If `errorByte` is 0, invalid
1013 /// input sequences are ignored (i.e., produce no corresponding output).
1014 /// Optionally specify `byteOrder` to indicate the byte order of the
1015 /// UTF-16 input; if `byteOrder` is not specified, the input is assumed
1016 /// to be in host byte order. Any previous contents of the destination
1017 /// are discarded. Return 0 on success and
1018 /// `CharConvertStatus::k_INVALID_INPUT_BIT` if one or more invalid
1019 /// sequences were encountered in the input. The behavior is undefined
1020 /// unless `errorByte` is either 0 or a valid single-byte Unicode code
1021 /// point (`0 < errorByte < 0x80`) and `srcString` is null-terminated if
1022 /// supplied as a pointer. Note that if `srcString` is a string view
1023 /// (`bsl::wstring_view` or `bsl::u16string_view`), it may contain
1024 /// embedded 0 words that will be translated to null bytes in the output.
1025 static int utf16ToUtf8(
1026 std::vector<char> *dstVector,
1027 const bsl::u16string_view& srcString,
1028 bsl::size_t *numCodePointsWritten = 0,
1029 char errorByte = '?',
1030 ByteOrder::Enum byteOrder =
1032# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
1033 static int utf16ToUtf8(
1034 std::pmr::vector<char> *dstVector,
1035 const bsl::u16string_view& srcString,
1036 bsl::size_t *numCodePointsWritten = 0,
1037 char errorByte = '?',
1038 ByteOrder::Enum byteOrder =
1040# endif
1041 static int utf16ToUtf8(
1042 bsl::vector<char> *dstVector,
1043 const char16_t *srcString,
1044 bsl::size_t *numCodePointsWritten = 0,
1045 char errorByte = '?',
1046 ByteOrder::Enum byteOrder =
1048 static int utf16ToUtf8(
1049 std::vector<char> *dstVector,
1050 const char16_t *srcString,
1051 bsl::size_t *numCodePointsWritten = 0,
1052 char errorByte = '?',
1053 ByteOrder::Enum byteOrder =
1055# ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
1056 static int utf16ToUtf8(
1057 std::pmr::vector<char> *dstVector,
1058 const char16_t *srcString,
1059 bsl::size_t *numCodePointsWritten = 0,
1060 char errorByte = '?',
1061 ByteOrder::Enum byteOrder =
1063# endif
1064#endif
1065
1066 static int utf16ToUtf8(char *dstBuffer,
1067 bsl::size_t dstCapacity,
1068 const unsigned short *srcString,
1069 bsl::size_t *numCodePointsWritten = 0,
1070 bsl::size_t *numBytesWritten = 0,
1071 char errorByte = '?',
1072 ByteOrder::Enum byteOrder =
1074 static int utf16ToUtf8(char *dstBuffer,
1075 bsl::size_t dstCapacity,
1076 const unsigned short *srcString,
1077 bsl::size_t srcLengthInWords,
1078 bsl::size_t *numCodePointsWritten = 0,
1079 bsl::size_t *numBytesWritten = 0,
1080 char errorByte = '?',
1081 ByteOrder::Enum byteOrder =
1083 static int utf16ToUtf8(
1084 char *dstBuffer,
1085 bsl::size_t dstCapacity,
1086 const bsl::wstring_view& srcString,
1087 bsl::size_t *numCodePointsWritten = 0,
1088 bsl::size_t *numBytesWritten = 0,
1089 char errorByte = '?',
1090 ByteOrder::Enum byteOrder =
1092 /// Load, into the specified `dstBuffer` of the specified `dstCapacity`,
1093 /// the result of converting the specified UTF-16 `srcString` to its
1094 /// UTF-8 equivalent. Optionally specify `numCodePointsWritten`, which
1095 /// (if not 0) indicates the location of the modifiable variable into
1096 /// which the number of Unicode code points (including the terminating
1097 /// 0, if any) written is to be loaded, where one code point can occupy
1098 /// multiple bytes. Optionally specify `numBytesWritten`, which (if not
1099 /// 0) indicates the location of the modifiable variable into which the
1100 /// number of bytes written (including the null terminator, if any) is
1101 /// to be loaded. Optionally specify an `errorByte` to be substituted
1102 /// (if not 0) for invalid encodings in the input string. Invalid
1103 /// encodings are incomplete multi-word encodings or parts of a two-word
1104 /// encoding out of their proper sequence. If `errorByte` is 0, invalid
1105 /// input sequences are ignored (i.e., produce no corresponding output).
1106 /// Optionally specify `byteOrder` to indicate the byte order of the
1107 /// UTF-16 input; if `byteOrder` is not specified, the input is assumed
1108 /// to be in host byte order. Return 0 on success and a bitwise-or of
1109 /// the flags defined by `CharConvertStatus::Enum` otherwise.
1110 /// `CharConvertStatus::k_INVALID_INPUT_BIT` will be set if one or more
1111 /// invalid sequences were encountered in the input, and
1112 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` will be set if the output
1113 /// space was exhausted before conversion was complete. The behavior is
1114 /// undefined unless `dstBuffer` refers to an array of at least
1115 /// `dstCapacity` elements, `errorByte` is either 0 or a valid
1116 /// single-byte Unicode code point (`0 < errorByte < 0x80`), and
1117 /// `srcString` is null-terminated if supplied as a pointer. Note that
1118 /// if `dstCapacity` is 0, this function returns
1119 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` set and 0 is written into
1120 /// `*numCodePointsWritten` and `*numBytesWritten` (if those pointers
1121 /// are non-null), since there is insufficient space for even a null
1122 /// terminator alone. Also note that since UTF-8 is a variable-length
1123 /// encoding, `numBytesWritten` may be up to four times
1124 /// `numCodePointsWritten`, and therefore that an input `srcString` of
1125 /// `dstCapacity` code points (including the terminating 0, if present)
1126 /// may not fit into `dstBuffer`. A one-word (two-byte) UTF-16 code
1127 /// point will require one to three UTF-8 octets (bytes); a two-word
1128 /// (four-byte) UTF-16 code point will always require four UTF-8 octets.
1129 /// Also note that the amount of room needed will vary with the contents
1130 /// of the data and the language being translated, but never will the
1131 /// number of bytes output exceed three times the number of words input.
1132 /// Also note that, if `dstCapacity > 0` and the output contains no
1133 /// embedded null bytes, `strlen(dstBuffer) + 1 == *numBytesWritten`
1134 /// after completion. Also note that if `srcString` is a string view, it
1135 /// may contain embedded 0 words that become null bytes in the output.
1136 static int utf16ToUtf8(
1137 char *dstBuffer,
1138 bsl::size_t dstCapacity,
1139 const wchar_t *srcString,
1140 bsl::size_t *numCodePointsWritten = 0,
1141 bsl::size_t *numBytesWritten = 0,
1142 char errorByte = '?',
1143 ByteOrder::Enum byteOrder =
1145#if defined(BSLS_COMPILERFEATURES_SUPPORT_UNICODE_CHAR_TYPES)
1146 static int utf16ToUtf8(
1147 char *dstBuffer,
1148 bsl::size_t dstCapacity,
1149 const bsl::u16string_view& srcString,
1150 bsl::size_t *numCodePointsWritten = 0,
1151 bsl::size_t *numBytesWritten = 0,
1152 char errorByte = '?',
1153 ByteOrder::Enum byteOrder =
1155 static int utf16ToUtf8(
1156 char *dstBuffer,
1157 bsl::size_t dstCapacity,
1158 const char16_t *srcString,
1159 bsl::size_t *numCodePointsWritten = 0,
1160 bsl::size_t *numBytesWritten = 0,
1161 char errorByte = '?',
1162 ByteOrder::Enum byteOrder =
1164#endif
1165};
1166} // close package namespace
1167
1168
1169
1170#endif
1171
1172// ----------------------------------------------------------------------------
1173// Copyright 2015 Bloomberg Finance L.P.
1174//
1175// Licensed under the Apache License, Version 2.0 (the "License");
1176// you may not use this file except in compliance with the License.
1177// You may obtain a copy of the License at
1178//
1179// http://www.apache.org/licenses/LICENSE-2.0
1180//
1181// Unless required by applicable law or agreed to in writing, software
1182// distributed under the License is distributed on an "AS IS" BASIS,
1183// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1184// See the License for the specific language governing permissions and
1185// limitations under the License.
1186// ----------------------------- END-OF-FILE ----------------------------------
1187
1188/** @} */
1189/** @} */
1190/** @} */
Definition bslstl_stringview.h:441
Definition bslstl_string.h:1281
Definition bslstl_vector.h:1025
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdlde_base64alphabet.h:118
Enum
Definition bdlde_byteorder.h:134
@ e_HOST
Definition bdlde_byteorder.h:144
Definition bdlde_charconvertutf16.h:354
static int utf16ToUtf8(std::string *dstString, const wchar_t *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::vector< char > *dstVector, const unsigned short *srcString, bsl::size_t srcLengthInWords, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::string *dstString, const bsl::wstring_view &srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::vector< char > *dstVector, const unsigned short *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned short *srcString, bsl::size_t srcLengthInWords, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::string *dstString, const unsigned short *srcString, bsl::size_t srcLengthInWords, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(wchar_t *dstBuffer, bsl::size_t dstCapacity, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numWordsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::string *dstString, const bsl::wstring_view &srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::string *dstString, const unsigned short *srcString, bsl::size_t srcLengthInWords, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::vector< char > *dstVector, const wchar_t *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned short *srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::string *dstString, const wchar_t *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::vector< char > *dstVector, const unsigned short *srcString, bsl::size_t srcLengthInWords, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const wchar_t *srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::vector< char > *dstVector, const bsl::wstring_view &srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static bsl::size_t computeRequiredUtf8Bytes(const wchar_t *srcBuffer, const wchar_t *endPtr=0, ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(std::wstring *dstString, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static bsl::size_t computeRequiredUtf8Bytes(const unsigned short *srcBuffer, const unsigned short *endPtr=0, ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(bsl::wstring *dstString, const char *srcString, bsl::size_t *numCodePointsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::string *dstString, const unsigned short *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static bsl::size_t computeRequiredUtf16Words(const char *srcBuffer, const char *endPtr=0)
static int utf8ToUtf16(wchar_t *dstBuffer, bsl::size_t dstCapacity, const char *srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numWordsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(bsl::wstring *dstString, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::vector< char > *dstVector, const unsigned short *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::vector< char > *dstVector, const bsl::wstring_view &srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(std::wstring *dstString, const char *srcString, bsl::size_t *numCodePointsWritten=0, wchar_t errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(std::vector< unsigned short > *dstVector, const char *srcString, bsl::size_t *numCodePointsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(std::string *dstString, const unsigned short *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(unsigned short *dstBuffer, bsl::size_t dstCapacity, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numWordsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(unsigned short *dstBuffer, bsl::size_t dstCapacity, const char *srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numWordsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(bsl::vector< unsigned short > *dstVector, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(std::vector< unsigned short > *dstVector, const bsl::string_view &srcString, bsl::size_t *numCodePointsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(bsl::vector< char > *dstVector, const wchar_t *srcString, bsl::size_t *numCodePointsWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf16ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const bsl::wstring_view &srcString, bsl::size_t *numCodePointsWritten=0, bsl::size_t *numBytesWritten=0, char errorByte='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)
static int utf8ToUtf16(bsl::vector< unsigned short > *dstVector, const char *srcString, bsl::size_t *numCodePointsWritten=0, unsigned short errorWord='?', ByteOrder::Enum byteOrder=ByteOrder::e_HOST)