BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlde_charconvertucs2.h
Go to the documentation of this file.
1/// @file bdlde_charconvertucs2.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlde_charconvertucs2.h -*-C++-*-
8#ifndef INCLUDED_BDLDE_CHARCONVERTUCS2
9#define INCLUDED_BDLDE_CHARCONVERTUCS2
10
11#include <bsls_ident.h>
12
13BSLS_IDENT("$Id: $")
14
15/// @defgroup bdlde_charconvertucs2 bdlde_charconvertucs2
16/// @brief Provide efficient conversions between UTF-8 and UCS-2 encodings.
17/// @addtogroup bdl
18/// @{
19/// @addtogroup bdlde
20/// @{
21/// @addtogroup bdlde_charconvertucs2
22/// @{
23///
24/// <h1> Outline </h1>
25/// * <a href="#bdlde_charconvertucs2-purpose"> Purpose</a>
26/// * <a href="#bdlde_charconvertucs2-classes"> Classes </a>
27/// * <a href="#bdlde_charconvertucs2-description"> Description </a>
28/// * <a href="#bdlde_charconvertucs2-history-and-motivation"> History and Motivation </a>
29/// * <a href="#bdlde_charconvertucs2-usage"> Usage </a>
30/// * <a href="#bdlde_charconvertucs2-example-1-c-style-interface"> Example 1: C-Style Interface </a>
31/// * <a href="#bdlde_charconvertucs2-example-2-c-style-round-trip"> Example 2: C-Style Round-Trip </a>
32/// * <a href="#bdlde_charconvertucs2-example-3-c--style-interface"> Example 3: C++-Style Interface </a>
33///
34/// # Purpose {#bdlde_charconvertucs2-purpose}
35/// Provide efficient conversions between UTF-8 and UCS-2 encodings.
36///
37/// # Classes {#bdlde_charconvertucs2-classes}
38///
39/// - bdlde::CharConvertUcs2: namespace for conversions between UTF-8 and UCS-2
40///
41/// # Description {#bdlde_charconvertucs2-description}
42/// This component provides a suite of pure procedures supporting
43/// the *fast* conversion of *valid* UTF-8 encoded "C" strings to *valid* UCS-2
44/// 16-bit character arrays and vice versa. In order to provide the fastest
45/// possible implementation, some error checking is deliberately omitted, and
46/// the input strings are required to be null-terminated; however, all C-style
47/// functions will honor `strlcpy` semantics and null-terminate any output
48/// buffer having a non-zero length.
49///
50/// ## History and Motivation {#bdlde_charconvertucs2-history-and-motivation}
51///
52///
53/// UTF-8 is a character encoding that allows 32-bit character sets like Unicode
54/// to be represented using null-terminated (8-bit) byte strings (NTBS), while
55/// allowing "standard ASCII" strings to be used "as-is". Note that UTF-8 is
56/// described in detail in RFC 3629 (https://tools.ietf.org/html/rfc3629), which obsoletes RFC 2279.
57///
58/// UCS-2 is a 16-bit character encoding with no support for "higher-order"
59/// character encodings. UCS-2 is equivalent to UTF-16 in the Basic
60/// Multilingual Plane (BMP) of Unicode (the first 65536 code points,
61/// excluding the "surrogate code points" U+D800-U+DFFF, which do not map to
62/// Unicode characters). If the characters being represented are within the
63/// BMP, then UCS-2 can be thought of as "the Windows encoding" for
64/// international characters. Historically, UCS-2 was the only "wide char"
65/// representation for Windows versions prior to Windows 2000. UTF-16 was
66/// adopted instead for Windows 2000, and has been used ever since.
67///
68/// Most conversion routines strive for correctness at the cost of performance.
69/// The `glib` conversion routines are *much* slower than the functions
70/// implemented here because the `glib` functions first compute the number of
71/// output characters required, allocate the memory for them, and then perform
72/// the conversion, validating the input characters. The C-style methods of
73/// `bdlde::CharConvertUcs2`, on the other hand, assume that the user-provided
74/// output buffer is wide enough, make a "best effort" to convert into it, and
75/// return an error code if not enough space was provided. The C++-style
76/// methods are more forgiving, since the output `bsl::string` or
77/// `bsl::vector<unsigned short>` is resized as needed. No attempt is made to
78/// validate whether the character codes correspond to valid Unicode code
79/// points, nor is validation performed to check for overlong UTF-8 encodings
80/// (where characters that could be expressed in one octet are encoded using two
81/// octets).
82///
83/// ## Usage {#bdlde_charconvertucs2-usage}
84///
85///
86/// This section illustrates intended use of this component.
87///
88/// ### Example 1: C-Style Interface {#bdlde_charconvertucs2-example-1-c-style-interface}
89///
90///
91/// The following snippet of code illustrates a typical use of the
92/// `bdlde::CharConvertUcs2` struct's C-style utility functions, converting a
93/// simple UTF-8 string to UCS-2.
94/// @code
95/// void testCFunction1()
96/// {
97/// unsigned short buffer[256]; // arbitrary "wide-enough" size
98/// bsl::size_t buffSize = sizeof buffer / sizeof *buffer;
99/// bsl::size_t charsWritten;
100///
101/// int retVal =
102/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer,
103/// buffSize,
104/// "Hello",
105/// &charsWritten);
106///
107/// assert( 0 == retVal);
108/// assert('H' == buffer[0]);
109/// assert('e' == buffer[1]);
110/// assert('l' == buffer[2]);
111/// assert('l' == buffer[3]);
112/// assert('o' == buffer[4]);
113/// assert( 0 == buffer[5]);
114/// assert( 6 == charsWritten);
115/// }
116/// @endcode
117///
118/// ### Example 2: C-Style Round-Trip {#bdlde_charconvertucs2-example-2-c-style-round-trip}
119///
120///
121/// The following snippet of code illustrates another typical use of the
122/// `bdlde::CharConvertUcs2` struct's C-style utility functions, converting a
123/// simple UTF-8 string to UCS-2, then converting the UCS-2 back and making sure
124/// the round-trip conversion results in the input.
125/// @code
126/// void testCFunction2()
127/// {
128/// unsigned short buffer[256]; // arbitrary "wide-enough" size
129/// bsl::size_t buffSize = sizeof buffer / sizeof *buffer;
130/// bsl::size_t charsWritten;
131///
132/// // "&Eacute;cole", the French word for School. '&Eacute;' is the HTML
133/// // entity equivalent to "Unicode-E WITH ACUTE, LATIN CAPITAL LETTER".
134/// int retVal =
135/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer,
136/// buffSize,
137/// "\xc3\x89" "cole",
138/// &charsWritten);
139///
140/// assert( 0 == retVal);
141/// assert(0xc9 == buffer[0]); // Unicode-E WITH ACUTE, LATIN CAPITAL LETTER
142/// assert('c' == buffer[1]);
143/// assert('o' == buffer[2]);
144/// assert('l' == buffer[3]);
145/// assert('e' == buffer[4]);
146/// assert( 0 == buffer[5]);
147/// assert( 6 == charsWritten);
148///
149/// char buffer2[256]; // arbitrary "wide-enough" size
150/// bsl::size_t buffer2Size = sizeof buffer2 / sizeof *buffer2;
151/// bsl::size_t bytesWritten = 0;
152///
153/// // Reversing the conversion returns the original string:
154/// retVal =
155/// BloombergLP::bdlde::CharConvertUcs2::ucs2ToUtf8(buffer2,
156/// buffer2Size,
157/// buffer,
158/// &charsWritten,
159/// &bytesWritten);
160///
161/// assert( 0 == retVal);
162/// assert( 0 == bsl::strcmp(buffer2, "\xc3\x89" "cole"));
163///
164/// // 6 characters written, but 7 bytes, since the first character takes 2
165/// // octets.
166///
167/// assert( 6 == charsWritten);
168/// assert( 7 == bytesWritten);
169/// }
170/// @endcode
171/// In this example, a UTF-8 input string is converted then passed to another
172/// function, which expects a UCS-2 buffer.
173///
174/// First, we define a utility *strlen* replacement for UCS-2:
175/// @code
176/// int wideStrlen(const unsigned short *str)
177/// {
178/// int len = 0;
179///
180/// while (*str++) {
181/// ++len;
182/// }
183///
184/// return len;
185/// }
186/// @endcode
187/// Now, some arbitrary function that calls `wideStrlen`:
188/// @code
189/// void functionRequiringUcs2(const unsigned short *str, bsl::size_t strLen)
190/// {
191/// // Would probably do something more reasonable here.
192///
193/// assert(wideStrlen(str) + 1 == static_cast<int>(strLen));
194/// }
195/// @endcode
196/// Finally, we can take some UTF-8 as an input and call
197/// `functionRequiringUcs2`:
198/// @code
199/// void processUtf8(const char *strU8)
200/// {
201/// unsigned short buffer[1024]; // some "large enough" size
202/// bsl::size_t buffSize = sizeof buffer / sizeof *buffer;
203/// bsl::size_t charsWritten = 0;
204///
205/// int result =
206/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer,
207/// buffSize,
208/// strU8,
209/// &charsWritten);
210///
211/// if (0 == result) {
212/// functionRequiringUcs2(buffer, charsWritten);
213/// }
214/// }
215/// @endcode
216///
217/// ### Example 3: C++-Style Interface {#bdlde_charconvertucs2-example-3-c--style-interface}
218///
219///
220/// The following snippet of code illustrates a typical use of the
221/// `bdlde::CharConvertUcs2` struct's C++-style utility functions, converting a
222/// simple UTF-8 string to UCS-2.
223/// @code
224/// void loadUCS2Hello(bsl::vector<unsigned short> *result)
225/// {
226/// int retVal =
227/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(result,
228/// "Hello");
229///
230/// assert( 0 == retVal);
231/// assert('H' == (*result)[0]);
232/// assert('e' == (*result)[1]);
233/// assert('l' == (*result)[2]);
234/// assert('l' == (*result)[3]);
235/// assert('o' == (*result)[4]);
236/// assert( 0 == (*result)[5]);
237/// assert( 6 == result->size());
238/// }
239/// @endcode
240/// The following snippet of code illustrates another typical use of the
241/// `bdlde::CharConvertUcs2` struct's C++-style utility functions, first
242/// converting from UTF-8 to UCS-2, and then converting back to make sure the
243/// round trip returns the same value.
244/// @code
245/// void checkCppRoundTrip()
246/// {
247/// bsl::vector<unsigned short> result;
248///
249/// // "&Eacute;cole", the French word for School. &Eacute; is the HTML
250/// // entity corresponding to "Unicode-E WITH ACUTE, LATIN CAPITAL LETTER".
251/// int retVal =
252/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(&result,
253/// "\xc3\x89" "cole");
254///
255/// assert( 0 == retVal);
256/// assert(0xc9 == result[0]); // Unicode-E WITH ACUTE, LATIN CAPITAL LETTER
257/// assert('c' == result[1]);
258/// assert('o' == result[2]);
259/// assert('l' == result[3]);
260/// assert('e' == result[4]);
261/// assert( 0 == result[5]);
262/// assert( 6 == result.size());
263///
264/// bsl::string result2;
265/// bsl::size_t charsWritten = 0;
266///
267/// // Reversing the conversion returns the original string:
268/// retVal =
269/// BloombergLP::bdlde::CharConvertUcs2::ucs2ToUtf8(&result2,
270/// &result.front(),
271/// &charsWritten);
272///
273/// assert( 0 == retVal);
274/// assert( result2 == "\xc3\x89" "cole");
275///
276/// // 6 characters written (including the null-terminator), and 6 bytes,
277/// // since the first character takes 2 octets and the null-terminator is
278/// // not counted in "length()".
279/// assert( 6 == charsWritten);
280/// assert( 6 == result2.length());
281/// }
282/// @endcode
283/// In this example, a UTF-8 input string is converted then returned.
284/// @code
285/// bsl::vector<unsigned short> processUtf8(const bsl::string& strU8)
286/// {
287/// bsl::vector<unsigned short> result;
288///
289/// BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(&result, strU8.c_str());
290///
291/// return result;
292/// }
293/// @endcode
294/// @}
295/** @} */
296/** @} */
297
298/** @addtogroup bdl
299 * @{
300 */
301/** @addtogroup bdlde
302 * @{
303 */
304/** @addtogroup bdlde_charconvertucs2
305 * @{
306 */
307
308#include <bdlscm_version.h>
309
311
312#include <bsl_cstddef.h> // 'bsl::size_t'
313#include <bsl_string.h>
314#include <bsl_vector.h>
315
316#include <bsls_libraryfeatures.h>
317
318#include <string> // 'std::string', 'std::pmr::string'
319#include <vector> // 'std::vector', 'std::pmr::vector'
320
321
322
323namespace bdlde {
324 // ======================
325 // struct CharConvertUcs2
326 // ======================
327
328/// This `struct` provides a namespace for a suite of pure procedures to
329/// convert character buffers between UTF-8 and UCS-2. UCS-2 conversions
330/// are performed to/from the full 16-bit space of `2 ^ 16` values (the "UTF-16" hole
331/// U+D800-U+DFFF is not treated as a special case). Note that all C-style
332/// routines in this component honor *strlcpy* semantics, meaning that all
333/// returned C-style strings will be null-terminated as long as the return
334/// buffer size is positive (i.e., `dstCapacity > 0`). Note that since all
335/// UCS-2 operations take place as `unsigned short`s, byte order is not
336/// taken into consideration, and Byte Order Mark (BOM) characters are not
337/// generated. If a BOM is present in the input, it will be translated into
338/// the output.
340
341 // CLASS METHODS
342
343 static int utf8ToUcs2(unsigned short *dstBuffer,
344 bsl::size_t dstCapacity,
345 const char *srcString,
346 bsl::size_t *numCharsWritten = 0,
347 unsigned short errorCharacter = '?');
348
350 const char *srcString,
351 unsigned short errorCharacter = '?');
352 /// Load, into the specified `dstBuffer` of the specified `dstCapacity`,
353 /// the result of converting the specified null-terminated UTF-8
354 /// `srcString` to its UCS-2 equivalent. Optionally specify
355 /// `numCharsWritten` which (if non-zero) indicates the modifiable
356 /// integer into which the number of characters written (including the
357 /// null terminator) is to be loaded. Optionally specify
358 /// `errorCharacter` to be substituted for invalid (i.e., not
359 /// convertible to UCS-2) input characters. If `errorCharacter` is 0,
360 /// invalid input characters are ignored (i.e., produce no corresponding
361 /// output characters). Return 0 on success and a bitwise-or of the
362 /// masks specified by `CharConvertStatus::Enum` otherwise, with
363 /// `CharConvertStatus::k_INVALID_INPUT_BIT` set to indicate that at
364 /// least one invalid input sequence was encountered, and
365 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` set to indicate that
366 /// `dstCapacity` was insufficient to accommodate the output. If
367 /// `dstCapacity` was insufficient, the maximal null-terminated prefix
368 /// of the properly converted result string is loaded into `dstBuffer`,
369 /// and (unless null) `*numCharsWritten` is set to `dstCapacity`. The
370 /// behavior is undefined unless `0 <= dstCapacity`, `dstBuffer` refers
371 /// to an array of at least `dstCapacity` elements, and `srcString` is
372 /// null-terminated. Note that if `dstCapacity` is 0, this function
373 /// returns exactly 2 and `*numCharsWritten` (if specified) is loaded
374 /// with 0 (since there is insufficient space for the null terminator
375 /// even for an empty input string).
376 static int utf8ToUcs2(std::vector<unsigned short> *result,
377 const char *srcString,
378 unsigned short errorCharacter = '?');
379#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR
380 static int utf8ToUcs2(
381 std::pmr::vector<unsigned short> *result,
382 const char *srcString,
383 unsigned short errorCharacter = '?');
384#endif
385 // Load into the specified 'result' the conversion of the specified
386 // null-terminated UTF-8 'srcString' to its null-terminated UCS-2
387 // equivalent. Optionally specify 'errorCharacter' to be substituted
388 // for invalid (i.e., not convertible to UCS-2) input characters. If
389 // 'errorCharacter' is 0, invalid input characters are ignored (i.e.,
390 // produce no corresponding output characters). Return 0 on success
391 // and 'CharConvertStatus::k_INVALID_INPUT_BIT' otherwise, meaning
392 // that at least one sequence of characters was encountered that could
393 // not be translated to UCS-2. If the return value '& 1' is non-zero, one or
394 // more input characters are invalid (in which case the conversion
395 // continues). The behavior is undefined unless 'srcString' is
396 // null-terminated. Note that the null-terminating word counts towards
397 // 'result->size()'.
398
399 static int ucs2ToUtf8(char *dstBuffer,
400 bsl::size_t dstCapacity,
401 const unsigned short *srcString,
402 bsl::size_t *numCharsWritten = 0,
403 bsl::size_t *numBytesWritten = 0);
404
405 static int ucs2ToUtf8(bsl::string *result,
406 const unsigned short *srcString,
407 bsl::size_t *numCharsWritten = 0);
408 /// Load, into the specified `dstBuffer` of the specified `dstCapacity`,
409 /// the result of converting the specified null-terminated UCS-2
410 /// `srcString` to its UTF-8 equivalent. Optionally specify
411 /// `numCharsWritten` which (if not 0) indicates the modifiable integer
412 /// into which the number of *UTF-8 characters* written (including the
413 /// null terminator) is to be loaded. Optionally specify
414 /// `numBytesWritten` which (if not 0) indicates the modifiable integer
415 /// into which the number of *bytes* written (including the null
416 /// terminator) is to be loaded. Return 0 on success and a bitwise-or
417 /// of the masks specified by `CharConvertStatus::Enum` otherwise,
418 /// with `CharConvertStatus::k_INVALID_INPUT_BIT` set to indicate that
419 /// at least one invalid input sequence was encountered, and
420 /// `CharConvertStatus::k_OUT_OF_SPACE_BIT` set to indicate that
421 /// `dstCapacity` was insufficient to accommodate the output. If
422 /// `dstCapacity` was insufficient, the maximal null-terminated prefix
423 /// of the properly converted result string is loaded into `dstBuffer`.
424 /// The behavior is undefined unless `0 <= dstCapacity`, `dstBuffer`
425 /// refers to an array of at least `dstCapacity` elements, and
426 /// `srcString` is null-terminated. Note that if `dstCapacity` is 0,
427 /// this function returns exactly 2 and `*numCharsWritten` and
428 /// `*numBytesWritten` (if not null) are loaded with 0 (since there is
429 /// insufficient space for the null terminator even for an empty input
430 /// string). Also note that since UTF-8 is a variable-length encoding,
431 /// it is possible for `*numBytesWritten` to be greater than
432 /// `*numCharsWritten`, and therefore that an input `srcString` of
433 /// `dstCapacity - 1` *characters* may not fit into `dstBuffer`.
434 static int ucs2ToUtf8(std::string *result,
435 const unsigned short *srcString,
436 bsl::size_t *numCharsWritten = 0);
437#ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR_STRING
438 static int ucs2ToUtf8(std::pmr::string *result,
439 const unsigned short *srcString,
440 bsl::size_t *numCharsWritten = 0);
441#endif
442 // Load, into the specified 'result', the conversion of the specified
443 // null-terminated UCS-2 'srcString' to its UTF-8 equivalent.
444 // Optionally specify 'numCharsWritten' which (if not 0) indicates the
445 // modifiable integer into which the number of *characters* written
446 // (including the null terminator) is to be loaded. Return 0 on
447 // success and 'CharConvertStatus::k_INVALID_INPUT_BIT' otherwise,
448 // meaning that at least one sequence of characters was encountered
449 // that could not be translated to UTF-8. The behavior is undefined
450 // unless 'srcString' is null-terminated. Note that the
451 // null-terminating character is not counted in 'result->length()'.
452 // Also note that this function does not currently implement failure
453 // modes; however, this could change if UTF-8 input validation is
454 // added.
455};
456
457} // close package namespace
458
459
460#endif
461
462// ----------------------------------------------------------------------------
463// Copyright 2015 Bloomberg Finance L.P.
464//
465// Licensed under the Apache License, Version 2.0 (the "License");
466// you may not use this file except in compliance with the License.
467// You may obtain a copy of the License at
468//
469// http://www.apache.org/licenses/LICENSE-2.0
470//
471// Unless required by applicable law or agreed to in writing, software
472// distributed under the License is distributed on an "AS IS" BASIS,
473// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
474// See the License for the specific language governing permissions and
475// limitations under the License.
476// ----------------------------- END-OF-FILE ----------------------------------
477
478/** @} */
479/** @} */
480/** @} */
Definition bslstl_string.h:1281
Definition bslstl_vector.h:1025
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdlde_base64alphabet.h:118
Definition bdlde_charconvertucs2.h:339
static int ucs2ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned short *srcString, bsl::size_t *numCharsWritten=0, bsl::size_t *numBytesWritten=0)
static int utf8ToUcs2(std::vector< unsigned short > *result, const char *srcString, unsigned short errorCharacter='?')
static int ucs2ToUtf8(std::string *result, const unsigned short *srcString, bsl::size_t *numCharsWritten=0)
static int utf8ToUcs2(unsigned short *dstBuffer, bsl::size_t dstCapacity, const char *srcString, bsl::size_t *numCharsWritten=0, unsigned short errorCharacter='?')
static int utf8ToUcs2(bsl::vector< unsigned short > *result, const char *srcString, unsigned short errorCharacter='?')
static int ucs2ToUtf8(bsl::string *result, const unsigned short *srcString, bsl::size_t *numCharsWritten=0)