// bdlde_charconvertucs2.h -*-C++-*- // ---------------------------------------------------------------------------- // NOTICE // // This component is not up to date with current BDE coding standards, and // should not be used as an example for new development. // ---------------------------------------------------------------------------- #ifndef INCLUDED_BDLDE_CHARCONVERTUCS2 #define INCLUDED_BDLDE_CHARCONVERTUCS2 #include <bsls_ident.h> BSLS_IDENT("$Id: $") //@PURPOSE: Provide efficient conversions between UTF-8 and UCS-2 encodings. // //@CLASSES: // bdlde::CharConvertUcs2: namespace for conversions between UTF-8 and UCS-2 // //@DESCRIPTION: This component provides a suite of pure procedures supporting // the *fast* conversion of *valid* UTF-8 encoded "C" strings to *valid* UCS-2 // 16-bit character arrays and vice versa. In order to provide the fastest // possible implementation, some error checking is deliberately omitted, and // the input strings are required to be null-terminated; however, all C-style // functions will honor 'strlcpy' semantics and null-terminate any output // buffer having a non-zero length. // ///History and Motivation ///---------------------- // UTF-8 is a character encoding that allows 32-bit character sets like Unicode // to be represented using null-terminated (8-bit) byte strings (NTBS), while // allowing "standard ASCII" strings to be used "as-is". Note that UTF-8 is // described in detail in RFC 2279 (http://tools.ietf.org/html/rfc2279). // // UCS-2 is a 16-bit character encoding with no support for "higher-order" // character encodings. UCS-2 is equivalent to UTF-16 in the Basic // Multilingual Plane (BMP) of Unicode (the first 65536 character points, // excluding the "surrogate code points" U+D800-U+DFFF, which do not map to // Unicode characters). If the characters being represented are within the // BMP, then UCS-2 can be thought of as "the Windows encoding" for // international characters. Historically, UCS-2 was the only "wide char" // representation for Windows versions prior to Windows 2000. UTF-16 was // adopted instead for Windows 2000, and has been used ever since. // // Most conversion routines strive for correctness at the cost of performance. // The 'glib' conversion routines are *much* slower than the functions // implemented here because the 'glib' functions first compute the number of // output characters required, allocate the memory for them, and then perform // the conversion, validating the input characters. The C-style methods of // 'bdlde::CharConvertUcs2', on the other hand, assume that the user-provided // output buffer is wide enough, make a "best effort" to convert into it, and // return an error code if not enough space was provided. The C++-style // methods are more forgiving, since the output 'bsl::string' or // 'bsl::vector<unsigned short>' is resized as needed. No attempt is made to // validate whether the character codes correspond to valid Unicode code // points, nor is validation performed to check for overlong UTF-8 encodings // (where characters that could be expressed in one octet are encoded using two // octets). // ///Usage ///- - - // This section illustrates intended use of this component. // ///Example 1: C-Style Interface /// - - - - - - - - - - - - - - // The following snippet of code illustrates a typical use of the // 'bdlde::CharConvertUcs2' struct's C-style utility functions, converting a // simple UTF-8 string to UCS-2. //.. // void testCFunction1() // { // unsigned short buffer[256]; // arbitrary "wide-enough" size // bsl::size_t buffSize = sizeof buffer / sizeof *buffer; // bsl::size_t charsWritten; // // int retVal = // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer, // buffSize, // "Hello", // &charsWritten); // // assert( 0 == retVal); // assert('H' == buffer[0]); // assert('e' == buffer[1]); // assert('l' == buffer[2]); // assert('l' == buffer[3]); // assert('o' == buffer[4]); // assert( 0 == buffer[5]); // assert( 6 == charsWritten); // } //.. // ///Example 2: C-Style Round-Trip ///- - - - - - - - - - - - - - - // The following snippet of code illustrates another typical use of the // 'bdlde::CharConvertUcs2' struct's C-style utility functions, converting a // simple UTF-8 string to UCS-2, then converting the UCS-2 back and making sure // the round-trip conversion results in the input. //.. // void testCFunction2() // { // unsigned short buffer[256]; // arbitrary "wide-enough" size // bsl::size_t buffSize = sizeof buffer / sizeof *buffer; // bsl::size_t charsWritten; // // // "École", the French word for School. 'É' is the HTML // // entity equivalent to "Unicode-E WITH ACUTE, LATIN CAPITAL LETTER". // int retVal = // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer, // buffSize, // "\xc3\x89" "cole", // &charsWritten); // // assert( 0 == retVal); // assert(0xc9 == buffer[0]); // Unicode-E WITH ACUTE, LATIN CAPITAL LETTER // assert('c' == buffer[1]); // assert('o' == buffer[2]); // assert('l' == buffer[3]); // assert('e' == buffer[4]); // assert( 0 == buffer[5]); // assert( 6 == charsWritten); // // char buffer2[256]; // arbitrary "wide-enough" size // bsl::size_t buffer2Size = sizeof buffer2 / sizeof *buffer2; // bsl::size_t bytesWritten = 0; // // // Reversing the conversion returns the original string: // retVal = // BloombergLP::bdlde::CharConvertUcs2::ucs2ToUtf8(buffer2, // buffer2Size, // buffer, // &charsWritten, // &bytesWritten); // // assert( 0 == retVal); // assert( 0 == bsl::strcmp(buffer2, "\xc3\x89" "cole")); // // // 6 characters written, but 7 bytes, since the first character takes 2 // // octets. // // assert( 6 == charsWritten); // assert( 7 == bytesWritten); // } //.. // In this example, a UTF-8 input string is converted then passed to another // function, which expects a UCS-2 buffer. // // First, we define a utility *strlen* replacement for UCS-2: //.. // int wideStrlen(const unsigned short *str) // { // int len = 0; // // while (*str++) { // ++len; // } // // return len; // } //.. // Now, some arbitrary function that calls 'wideStrlen': //.. // void functionRequiringUcs2(const unsigned short *str, bsl::size_t strLen) // { // // Would probably do something more reasonable here. // // assert(wideStrlen(str) + 1 == static_cast<int>(strLen)); // } //.. // Finally, we can take some UTF-8 as an input and call // 'functionRequiringUcs2': //.. // void processUtf8(const char *strU8) // { // unsigned short buffer[1024]; // some "large enough" size // bsl::size_t buffSize = sizeof buffer / sizeof *buffer; // bsl::size_t charsWritten = 0; // // int result = // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(buffer, // buffSize, // strU8, // &charsWritten); // // if (0 == result) { // functionRequiringUcs2(buffer, charsWritten); // } // } //.. // ///Example 3: C++-Style Interface /// - - - - - - - - - - - - - - - // The following snippet of code illustrates a typical use of the // 'bdlde::CharConvertUcs2' struct's C++-style utility functions, converting a // simple UTF-8 string to UCS-2. //.. // void loadUCS2Hello(bsl::vector<unsigned short> *result) // { // int retVal = // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(result, // "Hello"); // // assert( 0 == retVal); // assert('H' == (*result)[0]); // assert('e' == (*result)[1]); // assert('l' == (*result)[2]); // assert('l' == (*result)[3]); // assert('o' == (*result)[4]); // assert( 0 == (*result)[5]); // assert( 6 == result->size()); // } //.. // The following snippet of code illustrates another typical use of the // 'bdlde::CharConvertUcs2' struct's C++-style utility functions, first // converting from UTF-8 to UCS-2, and then converting back to make sure the // round trip returns the same value. //.. // void checkCppRoundTrip() // { // bsl::vector<unsigned short> result; // // // "École", the French word for School. É is the HTML // // entity corresponding to "Unicode-E WITH ACUTE, LATIN CAPITAL LETTER". // int retVal = // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(&result, // "\xc3\x89" "cole"); // // assert( 0 == retVal); // assert(0xc9 == result[0]); // Unicode-E WITH ACUTE, LATIN CAPITAL LETTER // assert('c' == result[1]); // assert('o' == result[2]); // assert('l' == result[3]); // assert('e' == result[4]); // assert( 0 == result[5]); // assert( 6 == result.size()); // // bsl::string result2; // bsl::size_t charsWritten = 0; // // // Reversing the conversion returns the original string: // retVal = // BloombergLP::bdlde::CharConvertUcs2::ucs2ToUtf8(&result2, // &result.front(), // &charsWritten); // // assert( 0 == retVal); // assert( result2 == "\xc3\x89" "cole"); // // // 6 characters written (including the null-terminator), and 6 bytes, // // since the first character takes 2 octets and the null-terminator is // // not counted in "length()". // assert( 6 == charsWritten); // assert( 6 == result2.length()); // } //.. // In this example, a UTF-8 input string is converted then returned. //.. // bsl::vector<unsigned short> processUtf8(const bsl::string& strU8) // { // bsl::vector<unsigned short> result; // // BloombergLP::bdlde::CharConvertUcs2::utf8ToUcs2(&result, strU8.c_str()); // // return result; // } //.. #include <bdlscm_version.h> #include <bdlde_charconvertstatus.h> #include <bsl_cstddef.h> // 'bsl::size_t' #include <bsl_string.h> #include <bsl_vector.h> #include <bsls_libraryfeatures.h> #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR #include <memory_resource> #endif #include <string> // 'std::string', 'std::pmr::string' #include <vector> // 'std::vector', 'std::pmr::vector' namespace BloombergLP { namespace bdlde { // ====================== // struct CharConvertUcs2 // ====================== struct CharConvertUcs2 { // This 'struct' provides a namespace for a suite of pure procedures to // convert character buffers between UTF-8 and UCS-2. UCS-2 conversions // are performed to/from the full '2 ^ 16' bit space (the "UTF-16" hole // U+D800-U+DFFF is not treated as a special case). Note that all C-style // routines in this component honor *strlcpy* semantics, meaning that all // returned C-style strings will be null-terminated as long as the return // buffer size is positive (i.e., 'dstCapacity > 0'). Note that since all // UCS-2 operations take place as 'unsigned short's, byte order is not // taken into consideration, and Byte Order Mark (BOM) characters are not // generated. If a BOM is present in the input, it will be translated into // the output. // CLASS METHODS static int utf8ToUcs2(unsigned short *dstBuffer, bsl::size_t dstCapacity, const char *srcString, bsl::size_t *numCharsWritten = 0, unsigned short errorCharacter = '?'); // Load, into the specified 'dstBuffer' of the specified 'dstCapacity', // the result of converting the specified null-terminated UTF-8 // 'srcString' to its UCS-2 equivalent. Optionally specify // 'numCharsWritten' which (if non-zero) indicates the modifiable // integer into which the number of characters written (including the // null terminator) is to be loaded. Optionally specify // 'errorCharacter' to be substituted for invalid (i.e., not // convertible to UCS-2) input characters. If 'errorCharacter' is 0, // invalid input characters are ignored (i.e., produce no corresponding // output characters). Return 0 on success and a bitwise-or of the // masks specified by 'CharConvertStatus::Enum' otherwise, with // 'CharConvertStatus::k_INVALID_INPUT_BIT' set to indicate that at // least one invalid input sequence was encountered, and // 'CharConvertStatus::k_OUT_OF_SPACE_BIT' set to indicate that // 'dstCapacity' was insufficient to accommodate the output. If // 'dstCapacity' was insufficient, the maximal null-terminated prefix // of the properly converted result string is loaded into 'dstBuffer', // and (unless null) '*numCharsWritten' is set to 'dstCapacity'. The // behavior is undefined unless '0 <= dstCapacity', 'dstBuffer' refers // to an array of at least 'dstCapacity' elements, and 'srcString' is // null-terminated. Note that if 'dstCapacity' is 0, this function // returns exactly 2 and '*numCharsWritten' (if specified) is loaded // with 0 (since there is insufficient space for the null terminator // even for an empty input string). static int utf8ToUcs2(bsl::vector<unsigned short> *result, const char *srcString, unsigned short errorCharacter = '?'); static int utf8ToUcs2(std::vector<unsigned short> *result, const char *srcString, unsigned short errorCharacter = '?'); #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR static int utf8ToUcs2( std::pmr::vector<unsigned short> *result, const char *srcString, unsigned short errorCharacter = '?'); #endif // Load into the specified 'result' the conversion of the specified // null-terminated UTF-8 'srcString' to its null-terminated UCS-2 // equivalent. Optionally specify 'errorCharacter' to be substituted // for invalid (i.e., not convertible to UCS-2) input characters. If // 'errorCharacter' is 0, invalid input characters are ignored (i.e., // produce no corresponding output characters). Return 0 on success // and 'CharConvertStatus::k_INVALILD_CHARS_BIT' otherwise, meaning // that at least one sequence of characters was encountered that could // not be translated to UCS-2. If 'result & 1' is non-zero, one or // more input characters are invalid (in which case the conversion // continues). The behavior is undefined unless 'srcString' is // null-terminated. Note that the null-terminating word counts towards // 'result->size()'. static int ucs2ToUtf8(char *dstBuffer, bsl::size_t dstCapacity, const unsigned short *srcString, bsl::size_t *numCharsWritten = 0, bsl::size_t *numBytesWritten = 0); // Load, into the specified 'dstBuffer' of the specified 'dstCapacity', // the result of converting the specified null-terminated UCS-2 // 'srcString' to its UTF-8 equivalent. Optionally specify // 'numCharsWritten' which (if not 0) indicates the modifiable integer // into which the number of *UTF-8 characters* written (including the // null terminator) is to be loaded. Optionally specify // 'numBytesWritten' which (if not 0) indicates the modifiable integer // into which the number of *bytes* written (including the null // terminator) is to be loaded. Return 0 on success and a bitwise-or // of the masks specified by 'CharConvertStatus::Enum' otherwise, // with 'CharConvertStatus::k_INVALID_INPUT_BIT' set to indicate that // at least one invalid input sequence was encountered, and // 'CharConvertStatus::k_OUT_OF_SPACE_BIT' set to indicate that // 'dstCapacity' was insufficient to accommodate the output. If // 'dstCapacity' was insufficient, the maximal null-terminated prefix // of the properly converted result string is loaded into 'dstBuffer'. // The behavior is undefined unless '0 <= dstCapacity', 'dstBuffer' // refers to an array of at least 'dstCapacity' elements, and // 'srcString' is null-terminated. Note that if 'dstCapacity' is 0, // this function returns exactly 2 and '*numCharsWritten' and // '*numBytesWritten' (if not null) are loaded with 0 (since there is // insufficient space for the null terminator even for an empty input // string). Also note that since UTF-8 is a variable-length encoding, // it is possible for 'numBytesWritten' to be greater than // 'numCharsWritten', and therefore that an input 'srcString' of // 'dstCapacity - 1' *characters* may not fit into 'dstBuffer'. static int ucs2ToUtf8(bsl::string *result, const unsigned short *srcString, bsl::size_t *numCharsWritten = 0); static int ucs2ToUtf8(std::string *result, const unsigned short *srcString, bsl::size_t *numCharsWritten = 0); #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR static int ucs2ToUtf8(std::pmr::string *result, const unsigned short *srcString, bsl::size_t *numCharsWritten = 0); #endif // Load, into the specified 'result', the conversion of the specified // null-terminated UCS-2 'srcString' to its UTF-8 equivalent. // Optionally specify 'numCharsWritten' which (if not 0) indicates the // modifiable integer into which the number of *characters* written // (including the null terminator) is to be loaded. Return 0 on // success and 'CharConvertStatus::k_INVALILD_CHARS_BIT' otherwise, // meaning that at least one sequence of characters was encountered // that could not be translated to UTF-8. The behavior is undefined // unless 'srcString' is null-terminated. Note that the // null-terminating character is not counted in 'result->length()'. // Also note that this function does not currently implement failure // modes; however, this could change if UTF-8 input validation is // added. }; } // close package namespace } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Copyright 2015 Bloomberg Finance L.P. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------