BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdljsn_stringutil.h
Go to the documentation of this file.
1/// @file bdljsn_stringutil.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdljsn_stringutil.h -*-C++-*-
8#ifndef INCLUDED_BDLJSN_STRINGUTIL
9#define INCLUDED_BDLJSN_STRINGUTIL
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdljsn_stringutil bdljsn_stringutil
15/// @brief Provide utility functions for JSON strings.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdljsn
19/// @{
20/// @addtogroup bdljsn_stringutil
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdljsn_stringutil-purpose"> Purpose</a>
25/// * <a href="#bdljsn_stringutil-classes"> Classes </a>
26/// * <a href="#bdljsn_stringutil-description"> Description </a>
27/// * <a href="#bdljsn_stringutil-json-strings"> JSON Strings </a>
28/// * <a href="#bdljsn_stringutil-guarantees-arbitrary-utf-8-to-json-string"> Guarantees: Arbitrary UTF-8 to JSON String </a>
29/// * <a href="#bdljsn_stringutil-strictness"> Strictness </a>
30/// * <a href="#bdljsn_stringutil-example-variance"> Example Variance </a>
31/// * <a href="#bdljsn_stringutil-usage"> Usage </a>
32/// * <a href="#bdljsn_stringutil-example-1-encoding-and-decoding-a-json-string"> Example 1: Encoding and Decoding a JSON String </a>
33///
34/// # Purpose {#bdljsn_stringutil-purpose}
35/// Provide utility functions for JSON strings.
36///
37/// # Classes {#bdljsn_stringutil-classes}
38///
39/// - bdljsn::StringUtil: namespace for utility functions on JSON strings
40///
41/// # Description {#bdljsn_stringutil-description}
42/// This component defines a utility `struct`,
43/// `bdljsn::StringUtil`, that is a namespace for functions that convert
44/// arbitrary UTF-8 codepoint sequences to JSON strings and vice versa. The
45/// rules for these conversions are outlined below in {JSON Strings} and
46/// detailed in: https://www.rfc-editor.org/rfc/rfc8259#section-7 (RFC8259)
47///
48/// This utility provides two key functions:
49///
50/// * `writeString`: Given an arbitrary UTF-8 codepoint sequence, generate a
51/// JSON string representing the same codepoints.
52/// * `readString`: Given a JSON string (e.g., the output of `writeString`),
53/// generate the equivalent sequence of UTF-8 code points.
54///
55/// When using these functions, a UTF-8 codepoint sequence is always preserved
56/// on the round trip to JSON string and back; however, since there are
57/// equivalent allowed representations of a JSON string, the converse is not
58/// guaranteed.
59///
60/// ## JSON Strings {#bdljsn_stringutil-json-strings}
61///
62///
63/// JSON strings consist of UTF-8 codepoints surrounded by double quotes (i.e.,
64/// '\"'). Within those double quotes certain characters *must* be escaped (i.e.,
65/// replaced with some alternative, multi-byte representation). Those
66/// characters are:
67///
68/// * quotation marks
69/// * backslashes (a.k.a., a "reverse solidus")
70/// * the "control characters" in the range `U+0000` to `U+001F` (inclusive).
71///
72/// Each of the above characters can be escaped by replacing it with the six
73/// byte sequence consisting of:
74///
75/// - a backslash,
76/// - a lower-case `u`, and
77/// - the Unicode value expressed as four hexadecimal digits.
78///
79/// For example, the character that rings the console bell is represented as
80/// '\u0007'. Note that the hexadecimal digits can use upper or lower case
81/// letters but the lead `u` character must be lower case. See {Strictness}.
82///
83/// Eight characters (seven that must be escaped, plus the slash, which may
84/// optionally be escaped) can alternatively be represented by 2-byte sequences:
85/// @code
86/// +---------+-----------------+---------------+---------------+
87/// | Unicode | Description | 6-byte escape | 2-byte escape |
88/// +---------+-----------------+---------------+---------------+
89/// | U+0022 | quotation mark | \u0022 | \" |
90/// | U+005C | backslash | \u005c | \\ |
91/// | U+002F | slash | \u002f | \/ |
92/// | U+0008 | backspace | \u0008 | \b |
93/// | U+000C | form feed | \u000C | \f |
94/// | U+000A | line feed | \u000A | \n |
95/// | U+000D | carriage return | \u000D | \r |
96/// | U+0009 | tab | \u0009 | \t |
97/// +---------+-----------------+---------------+---------------+
98/// @endcode
99/// Note that the above set is similar to but not identical to the set of two
100/// byte `char` literals supported by C++. For example, '\0' (null) and '\a'
101/// (bell) are not included above.
102///
103/// ### Guarantees: Arbitrary UTF-8 to JSON String {#bdljsn_stringutil-guarantees-arbitrary-utf-8-to-json-string}
104///
105///
106/// * No UTF-8 characters in the *Basic* *Multilingual* *Plane* are escaped
107/// unless they are in the set that *must* be escaped.
108/// * When a character must be escaped, the 6-byte (hexadecimal) representation
109/// is used only if no 2-byte escape exists.
110/// * When a 6-byte (hexadecimal) representation is used, hexadecimal letters
111/// are in upper case.
112/// * All UTF-8 characters outside of the *Basic* *Multilingual* *Plane*
113/// are represented by two, adjacent 6-byte hexadecimal escape
114/// sequences. For details, see:
115/// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
116///
117/// ## Strictness {#bdljsn_stringutil-strictness}
118///
119///
120/// By default, the `bdljsn::StringUtil` read and write methods strictly follow
121/// the RFC8259 standard. Variances from those rules are expressed using
122/// `bdljsn::StringUtil::Flags`, an `enum` of flag values that can be set in the
123/// optional `flags` parameter of the decoding methods. Multiple flags can be
124/// bitwise set in `flags`; however, currently, just one variance flag is
125/// defined.
126///
127/// ### Example Variance {#bdljsn_stringutil-example-variance}
128///
129///
130/// RFC8259 specifies that the 6-byte Unicode escape sequence start with a
131/// backslash, `\`, and lower-case `u`. However, if the
132/// `bdljsn::StringUtil::e_ACCEPT_CAPITAL_UNICODE_ESCAPE` is set, an upper-case
133/// `U` is accepted as well. Thus, both '\u0007' and '\U0007' would be
134/// interpreted as the BELL character.
135///
136/// ## Usage {#bdljsn_stringutil-usage}
137///
138///
139/// This section illustrates intended use of this component.
140///
141/// ### Example 1: Encoding and Decoding a JSON String {#bdljsn_stringutil-example-1-encoding-and-decoding-a-json-string}
142///
143///
144/// First, we initialize a string with a valid sequence of UTF-8 codepoints.
145/// @code
146/// bsl::string initial("Does the name \"Ivan Pavlov\" ring a bell\a?\n");
147/// assert(bdlde::Utf8Util::isValid(initial));
148/// @endcode
149/// Notice that, as required by C++ syntax, several characters are represented
150/// by their two-character escape sequence: double quote (twice), bell, and
151/// newline.
152///
153/// Then, we examine the string as output:
154/// @code
155/// bsl::cout << initial << bsl::endl;
156/// @endcode
157/// and observe:
158/// @code
159/// Does the name "Ivan Pavlov" ring a bell?
160///
161/// @endcode
162/// Notice that the backslash characters (having served their purpose of giving
163/// special meaning to the subsequent character) are not shown. The BELL and
164/// NEWLINE characters are output but are not visible.
165///
166/// Now, we generate the JSON string equivalent of the `initial` string.
167/// @code
168/// bsl::ostringstream oss;
169///
170/// int rcEncode = bdljsn::StringUtil::writeString(oss, initial);
171/// assert(0 == rcEncode);
172///
173/// bsl::string jsonCompatibleString = oss.str();
174/// bsl::cout << jsonCompatibleString << bsl::endl;
175/// @endcode
176/// and observe how the `initial` string is represented for JSON:
177/// @code
178/// "Does the name \"Ivan Pavlov\" ring a bell\u0007?\n"
179/// @endcode
180/// Notice that:
181/// * The entire string is delimited by double quotes.
182/// * The interior double quotes and new line are represented by two character
183/// escape sequences (as they were in the C++ string literal).
184/// * Since JSON does not have a two character escape sequence for the BELL
185/// character, '\u0007', the 6-byte Unicode representation is used.
186///
187/// Finally, we convert the `jsonCompatibleString` back to its original content:
188/// @code
189/// bsl::string fromJsonString;
190/// const int rcDecode = bdljsn::StringUtil::readString(
191/// &fromJsonString,
192/// jsonCompatibleString);
193/// assert(0 == rcDecode);
194/// assert(initial == fromJsonString);
195///
196/// bsl::cout << fromJsonString << bsl::endl;
197/// @endcode
198/// and observe (again):
199/// @code
200/// Does the name "Ivan Pavlov" ring a bell?
201///
202/// @endcode
203/// @}
204/** @} */
205/** @} */
206
207/** @addtogroup bdl
208 * @{
209 */
210/** @addtogroup bdljsn
211 * @{
212 */
213/** @addtogroup bdljsn_stringutil
214 * @{
215 */
216
217#include <bdlscm_version.h>
218
219#include <bsls_assert.h>
220
221#include <bsl_ostream.h>
222#include <bsl_string.h>
223#include <bsl_string_view.h>
224
225
226namespace bdljsn {
227
228 // =================
229 // struct StringUtil
230 // =================
231
232/// This class provides utility functions for converting arbitrary UTF-8
233/// sequences into JSON strings and vice versa. See {JSON Strings} in
234/// {DESCRIPTION} for details of these transformations.
236
237 public:
238 // TYPES
243
244 // CLASS METHODS
245
246 /// Load to the specified `value` the UTF-8 codepoint sequence
247 /// equivalent to the specified (JSON) `string` (see {JSON Strings}).
248 /// Return 0 on success and a non-zero value otherwise. Optionally
249 /// specify `flags` to request variances from certain rules of JSON
250 /// decoding (see {Strictness}).
251 static int readString(bsl::string *value,
252 const bsl::string_view& string,
253 int flags = e_NONE);
254
255 /// Load to the specified `value` the UTF-8 codepoint sequence
256 /// equivalent to the specified `string`, that is JSON-compliant absent
257 /// the leading and trailing double quote characters (see {JSON
258 /// Strings}). Return 0 on success and a non-zero value otherwise.
259 /// Optionally specify `flags` to request variances from certain rules
260 /// of JSON decoding (see {Strictness}).
262 const bsl::string_view& string,
263 int flags = e_NONE);
264
265 /// Write to the specified `stream` a JSON-compliant string that is
266 /// equivalent to the specified `string`, an arbitrary UTF-8 codepoint
267 /// sequence. Return 0 on success and a non-zero value otherwise. The
268 /// operation fails if `string` is not a sequence of UTF-8 codepoints or
269 /// if there is an error writing to `stream`. See {Guarantees:
270 /// Arbitrary UTF-8 to JSON String} for further details.
271 static int writeString(bsl::ostream& stream,
272 const bsl::string_view& string);
273};
274
275// ============================================================================
276// INLINE DEFINITIONS
277// ============================================================================
278
279 // -----------------
280 // struct StringUtil
281 // -----------------
282
283// CLASS METHODS
284inline
286 const bsl::string_view& string,
287 int flags)
288{
289 BSLS_ASSERT(value);
290
291 if (2 > string.size()) {
292 return -1; // RETURN
293 }
294
295 if (string[0] != '"' && string[string.size() - 1] != '"') {
296 return -1; // RETURN
297 }
298
299 const bsl::string_view contents = string.substr(1, string.size() - 2);
300 return readUnquotedString(value, contents, flags);
301}
302
303} // close package namespace
304
305
306#endif
307
308// ----------------------------------------------------------------------------
309// Copyright 2022 Bloomberg Finance L.P.
310//
311// Licensed under the Apache License, Version 2.0 (the "License");
312// you may not use this file except in compliance with the License.
313// You may obtain a copy of the License at
314//
315// http://www.apache.org/licenses/LICENSE-2.0
316//
317// Unless required by applicable law or agreed to in writing, software
318// distributed under the License is distributed on an "AS IS" BASIS,
319// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
320// See the License for the specific language governing permissions and
321// limitations under the License.
322// ----------------------------- END-OF-FILE ----------------------------------
323
324/** @} */
325/** @} */
326/** @} */
Definition bslstl_stringview.h:441
BSLS_KEYWORD_CONSTEXPR_CPP14 basic_string_view substr(size_type position=0, size_type numChars=npos) const
Definition bslstl_stringview.h:1799
Definition bslstl_string.h:1281
#define BSLS_ASSERT(X)
Definition bsls_assert.h:1804
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdljsn_error.h:143
Definition bdljsn_stringutil.h:235
Flags
Definition bdljsn_stringutil.h:239
@ e_ACCEPT_CAPITAL_UNICODE_ESCAPE
Definition bdljsn_stringutil.h:241
@ e_NONE
Definition bdljsn_stringutil.h:240
static int readString(bsl::string *value, const bsl::string_view &string, int flags=e_NONE)
Definition bdljsn_stringutil.h:285
static int writeString(bsl::ostream &stream, const bsl::string_view &string)
static int readUnquotedString(bsl::string *value, const bsl::string_view &string, int flags=e_NONE)