BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdljsn_tokenizer.h
Go to the documentation of this file.
1/// @file bdljsn_tokenizer.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdljsn_tokenizer.h -*-C++-*-
8#ifndef INCLUDED_BDLJSN_TOKENIZER
9#define INCLUDED_BDLJSN_TOKENIZER
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdljsn_tokenizer bdljsn_tokenizer
15/// @brief Provide a tokenizer for extracting JSON data from a `streambuf`.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdljsn
19/// @{
20/// @addtogroup bdljsn_tokenizer
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdljsn_tokenizer-purpose"> Purpose</a>
25/// * <a href="#bdljsn_tokenizer-classes"> Classes </a>
26/// * <a href="#bdljsn_tokenizer-description"> Description </a>
27/// * <a href="#bdljsn_tokenizer-strict-conformance"> Strict Conformance </a>
28/// * <a href="#bdljsn_tokenizer-usage"> Usage </a>
29/// * <a href="#bdljsn_tokenizer-example-1-extracting-json-data-into-an-object"> Example 1: Extracting JSON Data into an Object </a>
30///
31/// # Purpose {#bdljsn_tokenizer-purpose}
32/// Provide a tokenizer for extracting JSON data from a `streambuf`.
33///
34/// # Classes {#bdljsn_tokenizer-classes}
35///
36/// - bdljsn::Tokenizer: tokenizer for parsing JSON data from a `streambuf`
37///
38/// @see baljsn_decoder
39///
40/// # Description {#bdljsn_tokenizer-description}
41/// This component provides a class, `bdljsn::Tokenizer`, that
42/// traverses data stored in a `bsl::streambuf` one node at a time and provides
43/// clients access to the data associated with that node, including its type and
44/// data value. Client code can use the `reset` function to associate a
45/// `bsl::streambuf` containing JSON data with a tokenizer object and then call
46/// the `advanceToNextToken` function to extract individual data values.
47///
48/// This `class` was created to be used by other components in the `bdljsn` and
49/// `baljsn` packages and in most cases clients should use the
50/// @ref bdljsn_jsonutil , @ref baljsn_decoder , or @ref bdljsn_datumutil components
51/// instead of using this `class`.
52///
53/// On malformed JSON, tokenization may fail before the end of input is reached,
54/// but not all such errors are detected. In particular, callers should check
55/// that closing brackets and braces match opening ones.
56///
57/// ## Strict Conformance {#bdljsn_tokenizer-strict-conformance}
58///
59///
60/// The `bdljsn::Tokenizer` class allows several convenient variances from the
61/// JSON grammar as described in RFC8259 (see
62/// https://www.rfc-editor.org/rfc/rfc8259). If strict conformance is needed,
63/// users can put the tokenizer into strict conformance mode (see
64/// `setConformanceMode`). The behavioral differences are each controlled by
65/// options. The differences between a default constructed tokenizer and one in
66/// strict mode are:
67/// @code
68/// Option Default Strict
69/// -------------------------------- ------- ------
70/// allowConsecutiveSeparators true false
71/// allowFormFeedAsWhitespace true false
72/// allowHeterogenousArrays true true
73/// allowNonUtf8StringLiterals true false
74/// allowStandAloneValues true true
75/// allowTrailingTopLevelComma true false
76/// allowUnescapedControlCharacters true false
77/// @endcode
78/// The default-constructed `bdljsn::Tokenizer` is created having the options
/// shown above (in the "Default" column) and a `conformanceMode` of
80/// `bdljsn::e_RELAXED`. Accordingly, users are free to change any of the
81/// option values to any combination that may be needed; however, once a
82/// tokenizer is set to strict mode the options are set to the values shown
83/// above (in the "Strict" column) and changes are not allowed (doing so leads
84/// to undefined behavior) unless the conformance mode is again set to relaxed.
85///
86/// ## Usage {#bdljsn_tokenizer-usage}
87///
88///
89/// This section illustrates intended use of this component.
90///
91/// ## Example 1: Extracting JSON Data into an Object {#bdljsn_tokenizer-example-1-extracting-json-data-into-an-object}
92///
93///
94/// For this example, we will use `bdljsn::Tokenizer` to read each node in a
95/// JSON document and populate a simple `Employee` object.
96///
97/// First, we will define the JSON data that the tokenizer will traverse over:
98/// @code
99/// const char *INPUT = " {\n"
100/// " \"street\" : \"Lexington Ave\",\n"
101/// " \"state\" : \"New York\",\n"
102/// " \"zipcode\" : \"10022-1331\",\n"
103/// " \"floorCount\" : 55\n"
104/// " }";
105/// @endcode
/// Next, we will construct and populate a `streambuf` with this data:
107/// @code
108/// bdlsb::FixedMemInStreamBuf isb(INPUT, bsl::strlen(INPUT));
109/// @endcode
110/// Then, we will create a `bdljsn::Tokenizer` object and associate the above
111/// streambuf with it:
112/// @code
113/// bdljsn::Tokenizer tokenizer;
114/// tokenizer.reset(&isb);
115/// @endcode
116/// Next, we will create an address record type and object.
117/// @code
118/// struct Address {
119/// bsl::string d_street;
120/// bsl::string d_state;
121/// bsl::string d_zipcode;
122/// int d_floorCount;
123/// } address = { "", "", "", 0 };
124/// @endcode
125/// Then, we will traverse the JSON data one node at a time:
126/// @code
127/// // Read '{'
128///
129/// int rc = tokenizer.advanceToNextToken();
130/// assert(!rc);
131///
132/// bdljsn::Tokenizer::TokenType token = tokenizer.tokenType();
133/// assert(bdljsn::Tokenizer::e_START_OBJECT == token);
134///
135/// rc = tokenizer.advanceToNextToken();
136/// assert(!rc);
137/// token = tokenizer.tokenType();
138///
139/// // Continue reading elements till '}' is encountered
140///
141/// while (bdljsn::Tokenizer::e_END_OBJECT != token) {
142/// assert(bdljsn::Tokenizer::e_ELEMENT_NAME == token);
143///
144/// // Read element name
145///
146/// bslstl::StringRef nodeValue;
147/// rc = tokenizer.value(&nodeValue);
148/// assert(!rc);
149///
150/// bsl::string elementName = nodeValue;
151///
152/// // Read element value
153///
154/// int rc = tokenizer.advanceToNextToken();
155/// assert(!rc);
156///
157/// token = tokenizer.tokenType();
158/// assert(bdljsn::Tokenizer::e_ELEMENT_VALUE == token);
159///
160/// rc = tokenizer.value(&nodeValue);
161/// assert(!rc);
162///
163/// // Extract the simple type with the data
164///
165/// if (elementName == "street") {
166/// rc = bdljsn::StringUtil::readString(&address.d_street, nodeValue);
167/// assert(!rc);
168/// }
169/// else if (elementName == "state") {
170/// rc = bdljsn::StringUtil::readString(&address.d_state, nodeValue);
171/// assert(!rc);
172/// }
173/// else if (elementName == "zipcode") {
174/// rc = bdljsn::StringUtil::readString(&address.d_zipcode, nodeValue);
175/// assert(!rc);
176/// }
177/// else if (elementName == "floorCount") {
178/// rc = bdljsn::NumberUtil::asInt(&address.d_floorCount, nodeValue);
179/// assert(!rc);
180/// }
181///
182/// rc = tokenizer.advanceToNextToken();
183/// assert(!rc);
184/// token = tokenizer.tokenType();
185/// }
186/// @endcode
187/// Finally, we will verify that the `address` aggregate has the correct values:
188/// @code
189/// assert("Lexington Ave" == address.d_street);
190/// assert("New York" == address.d_state);
191/// assert("10022-1331" == address.d_zipcode);
192/// assert(55 == address.d_floorCount);
193/// @endcode
194/// @}
195/** @} */
196/** @} */
197
198/** @addtogroup bdl
199 * @{
200 */
201/** @addtogroup bdljsn
202 * @{
203 */
204/** @addtogroup bdljsn_tokenizer
205 * @{
206 */
207
208#include <bdlscm_version.h>
209
211
212#include <bsls_alignedbuffer.h>
213#include <bsls_assert.h>
214#include <bsls_types.h>
215
216#include <bsl_ios.h>
217#include <bsl_streambuf.h>
218#include <bsl_string.h>
219#include <bsl_string_view.h>
220#include <bsl_vector.h>
221
222
223namespace bdljsn {
224
225 // ===============
226 // class Tokenizer
227 // ===============
228
229/// This `class` provides a mechanism for traversing JSON data stored in a
230/// `bsl::streambuf` one node at a time and allows clients to access the
231/// data associated with that node, including its type and data value.
232///
233/// See @ref bdljsn_tokenizer
235
236 public:
237 // TYPES
240
242 // This 'enum' lists all the possible token types.
243
244 e_BEGIN = 1, // starting token
245 e_ELEMENT_NAME, // element name
246 e_START_OBJECT, // start of an object ('{')
247 e_END_OBJECT, // end of an object ('}')
248 e_START_ARRAY, // start of an array ('[')
249 e_END_ARRAY, // end of an array (']')
250 e_ELEMENT_VALUE, // element value of a simple type
251 e_ERROR // error token
252#ifndef BDE_OMIT_INTERNAL_DEPRECATED
261#endif // BDE_OMIT_INTERNAL_DEPRECATED
262 };
263
264 enum { k_EOF = +1 };
265
270
271 private:
272 // PRIVATE TYPES
273 enum ContextType {
274 // This 'enum' lists the possible contexts that the tokenizer can be
275 // in.
276
277 e_NO_CONTEXT, // context stack is empty
278 e_OBJECT_CONTEXT, // object context
279 e_ARRAY_CONTEXT // array context
280 };
281
282 // One intermediate data buffer used for reading data from the stream, and
283 // another for the context state stack.
284
285 enum {
286 k_BUFSIZE = 1024 * 8,
287 k_MAX_STRING_SIZE = k_BUFSIZE - 1,
288
289 k_CONTEXTSTACKBUFSIZE = 256
290 };
291
292 // DATA
294 d_buffer; // string buffer
295
297 d_stackBuffer; // context stack buffer
298
300 d_allocator; // string allocator (owned)
301
303 d_stackAllocator; // context stack allocator (owned)
304
305 bsl::string d_stringBuffer; // string buffer
306
307 bsl::streambuf *d_streambuf_p; // streambuf (held, not owned)
308
309 bsl::size_t d_cursor; // current cursor
310
311 bsl::size_t d_valueBegin; // cursor for beginning of value
312
313 bsl::size_t d_valueEnd; // cursor for end of value
314
315 bsl::size_t d_valueIter; // cursor for iterating value
316
317 Uint64 d_readOffset; // the offset to the end of the
318 // current 'd_stringBuffer'
319 // relative to the start of the
320 // streambuf
321
322 TokenType d_tokenType; // token type
323
324 bsl::vector<char> d_contextStack; // context type stack
325
326 int d_readStatus; // 0 until EOF or an error is
327 // encountered, then indicates
328 // nature of error. Returned by
329 // 'readStatus'
330
331 int d_bufEndStatus; // status of last read from
332 // '*d_streambuf_p'. If non-zero,
333 // copied to 'd_readStatus' on next
334 // read attempt.
335
336 bool d_allowConsecutiveSeparators;
337 // option for allowing consecutive
338 // separators (i.e., ':', or ',')
339
340 bool d_allowFormFeedAsWhitespace;
341 // option for allowing '\f' as
342 // whitespace in addition to ' ',
343 // '\n', '\t', '\r', and '\v'.
344
345 bool d_allowHeterogenousArrays;
346 // option for allowing arrays of
347 // heterogeneous values
348
349 bool d_allowNonUtf8StringLiterals;
350 // Disables UTF-8 validation
351
352 bool d_allowStandAloneValues;
353 // option for allowing stand alone
354 // values
355
356 bool d_allowTrailingTopLevelComma;
357 // if 'true', allows '{},'
358
359 bool d_allowUnescapedControlCharacters;
360 // option for unescaped control
361 // characters in JSON strings.
362
363 ConformanceMode d_conformanceMode; // "relaxed" (default) or "strict"
364
365 // PRIVATE MANIPULATORS
366
367 /// Increase the size of the string buffer, `d_stringBuffer`, and then
368 /// append additional characters, from the internally-held `streambuf` (
369 /// `d_streambuf_p`) to the end of the current sequence of characters.
370 /// Return 0 on success and a non-zero value otherwise.
371 int expandBufferForLargeValue();
372
373 /// Extract the string value starting at the current data cursor and
374 /// update the value begin and end pointers to refer to the begin and
375 /// end of the extracted string. Return 0 on success and a non-zero
376 /// value otherwise.
377 int extractStringValue();
378
379 /// Move the current sequence of characters being tokenized to the front
380 /// of the internal string buffer, `d_stringBuffer`, and then append
381 /// additional characters, from the internally-held `streambuf`
382 /// (`d_streambuf_p`) to the end of that sequence up to a maximum
383 /// sequence length of `d_buffer.size()` characters. Return the number
384 /// of bytes read from the `streambuf`. Note that if 0 is returned, it
385 /// may mean end of file or, if UTF-8 checking is set, that invalid
386 /// UTF-8 was encountered.
387 int moveValueCharsToStartAndReloadBuffer();
388
389 /// If the `d_contextStack` is empty, return `e_NO_CONTEXT`, otherwise
390 /// pop the top context from the `d_contextStack` stack, and return it.
391 ContextType popContext();
392
393 /// Push the specified `context` onto the `d_contextStack` stack.
394 void pushContext(ContextType context);
395
396 /// Reload the string buffer with new data read from the underlying
397 /// `streambuf` and overwriting the current buffer. After reading
398 /// update the cursor to the new read location. Return the number of
399 /// bytes read from the `streambuf`.
400 int reloadStringBuffer();
401
402 /// Skip all characters until a whitespace or a token character is
403 /// encountered and position the cursor onto the first such character.
404 /// Return 0 on success and a non-zero value otherwise.
405 int skipNonWhitespaceOrTillToken();
406
407 /// Skip all whitespace characters and position the cursor onto the
408 /// first non-whitespace character. Return 0 on success and a non-zero
409 /// value otherwise.
410 int skipWhitespace();
411
412 // PRIVATE ACCESSOR
413
414 /// If the `d_contextStack` is empty, return `e_NO_CONTEXT`, otherwise
415 /// return the top context from the `d_contextStack` stack without
416 /// popping.
417 ContextType context() const;
418
419 private:
420 // NOT IMPLEMENTED
421 Tokenizer(const Tokenizer&);
422 Tokenizer& operator=(const Tokenizer&);
423
424 public:
425 // CREATORS
426
427 /// Create a `Tokenizer` object. Optionally specify a `basicAllocator`
428 /// used to supply memory. If `basicAllocator` is 0, the currently
429 /// installed default allocator is used. By default, the
430 /// `conformanceMode` is `e_RELAXED` and the value of the `Tokenizer`
431 /// options are:
432 /// @code
433 /// allowConsecutiveSeparators() == true;
434 /// allowFormFeedAsWhitespace() == true;
435 /// allowHeterogeneousArrays() == true;
436 /// allowNonUtf8StringLiterals() == true;
437 /// allowStandAloneValues() == true;
438 /// allowTrailingTopLevelComma() == true;
439 /// allowUnescapedControlCharacters() == true;
440 /// @endcode
441 /// The `reset` method must be called before any calls to
442 /// `advanceToNextToken` or `resetStreamBufGetPointer`.
443 explicit Tokenizer(bslma::Allocator *basicAllocator = 0);
444
445 /// Destroy this object.
446 ~Tokenizer();
447
448 // MANIPULATORS
449
450 /// Move to the next token in the data steam. Return 0 on success and a
451 /// non-zero value otherwise. Each call to `advanceToNextToken`
452 /// invalidates the string references returned by the `value` accessor
453 /// for prior nodes. This function *may* fail to move to the next token
454 /// if doing so would advanced past a character sequence that is not
455 /// valid JSON, and is guaranteed to do so (fail to move) if
456 /// `e_RELAXED != conformanceMode()`. The behavior is undefined unless
457 /// `reset` has been called.
459
460 /// Reset this tokenizer to read data from the specified `streambuf`.
461 /// Note that the reader will not be on a valid node until
462 /// `advanceToNextToken` is called. Note that this function does not
/// change the `conformanceMode` nor the values of any of the
464 /// individual token options:
465 /// * `allowConsecutiveSeparators`
466 /// * `allowFormFeedAsWhitespace`
467 /// * `allowHeterogenousArrays`
468 /// * `allowNonUtf8StringLiterals`
469 /// * `allowStandAloneValues`
470 /// * `allowTrailingTopLevelComma`
471 /// * `allowUnescapedControlCharacters`
472 void reset(bsl::streambuf *streambuf);
473
474 /// Reset the get pointer of the `streambuf` held by this object to
475 /// refer to the byte following the last processed byte, if the held
476 /// `streambuf` supports seeking, and return an error otherwise leaving
477 /// this object unchanged. Return 0 on success, and a non-zero value
478 /// otherwise. The behavior is undefined unless `reset` has been
479 /// called. Note that after a successful function return users can read
480 /// data from the `streambuf` that was specified during `reset` from
481 /// where this object stopped. Also note that this call implies the end
482 /// of processing for this object and any subsequent methods invoked on
483 /// this object should only be done after calling `reset` and specifying
484 /// a new `streambuf`.
486
487 /// Set the `allowConsecutiveSeparators` option to the specified
488 /// `value` and return a non-`const` reference to this tokenizer. JSON
489 /// defines two separator tokens: the colon (`:`) and the comma (`,`).
490 /// If the `allowConsecutiveSeparartors` value is `true` this tokenizer
491 /// will accept multiple consecutive sequences of a given separator
492 /// (e.g., `"a"::b, "c":::d` and `"a":b,, "c":d`, ,, "e":f') as if a
493 /// single separator had appeared (i.e., `"a":b, "c":d` and
494 /// `"a":b, "c":d`, "e":f', respectively). Otherwise the tokenizer
495 /// returns an error when multiple consecutive colons are found. By
/// default, the value of the `allowConsecutiveSeparators` option is
497 /// `true`. The behavior is undefined unless
498 /// `e_RELAXED == conformanceMode()`. Note that consecutive sequences
499 /// using both tokens (e.g., `::,,::`) is always an error.
501
/// Set the `allowFormFeedAsWhitespace` option to the specified `value`
503 /// and return a non-`const` reference to this tokenizer. If the
504 /// `allowFormFeedAsWhitespace` value is `true` the formfeed character
505 /// ('\f') is recognized as a whitespace character in addition to '\n',
506 /// '\t', '\r', and '\v'. Otherwise, formfeed is diallowed a
507 /// whitewpace.
509
510 /// Set the `allowHeterogenousArrays` option to the specified `value`
511 /// and return a non-`const` reference to this tokenizer. If the
512 /// `allowHeterogenousArrays` value is `true` this tokenizer will
513 /// successfully tokenize heterogeneous values within an array. If the
514 /// option's value is `false` then the tokenizer will return an error
515 /// for arrays having heterogeneous values. By default, the value of
516 /// the `allowHeterogenousArrays` option is `true`. The behavior is
517 /// undefined unless `e_RELAXED == conformanceMode()`.
519
520 /// Set the `allowNonUtf8StringLiterals` option to the specified `value`
521 /// and return a non-`const` reference to this tokenizer. If the
522 /// `allowNonUtf8StringLiterals` value is `false` this tokenizer will
523 /// check string literal tokens for invalid UTF-8, enter an error mode
524 /// if it encounters a string literal token that has any content that is
525 /// not UTF-8, and fail to advance to subsequent tokens until `reset` is
526 /// called. By default, the value of the `allowNonUtf8StringLiterals`
527 /// option is `true`. The behavior is undefined unless
528 /// `e_RELAXED == conformanceMode()`.
530
531 /// Set the `allowStandAloneValues` option to the specified `value` and
532 /// return a non-`const` reference to this tokenizer. If the
533 /// `allowStandAloneValues` value is `true` this tokenizer will
534 /// successfully tokenize JSON values (strings and numbers). If the
535 /// option's value is `false` then the tokenizer will only tokenize
536 /// complete JSON documents (JSON objects and arrays) and return an
537 /// error for stand alone JSON values. By default, the value of the
538 /// `allowStandAloneValues` option is `true`. The behavior is undefined
539 /// unless `e_RELAXED == conformanceMode()`.
541
542 /// Set the `allowTrailingTopLevelComma` option to the specified `value`
543 /// and return a non-`const` reference to this tokenizer. If the
544 /// `allowTrailingTopLevelComma` value is `true` this tokenizer will
545 /// successfully tokenize JSON values where a comma follows the
546 /// top-level JSON element. If the option's value is `false` then the
547 /// tokenizer will reject documents with such trailing commas, such as
548 /// `{},`. By default, the value of the `allowTrailingTopLevelComma`
549 /// option is `true` for backwards compatibility. Note that a document
550 /// without any JSON elements is invalid whether or not it contains
551 /// commas. The behavior is undefined unless
552 /// `e_RELAXED == conformanceMode()`.
554
555 /// Set the `allowUnescapedControlCharacters` option of this tokenizer
556 /// to the specified `value`. If `true`, characters in the range
557 /// `[ 0x00 .. 0x1F ]` are allowed in JSON strings. If the option is
558 /// `false`, these characters must be represented by their six byte
559 /// escape sequences `[ \u0000 .. \u001F ]`. Several values in that
560 /// range are also (conveniently) represented by two byte sequences:
561 /// @code
562 /// \" quotation mark
563 /// \\ reverse solidus
564 /// \/ solidus
565 /// \b backspace
566 /// \f form feed
567 /// \n line feed
568 /// \r carriage return
569 /// \t tab
570 /// @endcode
571 /// The `DEL` control character (`0x7F`) is accepted even in strict
572 /// mode.
573 ///
574 /// The behavior is undefined unless `e_RELAXED == conformanceMode()`.
575 /// Note that the representation of these byte sequences as C/C++ string
576 /// literals requires that the escape character itself must be escaped:
577 /// @code
578 /// "Hello,\\tworld\\n"; // Can alwas initialize a JSON string with
579 /// // containing tab and a newline
580 /// // escape sequences
581 /// // whether the option is set or not.
582 ///
583 /// "Hello,\tworld\n"; // When this option is 'true'.
584 /// // can also initialize a JSON string
585 /// // with an actual and newline characters.
586 /// @endcode
587 /// Also note that the two resulting strings do *not* compare equal.
589
590 /// Set the `conformanceMode` of this tokenizer to the specified `mode`
591 /// and return a non-`const` reference to this tokenizer. If `mode` is
592 /// `e_STRICT_20240119` the option values of this tokenizer are set to
593 /// be fully compliant with RFC8259 (see
594 /// https://www.rfc-editor.org/rfc/rfc8259)
595 ///
596 /// Specifically, those option values are:
597 /// @code
598 /// allowConsecutiveSeparartor == false;
599 /// allowFormFeedAsWhitespace() == false;
600 /// allowHeterogeneousArrays() == true;
601 /// allowNonUtf8StringLiterals() == false;
602 /// allowStandAloneValues() == true;
603 /// allowTrailingTopLevelComma() == false;
604 /// allowUnescapedControlCharacters() = false;
605 /// @endcode
606 /// Otherwise (i.e., `mode` is `e_RELAXED`), those option values can be
607 /// set in any combination. Note that the behavior is undefined if
608 /// individual options are set when `conformanceMode` is *not*
609 /// `e_RELAXED`.
611
612 // ACCESSORS
613
614 /// Return the value of the `allowConsecutiveSeparators` option of this
615 /// tokenizer.
616 bool allowConsecutiveSeparators() const;
617
618 /// Return the value of the `allowFormFeedAsWhitespace` option of this
619 /// tokenizer.
620 bool allowFormFeedAsWhitespace() const;
621
622 /// Return the value of the `allowHeterogenousArrays` option of this
623 /// tokenizer.
624 bool allowHeterogenousArrays() const;
625
626 /// Return the value of the `allowNonUtf8StringLiterals` option of this
627 /// tokenizer.
628 bool allowNonUtf8StringLiterals() const;
629
630 /// Return the value of the `allowStandAloneValues` option of this
631 /// tokenizer.
632 bool allowStandAloneValues() const;
633
634 /// Return the value of the `allowTrailingTopLevelComma` option of this
635 /// tokenizer.
636 bool allowTrailingTopLevelComma() const;
637
638 /// Return the value of the `allowUnescapedControlCharacters` option of
639 /// this tokenizer.
641
642 /// Return the `conformanceMode` of this tokenizer.
644
645 /// Return the offset of the current octet being tokenized in the stream
646 /// supplied to `reset`, or if an error occurred, the position where the
647 /// failed attempt to tokenize a token occurred. Note that this
648 /// operation is intended to provide additional information in the case
649 /// of an error.
651
652 /// Return the last read position relative to when `reset` was called.
653 /// Note that `readOffset() >= currentPosition()` -- the `readOffset` is
654 /// the offset of the last octet read from the stream supplied to
655 /// `reset`, and is at or beyond the current position being tokenized.
657
658 /// Return the status of the last call to `reloadStringBuffer()`:
659 /// * 0 if `reloadStringBuffer()` has not been called or if a token was
660 /// successfully read.
661 /// * `k_EOF` (which is positive) if no data could be read before
662 /// reaching EOF.
663 /// * a negative value if the `allowNonUtf8StringLiterals` option is
664 /// `false` and a UTF-8 error occurred. The specific value returned
665 /// will be one of the enumerators of the
666 /// `bdlde::Utf8Util::ErrorStatus` `enum` type indicating the nature
667 /// of the UTF-8 error.
668 int readStatus() const;
669
670 /// Return the token type of the current token.
671 TokenType tokenType() const;
672
673 /// Load into the specified `data` the value of the specified token if
674 /// the current token's type is `e_ELEMENT_NAME` or `e_ELEMENT_VALUE` or
675 /// leave `data` unmodified otherwise. Return 0 on success and a
676 /// non-zero value otherwise. Note that the returned `data` is only
677 /// valid until the next manipulator call on this object.
678 int value(bsl::string_view *data) const;
679};
680
681// ============================================================================
682// INLINE DEFINITIONS
683// ============================================================================
684
685// PRIVATE MANIPULATORS
686inline
687Tokenizer::ContextType Tokenizer::popContext()
688{
689 ContextType ret = e_NO_CONTEXT;
690
691 if (!d_contextStack.empty()) {
692 ret = static_cast<ContextType>(d_contextStack.back());
693 d_contextStack.pop_back();
694 }
695
696 return ret;
697}
698
699inline
700void Tokenizer::pushContext(ContextType context)
701{
702 d_contextStack.push_back(static_cast<char>(context));
703}
704
705// PRIVATE ACCESSOR
706inline
707Tokenizer::ContextType Tokenizer::context() const
708{
709 return d_contextStack.empty()
710 ? e_NO_CONTEXT
711 : static_cast<ContextType>(d_contextStack.back());
712}
713
714// CREATORS
inline
Tokenizer::Tokenizer(bslma::Allocator *basicAllocator)
: d_allocator(d_buffer.buffer(), k_BUFSIZE, basicAllocator)
, d_stackAllocator(d_stackBuffer.buffer(),
                   k_CONTEXTSTACKBUFSIZE,
                   basicAllocator)
, d_stringBuffer(&d_allocator)
, d_streambuf_p(0)
, d_cursor(0)
, d_valueBegin(0)
, d_valueEnd(0)
, d_valueIter(0)
, d_readOffset(0)
, d_tokenType(e_BEGIN)
, d_contextStack(200, &d_stackAllocator)
, d_readStatus(0)
, d_bufEndStatus(0)
, d_allowConsecutiveSeparators(true)
, d_allowFormFeedAsWhitespace(true)
, d_allowHeterogenousArrays(true)
, d_allowNonUtf8StringLiterals(true)
, d_allowStandAloneValues(true)
, d_allowTrailingTopLevelComma(true)
, d_allowUnescapedControlCharacters(true)
, d_conformanceMode(e_RELAXED)
{
    // Reserve the string buffer's full working size up front so ordinary
    // token reads do not reallocate.

    d_stringBuffer.reserve(k_MAX_STRING_SIZE);

    // 'd_contextStack' was constructed above with 200 elements to establish
    // capacity from 'd_stackAllocator'; discard those placeholder elements
    // ('clear' retains capacity) and seed the stack with the empty context.

    d_contextStack.clear();
    pushContext(e_NO_CONTEXT);
}
745
746inline
750
751// MANIPULATORS
752inline
753void Tokenizer::reset(bsl::streambuf *streambuf)
754{
755 d_streambuf_p = streambuf;
756 d_stringBuffer.clear();
757 d_cursor = 0;
758 d_valueBegin = 0;
759 d_valueEnd = 0;
760 d_valueIter = 0;
761 d_readOffset = 0;
762 d_tokenType = e_BEGIN;
763 d_readStatus = 0;
764 d_bufEndStatus = 0;
765
766 d_contextStack.clear();
767 pushContext(e_NO_CONTEXT);
768}
769
770inline
772{
773 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
774
775 d_allowConsecutiveSeparators = value;
776 return *this;
777}
778
779inline
781{
782 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
783
784 d_allowHeterogenousArrays = value;
785 return *this;
786}
787
788inline
790{
791 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
792
793 d_allowFormFeedAsWhitespace = value;
794 return *this;
795}
796
797inline
799{
800 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
801
802 d_allowNonUtf8StringLiterals = value;
803 return *this;
804}
805
806inline
808{
809 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
810
811 d_allowStandAloneValues = value;
812 return *this;
813}
814
815inline
817{
818 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
819
820 d_allowTrailingTopLevelComma = value;
821 return *this;
822}
823
824inline
826{
827 BSLS_ASSERT(e_RELAXED == d_conformanceMode);
828
829 d_allowUnescapedControlCharacters = value;
830 return *this;
831}
832
// NOTE(review): the signature line of this definition -- per the component
// index, 'Tokenizer& Tokenizer::setConformanceMode(ConformanceMode mode)' --
// appears to have been lost in this rendering of the file; confirm against
// the canonical source before relying on this fragment.
inline
{
    d_conformanceMode = mode;

    switch (mode) {
      case e_RELAXED: {
        // Relaxed mode imposes no option values; the current settings of the
        // individual token options remain in effect.
      } break;
      case e_STRICT_20240119: {
        // Force the option configuration documented in the "Strict
        // Conformance" section (RFC 8259 compliance).

        d_allowConsecutiveSeparators = false;
        d_allowFormFeedAsWhitespace = false;
        d_allowHeterogenousArrays = true;
        d_allowNonUtf8StringLiterals = false;
        d_allowStandAloneValues = true;
        d_allowTrailingTopLevelComma = false;
        d_allowUnescapedControlCharacters = false;
      } break;
      default: {
        BSLS_ASSERT_OPT(0 == "reached");
      }
    }
    return *this;
}
856
857// ACCESSORS
858inline
860{
861 return d_allowConsecutiveSeparators;
862}
863
864inline
866{
867 return d_allowFormFeedAsWhitespace;
868}
869
870inline
872{
873 return d_allowHeterogenousArrays;
874}
875
876inline
878{
879 return d_allowNonUtf8StringLiterals;
880}
881
882inline
884{
885 return d_allowStandAloneValues;
886}
887
888inline
890{
891 return d_allowTrailingTopLevelComma;
892}
893
894inline
896{
897 return d_allowUnescapedControlCharacters;
898}
899
900inline
902{
903 return d_conformanceMode;
904}
905
906inline
908{
909 return d_readOffset - d_stringBuffer.size() + d_cursor;
910}
911
912inline
914{
915 return d_readOffset;
916}
917
918inline
920{
921 return d_readStatus;
922}
923
924inline
926{
927 return d_tokenType;
928}
929
930} // close package namespace
931
932
933#endif // INCLUDED_BDLJSN_TOKENIZER
934
935// ----------------------------------------------------------------------------
936// Copyright 2022 Bloomberg Finance L.P.
937//
938// Licensed under the Apache License, Version 2.0 (the "License");
939// you may not use this file except in compliance with the License.
940// You may obtain a copy of the License at
941//
942// http://www.apache.org/licenses/LICENSE-2.0
943//
944// Unless required by applicable law or agreed to in writing, software
945// distributed under the License is distributed on an "AS IS" BASIS,
946// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
947// See the License for the specific language governing permissions and
948// limitations under the License.
949// ----------------------------- END-OF-FILE ----------------------------------
950
951/** @} */
952/** @} */
953/** @} */
Definition bdljsn_tokenizer.h:234
bool allowConsecutiveSeparators() const
Definition bdljsn_tokenizer.h:859
int resetStreamBufGetPointer()
bool allowFormFeedAsWhitespace() const
Definition bdljsn_tokenizer.h:865
bsls::Types::Uint64 Uint64
Definition bdljsn_tokenizer.h:239
@ k_EOF
Definition bdljsn_tokenizer.h:264
~Tokenizer()
Destroy this object.
Definition bdljsn_tokenizer.h:747
TokenType tokenType() const
Return the token type of the current token.
Definition bdljsn_tokenizer.h:925
Tokenizer & setAllowConsecutiveSeparators(bool value)
Definition bdljsn_tokenizer.h:771
bool allowNonUtf8StringLiterals() const
Definition bdljsn_tokenizer.h:877
bsls::Types::Uint64 readOffset() const
Definition bdljsn_tokenizer.h:913
Tokenizer & setAllowHeterogenousArrays(bool value)
Definition bdljsn_tokenizer.h:780
Tokenizer & setAllowStandAloneValues(bool value)
Definition bdljsn_tokenizer.h:807
bsls::Types::IntPtr IntPtr
Definition bdljsn_tokenizer.h:238
Tokenizer & setAllowUnescapedControlCharacters(bool value)
Definition bdljsn_tokenizer.h:825
bool allowStandAloneValues() const
Definition bdljsn_tokenizer.h:883
Tokenizer & setAllowTrailingTopLevelComma(bool value)
Definition bdljsn_tokenizer.h:816
int readStatus() const
Definition bdljsn_tokenizer.h:919
ConformanceMode
Definition bdljsn_tokenizer.h:266
@ e_RELAXED
Definition bdljsn_tokenizer.h:267
@ e_STRICT_20240119
Definition bdljsn_tokenizer.h:268
Tokenizer & setAllowFormFeedAsWhitespace(bool value)
Definition bdljsn_tokenizer.h:789
ConformanceMode conformanceMode() const
Return the conformanceMode of this tokenizer.
Definition bdljsn_tokenizer.h:901
void reset(bsl::streambuf *streambuf)
Definition bdljsn_tokenizer.h:753
Tokenizer & setConformanceMode(ConformanceMode mode)
Definition bdljsn_tokenizer.h:834
bool allowUnescapedControlCharacters() const
Definition bdljsn_tokenizer.h:895
bool allowHeterogenousArrays() const
Definition bdljsn_tokenizer.h:871
Tokenizer & setAllowNonUtf8StringLiterals(bool value)
Definition bdljsn_tokenizer.h:798
bsls::Types::Uint64 currentPosition() const
Definition bdljsn_tokenizer.h:907
TokenType
Definition bdljsn_tokenizer.h:241
@ e_ELEMENT_NAME
Definition bdljsn_tokenizer.h:245
@ e_END_OBJECT
Definition bdljsn_tokenizer.h:247
@ BAEJSN_END_ARRAY
Definition bdljsn_tokenizer.h:258
@ e_BEGIN
Definition bdljsn_tokenizer.h:244
@ BAEJSN_ELEMENT_NAME
Definition bdljsn_tokenizer.h:254
@ e_END_ARRAY
Definition bdljsn_tokenizer.h:249
@ e_ELEMENT_VALUE
Definition bdljsn_tokenizer.h:250
@ BAEJSN_START_ARRAY
Definition bdljsn_tokenizer.h:257
@ BAEJSN_END_OBJECT
Definition bdljsn_tokenizer.h:256
@ BAEJSN_START_OBJECT
Definition bdljsn_tokenizer.h:255
@ e_ERROR
Definition bdljsn_tokenizer.h:251
@ e_START_ARRAY
Definition bdljsn_tokenizer.h:248
@ BAEJSN_ERROR
Definition bdljsn_tokenizer.h:260
@ e_START_OBJECT
Definition bdljsn_tokenizer.h:246
@ BAEJSN_ELEMENT_VALUE
Definition bdljsn_tokenizer.h:259
int value(bsl::string_view *data) const
bool allowTrailingTopLevelComma() const
Definition bdljsn_tokenizer.h:889
Definition bdlma_bufferedsequentialallocator.h:265
Definition bslstl_stringview.h:441
Definition bslstl_string.h:1281
void reserve(size_type newCapacity=0)
Definition bslstl_string.h:5407
size_type size() const BSLS_KEYWORD_NOEXCEPT
Definition bslstl_string.h:6592
void clear() BSLS_KEYWORD_NOEXCEPT
Definition bslstl_string.h:5430
reference back()
Definition bslstl_vector.h:2577
bool empty() const BSLS_KEYWORD_NOEXCEPT
Return true if this vector has size 0, and false otherwise.
Definition bslstl_vector.h:2679
Definition bslstl_vector.h:1025
void push_back(const VALUE_TYPE &value)
Definition bslstl_vector.h:3760
void swap(vector &other) BSLS_KEYWORD_NOEXCEPT_SPECIFICATION(AllocatorTraits void clear() BSLS_KEYWORD_NOEXCEPT
Definition bslstl_vector.h:1712
void pop_back()
Definition bslstl_vector.h:3792
Definition bslma_allocator.h:457
Definition bsls_alignedbuffer.h:261
#define BSLS_ASSERT(X)
Definition bsls_assert.h:1804
#define BSLS_ASSERT_OPT(X)
Definition bsls_assert.h:1856
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdljsn_error.h:143
unsigned long long Uint64
Definition bsls_types.h:137
std::ptrdiff_t IntPtr
Definition bsls_types.h:130