BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlb_tokenizer.h
Go to the documentation of this file.
1/// @file bdlb_tokenizer.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlb_tokenizer.h -*-C++-*-
8#ifndef INCLUDED_BDLB_TOKENIZER
9#define INCLUDED_BDLB_TOKENIZER
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdlb_tokenizer bdlb_tokenizer
15/// @brief Provide access to user-described tokens via string references.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdlb
19/// @{
20/// @addtogroup bdlb_tokenizer
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdlb_tokenizer-purpose"> Purpose</a>
25/// * <a href="#bdlb_tokenizer-classes"> Classes </a>
26/// * <a href="#bdlb_tokenizer-description"> Description </a>
27/// * <a href="#bdlb_tokenizer-soft-versus-hard-delimiters"> Soft versus Hard Delimiters </a>
28/// * <a href="#bdlb_tokenizer-the-input-string-to-be-tokenized"> The Input String to be Tokenized </a>
29/// * <a href="#bdlb_tokenizer-iterating-using-a-tokenizeriterator-object"> Iterating using a TokenizerIterator object (ACCESS TO TOKENS ONLY) </a>
30/// * <a href="#bdlb_tokenizer-iterating-using-a-tokenizer-object"> Iterating using a Tokenizer object (ACCESS TO TOKENS AND DELIMITERS) </a>
31/// * <a href="#bdlb_tokenizer-token-and-delimiter-lifetimes"> Token and Delimiter Lifetimes </a>
32/// * <a href="#bdlb_tokenizer-comprehensive-detailed-parsing-specification"> Comprehensive Detailed Parsing Specification </a>
33/// * <a href="#bdlb_tokenizer-usage"> Usage </a>
34/// * <a href="#bdlb_tokenizer-example-1-iterating-over-tokens-using-just-soft-delimiters"> Example 1: Iterating Over Tokens Using Just Soft Delimiters </a>
35/// * <a href="#bdlb_tokenizer-example-2-iterating-over-tokens-using-just-hard-delimiters"> Example 2: Iterating Over Tokens Using Just Hard Delimiters </a>
36/// * <a href="#bdlb_tokenizer-example-3-iterating-over-tokens-using-both-hard-and-soft-delimiters"> Example 3: Iterating Over Tokens Using Both Hard and Soft Delimiters </a>
37///
38/// # Purpose {#bdlb_tokenizer-purpose}
39/// Provide access to user-described tokens via string references.
40///
41/// # Classes {#bdlb_tokenizer-classes}
42///
43/// - bdlb::Tokenizer: lexer for tokens defined via hard and/or soft delimiters
44/// - bdlb::TokenizerIterator: input iterator for delimited tokens in a string
45///
46/// @see bslstl_stringref
47///
48/// # Description {#bdlb_tokenizer-description}
49/// This component defines a mechanism, `bdlb::Tokenizer`, that
50/// provides non-destructive sequential (read-only) access to tokens in a given
51/// input string as characterized by two disjoint sets of user-specified
52/// delimiter characters, each of which is supplied at construction via either a
53/// `const bsl::string_view&` or (for efficiency, when only the leading
54/// characters of the input string may need to be parsed) a `const char *`.
55/// Note that each character (including '\0') that is not explicitly designated
56/// as a delimiter character is assumed to be a *token* character.
57///
58/// ## Soft versus Hard Delimiters {#bdlb_tokenizer-soft-versus-hard-delimiters}
59///
60///
61/// The tokenizer recognizes two distinct kinds of delimiter characters, *soft*
62/// and *hard*.
63///
64/// A *soft* *delimiter* is a maximal (non-empty) sequence of soft-delimiter
65/// characters. Soft delimiters, typically whitespace characters, are used to
66/// separate (rather than terminate) tokens, and thus never result in an empty
67/// token.
68///
69/// A *hard* *delimiter* is a maximal (non-empty) sequence of delimiter
70/// characters consisting of exactly one hard-delimiter character. Hard
71/// delimiters, typically printable punctuation characters such as slash (`/`) or
72/// colon (`:`), are used to terminate (rather than just separate) tokens, and thus a
73/// hard delimiter that is not preceded by a token character results in an empty
74/// token.
75///
76/// Soft delimiters are used in applications where multiple consecutive
77/// delimiter characters are to be treated as just a single delimiter. For
78/// example, if we want the input string ` "Sticks and stones" ` to parse into a
79/// sequence of three non-empty tokens ["Sticks", "and", "stones"], rather than
80/// the four-token sequence ["Sticks", "", "and", "stones"], we would make the
81/// space (` `) a soft-delimiter character.
82///
83/// Hard delimiters are used in applications where consecutive delimiter
84/// characters are to be treated as separate delimiters, giving rise to the
85/// possibility of empty tokens. Making the slash ( `/` ) in the standard date
86/// format a hard delimiter for the input string "15//9" yields the three-token
87/// sequence ["15", "", "9"], rather than the two-token one ["15", "9"] had it
88/// been made soft.
89///
90/// All members within each respective character set are considered equivalent
91/// with respect to tokenization. For example, making `/` and `:` *soft*
92/// *delimiter* characters on the questionably formatted date "2015/:10:/31"
93/// would yield the token sequence ["2015", "10", "31"], whereas making `/` and
94/// `:` *hard* *delimiter* characters would result in the token sequence
95/// ["2015", "", "10", "", "31"]. Making either of these two delimiter
96/// characters hard and the other soft would, in this example, yield the former
97/// (shorter) sequence of tokens. The details of how soft and hard delimiters
98/// interact are illustrated in more detail in the following section (but also
99/// see, later on, the section on "Comprehensive Detailed Parsing
100/// Specification").
101///
102/// ## The Input String to be Tokenized {#bdlb_tokenizer-the-input-string-to-be-tokenized}
103///
104///
105/// Each input string consists of an optional leading sequence of soft-delimiter
106/// characters called the *leader*, followed by an alternating sequence of
107/// tokens and delimiters (the final delimiter being optional):
108/// @code
109/// Input String:
110/// +--------+---------+-------------+---...---+---------+-------------+
111/// | leader | token_1 | delimiter_1 | | token_N | delimiter_N |
112/// +--------+---------+-------------+---...---+---------+-------------+
113/// (optional) (optional)
114/// @endcode
115/// The tokenization of a string can also be expressed as pseudo-Posix regular
116/// expression notation:
117/// @code
118/// delimiter = [[:soft:]]+ | [[:soft:]]* [[:hard:]] [[:soft:]]*
119/// token = [^[:soft:][:hard:]]*
120/// string = [[:soft:]]* (token delimiter)* token?
121/// @endcode
122/// Parsing is from left to right and is *greedy* -- i.e., the longest sequence
123/// satisfying the regular expression is the one that matches. For example, let
124/// `s` represent the start of a soft delimiter, `h` the start of a hard
125/// delimiter, `^` the start of a token, and `~` the continuation of that same
126/// delimiter or token. Using `.` as a soft delimiter and `/` as a hard one,
127/// the string
128/// @code
129/// s~ h~ h~~ h~ s~ hh s h~h h~~~ Delimiters
130///
131/// "..One/.if./.by./land,..two//if.by/./sea!./.."
132///
133/// ^~~ ^~ ^~ ^~~~ ^~~ ^^~ ^~ ^^~~ Tokens
134/// | |
135/// (empty) (empty)
136/// @endcode
137/// yields the tokenization
138/// @code
139/// [One] [if] [by] [land,] [two] [] [if] [by] [] [sea!] Tokens
140///
141/// (..) (/.) (./.) (./) (..) (/)(/) (.) (/.)(/) (./..) Delims
142/// @endcode
143/// Notice that in the pair of hard delimiters `/./` before the token "sea!", the
144/// soft-delimiter character between the two hard ones binds to the earlier
145/// delimiter.
146///
147/// ## Iterating using a TokenizerIterator object (ACCESS TO TOKENS ONLY) {#bdlb_tokenizer-iterating-using-a-tokenizeriterator-object}
148///
149///
150/// This component provides two separate mechanisms by which a user may iterate
151/// over a sequence of tokens. The first mechanism is as a *token* *range*,
152/// exposed by the `TokenizerIterator` objects returned by the `begin` and `end`
153/// methods on a `Tokenizer` object. A `TokenizerIterator` supports the concept
154/// of a standard *input* *iterator*, returning each successive token as a
155/// `bslstl::StringRef`, making it suitable for generic use -- e.g., in a
156/// range-based `for` loop:
157/// @code
158/// /// Print, to the specified `output` stream, each whitespace-delimited
159/// /// token in the specified `input` string on a separate line following
160/// /// a vertical bar ('|') and a hard space (' ').
161/// void parse_1(bsl::ostream& output, const char *input)
162/// {
163/// const char softDelimiters[] = " \t\n"; // whitespace
164///
165/// for (bslstl::StringRef token : bdlb::Tokenizer(input, softDelimiters)) {
166/// output << "| " << token << bsl::endl;
167/// }
168/// }
169/// @endcode
170/// The `parse_1` function above produces each (non-whitespace) token in the
171/// supplied input string on a separate line. So, were `parse_1` to be given a
172/// reference to `bsl::cout` and the input string
173/// @code
174/// " Times like \tthese\n try \n \t men's\t \tsouls.\n"
175/// @endcode
176/// we would expect
177/// @code
178/// | Times
179/// | like
180/// | these
181/// | try
182/// | men's
183/// | souls.
184/// @endcode
185/// to be displayed on `bsl::cout`. Note that there is no way to access the
186/// delimiters from a `TokenizerIterator` directly; for that we will need to
187/// use the `Tokenizer` as a non-standard "iterator" directly.
188///
189/// ## Iterating using a Tokenizer object (ACCESS TO TOKENS AND DELIMITERS) {#bdlb_tokenizer-iterating-using-a-tokenizer-object}
190///
191///
192/// The second mechanism, not intended for generic use, provides direct access
193/// to the previous and current (trailing) delimiters as well as the current
194/// token:
195/// @code
196/// /// Print, to the specified `output` stream, the leader of the specified
197/// /// `input`, on a single line, followed by subsequent current token and
198/// /// (trailing) delimiter pairs on successive lines, each line beginning
199/// /// with a vertical bar ('|') followed by a tab ('\t') character.
200/// void parse_2(bsl::ostream& output, const char *input)
201/// {
202/// const char softDelimiters[] = " ";
203/// const char hardDelimiters[] = ":/";
204///
205/// bdlb::Tokenizer it(input, softDelimiters, hardDelimiters);
206/// output << "| " << '"' << it.previousDelimiter() << '"' << "\n";
207///
208/// for (; it.isValid(); ++it) {
209/// output << "|\t"
210/// << '"' << it.token() << '"'
211/// << "\t"
212/// << '"' << it.trailingDelimiter() << '"'
213/// << "\n";
214/// }
215/// }
216/// @endcode
217/// The parse_2 function above produces the *leader* on the first line,
218/// followed by each *token* along with its current (trailing) delimiter on
219/// successive lines. So, were `parse_2` to be given a reference to
220/// `bsl::cout` and the input string
221/// @code
222/// " I've :been: a : :bad:/ boy! / "
223/// @endcode
224/// we would expect
225/// @code
226/// | " "
227/// | "I've" " :"
228/// | "been" ": "
229/// | "a" " : "
230/// | "" ":"
231/// | "bad" ":"
232/// | "" "/ "
233/// | "boy!" " / "
234/// @endcode
235/// to be displayed on `bsl::cout`.
236///
237/// ## Token and Delimiter Lifetimes {#bdlb_tokenizer-token-and-delimiter-lifetimes}
238///
239///
240/// All tokens and delimiters are returned efficiently by value as
241/// `bslstl::StringRef` objects, which naturally remain valid so long as the
242/// underlying input string remains unchanged -- irrespective of the validity
243/// of the `tokenizer` or any of its dispensed token iterators. Note, however,
244/// that all such token iterators are invalidated if the parent tokenizer object
245/// is destroyed or reset. Note also that the previous delimiter field remains
246/// accessible from a `tokenizer` object even after it has reached the end of
247/// its input. Also note that the *leader* is accessible, using the
248/// `previousDelimiter` method prior to advancing the iteration state of the
249/// `Tokenizer`.
250///
251/// ## Comprehensive Detailed Parsing Specification {#bdlb_tokenizer-comprehensive-detailed-parsing-specification}
252///
253///
254/// This section provides a comprehensive (length-ordered) enumeration of how
255/// the `bdlb::Tokenizer` performs, according to its three (non-null) character
256/// types:
257/// @code
258/// '.' = any *soft* delimiter character
259/// '#' = any *hard* delimiter character
260/// 'T' = any token character
261/// @endcode
262/// Here's how iteration progresses for various input strings. Note that input
263/// strings having consecutive characters of the same category that naturally
264/// coalesce (i.e., behave as if they were a single character of that category)
265/// -- namely soft-delimiter or token characters -- are labeled with `(%)`.
266/// For example, consider the input ".." at the top of the [length 2] section
267/// below. The table indicates, with a (%) in the first column, that the input
268/// acts the same as if it were a single (soft-delimiter) character (i.e., ".").
269/// There is only one line in this row of the table because, upon construction,
270/// the iterator is immediately invalid (as indicated by the right-most column).
271/// Now consider the "##" entry near the bottom of [length 2]. These
272/// (hard-delimiter) characters do not coalesce. What's more, the iterator on
273/// construction is valid and produces an empty leader and empty first token.
274/// After advancing the tokenizer, the second line of that row shows the
275/// current state of iteration with the previous delimiter being a `#` as well
276/// as the current one. The current token is again shown as empty. After
277/// advancing the tokenizer again, we now see that the iterator is invalid, yet
278/// the previous delimiter (still accessible) is a `#`.
279/// @code
280/// (%) = repeat Previous Current Current Iterator
281/// Input String Delimiter Token Delimiter Status
282/// ============ ========= ======= ========= ======== [length 0]
283/// "" "" na na invalid
284///
285/// ============ ========= ======= ========= ======== [length 1]
286/// "." "." na na invalid
287/// ------------ --------- ------- --------- --------
288/// "#" "" "" "#" valid
289/// "#" na na invalid
290/// ------------ --------- ------- --------- --------
291/// "T" "" "T" "" valid
292/// "" na na invalid
293///
294/// ============ ========= ======= ========= ======== [length 2]
295/// ".." (%) ".." na na invalid
296/// ------------ --------- ------- --------- --------
297/// ".#" "." "" "#" valid
298/// "#" na na invalid
299/// ------------ --------- ------- --------- --------
300/// ".T" "." "T" "" valid
301/// "" na na invalid
302///
303/// ------------ --------- ------- --------- --------
304/// "#." "" "" "#." valid
305/// "#." na na invalid
306/// ------------ --------- ------- --------- --------
307/// "##" "" "" "#" valid
308/// "#" "" "#" valid
309/// "#" na na invalid
310/// ------------ --------- ------- --------- --------
311/// "#T" "" "" "#" valid
312/// "#" "T" "" valid
313/// "" na na invalid
314///
315/// ------------ --------- ------- --------- --------
316/// "T." "" "T" "." valid
317/// "." na na invalid
318/// ------------ --------- ------- --------- --------
319/// "T#" "" "T" "#" valid
320/// "#" na na invalid
321/// ------------ --------- ------- --------- --------
322/// "TT" (%) "" "TT" "" valid
323/// "" na na invalid
324///
325/// ============ ========= ======= ========= ======== [length 3]
326/// "..." (%) "..." na na invalid
327/// ------------ --------- ------- --------- --------
328/// "..#" (%) ".." "" "#" valid
329/// "#" na na invalid
330/// ------------ --------- ------- --------- --------
331/// "..T" (%) ".." "T" "" valid
332/// "" na na invalid
333/// ------------ --------- ------- --------- --------
334/// ".#." "." "" "#." valid
335/// "#." na na invalid
336/// ------------ --------- ------- --------- --------
337/// ".##" "." "" "#" valid
338/// "#" "" "#" valid
339/// "#" na na invalid
340/// ------------ --------- ------- --------- --------
341/// ".#T" "." "" "#" valid
342/// "#" "T" "" valid
343/// "" na na invalid
344/// ------------ --------- ------- --------- --------
345/// ".T." "." "T" "." valid
346/// "." na na invalid
347/// ------------ --------- ------- --------- --------
348/// ".T#" "." "T" "#" valid
349/// "#" na na invalid
350/// ------------ --------- ------- --------- --------
351/// ".TT" (%) "." "TT" "" valid
352/// "" na na invalid
353///
354/// ------------ --------- ------- --------- --------
355/// "#.." (%) "" "" "#.." valid
356/// "#.." na na invalid
357/// ------------ --------- ------- --------- --------
358/// "#.#" "" "" "#." valid
359/// "#." "" "#" valid
360/// "#" na na invalid
361/// ------------ --------- ------- --------- --------
362/// "#.T" "" "" "#." valid
363/// "#." "T" "" valid
364/// "" na na invalid
365/// ------------ --------- ------- --------- --------
366/// "##." "" "" "#" valid
367/// "#" "" "#." valid
368/// "#." na na invalid
369/// ------------ --------- ------- --------- --------
370/// "###" "" "" "#" valid
371/// "#" "" "#" valid
372/// "#" "" "#" valid
373/// "#" na na invalid
374/// ------------ --------- ------- --------- --------
375/// "##T" "" "" "#" valid
376/// "#" "" "#" valid
377/// "#" "T" "" valid
378/// "" na na invalid
379/// ------------ --------- ------- --------- --------
380/// "#T." "" "" "#" valid
381/// "#" "T" "." valid
382/// "." na na invalid
383/// ------------ --------- ------- --------- --------
384/// "#T#" "" "" "#" valid
385/// "#" "T" "#" valid
386/// "#" na na invalid
387/// ------------ --------- ------- --------- --------
388/// "#TT" (%) "" "" "#" valid
389/// "#" "TT" "" valid
390/// "" na na invalid
391///
392/// ------------ --------- ------- --------- --------
393/// "T.." (%) "" "T" ".." valid
394/// ".." na na invalid
395/// ------------ --------- ------- --------- --------
396/// "T.#" "" "T" ".#" valid
397/// ".#" na na invalid
398/// ------------ --------- ------- --------- --------
399/// "T.T" "" "T" "." valid
400/// "." "T" "" valid
401/// "" na na invalid
402/// ------------ --------- ------- --------- --------
403/// "T#." "" "T" "#." valid
404/// "#." na na invalid
405/// ------------ --------- ------- --------- --------
406/// "T##" "" "T" "#" valid
407/// "#" "" "#" valid
409/// "#" na na invalid
410/// ------------ --------- ------- --------- --------
411/// "T#T" "" "T" "#" valid
412/// "#" "T" "" valid
413/// "" na na invalid
414/// ------------ --------- ------- --------- --------
415/// "TT." (%) "" "TT" "." valid
416/// "." na na invalid
417/// ------------ --------- ------- --------- --------
418/// "TT#" (%) "" "TT" "#" valid
419/// "#" na na invalid
420/// ------------ --------- ------- --------- --------
421/// "TTT" (%) "" "TTT" "" valid
422/// "" na na invalid
423/// ------------ --------- ------- --------- --------
424/// @endcode
425///
426/// ## Usage {#bdlb_tokenizer-usage}
427///
428///
429/// This section illustrates intended use of this component.
430///
431/// ### Example 1: Iterating Over Tokens Using Just Soft Delimiters {#bdlb_tokenizer-example-1-iterating-over-tokens-using-just-soft-delimiters}
432///
433///
434/// This example illustrates the process of splitting the input string into a
435/// sequence of tokens using just soft delimiters.
436///
437/// Suppose, we have a text where words are separated with a variable number of
438/// spaces and we want to remove all duplicated spaces.
439///
440/// First, we create an example character array:
441/// @code
442/// const char text1[] = " This is a test.";
443/// @endcode
444/// Then, we create a `Tokenizer` that uses " "(space) as a soft delimiter:
445/// @code
446/// bdlb::Tokenizer tokenizer1(text1, " ");
447/// @endcode
448/// Note, that the tokenizer skips the leading soft delimiters upon
449/// initialization. Next, we iterate the input character array and build the
450/// string without duplicated spaces:
451/// @code
452/// bsl::string result1;
453/// if (tokenizer1.isValid()) {
454/// result1 += tokenizer1.token();
455/// ++tokenizer1;
456/// }
457/// while (tokenizer1.isValid()) {
458/// result1 += " ";
459/// result1 += tokenizer1.token();
460/// ++tokenizer1;
461/// }
462/// @endcode
463/// Finally, we verify that the resulting string contains the expected result:
464/// @code
465/// const bsl::string EXPECTED1("This is a test.");
466/// assert(EXPECTED1 == result1);
467/// @endcode
468///
469/// ### Example 2: Iterating Over Tokens Using Just Hard Delimiters {#bdlb_tokenizer-example-2-iterating-over-tokens-using-just-hard-delimiters}
470///
471///
472/// This example illustrates the process of splitting the input string into a
473/// sequence of tokens using just hard delimiters.
474///
475/// Suppose, we want to reformat a comma-separated-value file and insert the
476/// default value of `0` into missing columns.
477///
478/// First, we create an example CSV line:
479/// @code
480/// const char text2[] = "Col1,Col2,Col3\n111,,133\n,222,\n311,322,\n";
481/// @endcode
482/// Then, we create a `Tokenizer` that uses ","(comma) and "\n"(new-line) as
483/// hard delimiters:
484/// @code
485/// bdlb::Tokenizer tokenizer2(text2, "", ",\n");
486/// @endcode
487/// We use the `trailingDelimiter` accessor to insert the correct delimiter into the
488/// output string. Next, we iterate the input line and insert the default
489/// value:
490/// @code
491/// string result2;
492/// while (tokenizer2.isValid()) {
493/// if (tokenizer2.token() != "") {
494/// result2 += tokenizer2.token();
495/// } else {
496/// result2 += "0";
497/// }
498/// result2 += tokenizer2.trailingDelimiter();
499/// ++tokenizer2;
500/// }
501/// @endcode
502/// Finally, we verify that the resulting string contains the expected result:
503/// @code
504/// const string EXPECTED2("Col1,Col2,Col3\n111,0,133\n0,222,0\n311,322,0\n");
505/// assert(EXPECTED2 == result2);
506/// @endcode
507///
508/// ### Example 3: Iterating Over Tokens Using Both Hard and Soft Delimiters {#bdlb_tokenizer-example-3-iterating-over-tokens-using-both-hard-and-soft-delimiters}
509///
510///
511/// This example illustrates the process of splitting the input string into a
512/// sequence of tokens using both soft and hard delimiters.
513///
514/// Suppose, we want to extract the tokens from a file, where the fields are
515/// separated with a "$"(dollar-sign), but can have leading or trailing spaces.
516///
517/// First, we create an example line:
518/// @code
519/// const char text3[] = " This $is $ a$ test. ";
520/// @endcode
521/// Then, we create a `Tokenizer` that uses "$"(dollar-sign) as a hard delimiter
522/// and " "(space) as a soft delimiter:
523/// @code
524/// bdlb::Tokenizer tokenizer3(text3, " ", "$");
525/// @endcode
526/// In this example we are only extracting the tokens, and can use the iterator
527/// provided by the tokenizer.
528///
529/// Next, we create an iterator and iterate over the input, extracting the
530/// tokens into the result string:
531/// @code
532/// string result3;
533///
534/// bdlb::Tokenizer::iterator it3 = tokenizer3.begin();
535///
536/// if (it3 != tokenizer3.end()) {
537/// result3 += *it3;
538/// ++it3;
539/// }
540///
541/// while (it3 != tokenizer3.end()) {
542/// result3 += " ";
543/// result3 += *it3;
544/// ++it3;
545/// }
546/// @endcode
547/// Finally, we verify that the resulting string contains the expected result:
548/// @code
549/// const string EXPECTED3("This is a test.");
550/// assert(EXPECTED3 == result3);
551/// @endcode
552/// @}
553/** @} */
554/** @} */
555
556/** @addtogroup bdl
557 * @{
558 */
559/** @addtogroup bdlb
560 * @{
561 */
562/** @addtogroup bdlb_tokenizer
563 * @{
564 */
565
566#include <bdlscm_version.h>
567
568#include <bsl_string.h>
569
570#include <bsls_assert.h>
572#include <bsls_keyword.h>
573#include <bsls_libraryfeatures.h>
574#include <bsls_platform.h>
575#include <bsls_review.h>
576
577#include <bsl_iterator.h>
578
579
580
581namespace bdlb {
582 // ============================
583 // private class Tokenizer_Data
584 // ============================
585
586/// This component-private class is used to hold delimiter information.
587/// Each `Tokenizer` object will have, as a private data member, an object
588/// of this class, and will pass the address of that member to the
589/// (private) constructor of each `TokenizerIterator` object it issues:
590/// @code
591/// +--------------------------------------+
592/// | ,--------------. |
593/// | ( Tokenizer_Data ) |
594/// | `--------------'\ |
595/// | | \ |
596/// | | ,----*------------. |
597/// | | ( TokenizerIterator ) |
598/// | | /`-----------------' |
599/// | | / |
600/// | ,----*--o-. |
601/// | ( Tokenizer ) |
602/// | `---------' |
603/// +--------------------------------------+
604/// bdlb_tokenizer
605/// @endcode
606///
607/// See @ref bdlb_tokenizer
609
610 enum {
611 k_MAX_CHARS = 256 // maximum # of unique values for an 8-bit `char`
612 };
613
614 char d_charTypes[k_MAX_CHARS]; // table of SOFT / HARD / TOKEN characters
615
616 private:
617 // NOT IMPLEMENTED
620
621 public:
622 // CREATORS
623
624 explicit Tokenizer_Data(const bsl::string_view& softDelimiters);
625 /// Create a `Tokenizer_Data` object and load the `d_charTypes` data
626 /// member such that it has the same value *as* *if* this (overly
627 /// prescriptive) algorithm were used: (I) initialize each entry in
628 /// `d_charTypes` array to a value indicating that the character having
629 /// that `index` as its (e.g., ASCII) representation is a *token*
630 /// character; (II) then, for each character in the specified
631 /// `softDelimiters` sequence, overwrite the element at the
632 /// corresponding index in `d_charTypes` with a value that indicates
633 /// that the character is a *soft* delimiter character; (III) finally,
634 /// for each character in the specified `hardDelimiters` sequence,
635 /// overwrite the element at the corresponding index with a distinct
636 /// value that indicates the character is a *hard* delimiter character.
637 /// Note that duplicate delimiter characters in the respective inputs
638 /// are naturally ignored, and that a character that appears in both
639 /// sets would naturally be considered *hard*. Also note that it is
640 /// entirely reasonable to state, in any public interface, that the
641 /// behavior is undefined unless the characters in the union of the two
642 /// delimiter sequences are unique.
643 Tokenizer_Data(const bsl::string_view& softDelimiters,
644 const bsl::string_view& hardDelimiters);
645
646 // ACCESSORS
647
648 /// Return the input type of the specified `character`: 0 for token,
649 /// 1 for soft delimiter, 2 for hard delimiter.
650 int inputType(char character) const;
651};
652
653 // =====================
654 // class Tokenizer_Proxy
655 // =====================
656
657/// This class provides a proxy holder of a reference to a
658/// `TokenizerIterator` object, allowing correct return of `operator->`.
659///
660/// See @ref bdlb_tokenizer
662
663 // DATA
664 bslstl::StringRef d_obj; // The object
665
666 private:
667 // NOT IMPLEMENTED
669
670 public:
671 // CREATORS
672
673 /// Create a `Tokenizer_Proxy` object with a copy of the specified `obj`.
675
676#ifdef BSLS_COMPILERFEATURES_SUPPORT_DEFAULTED_FUNCTIONS
677 /// Create a `Tokenizer_Proxy` object having the same value as the
678 /// specified `original` object. Note that this copy constructor is
679 /// generated by the compiler.
680 Tokenizer_Proxy(const Tokenizer_Proxy& original) = default;
681
682 /// Destroy this object.
683 ~Tokenizer_Proxy() = default;
684#endif
685
686 // OPERATORS
687
688 /// Return a pointer to the object contained by the `Tokenizer_Proxy`.
689 const bslstl::StringRef *operator->() const;
690};
691
692 // =======================
693 // class TokenizerIterator
694 // =======================
695
696/// This class provides a C++-standards-conforming input iterator over the
697/// tokens in the input string supplied at construction (along with the
698/// designation of *soft* and *hard* delimiter characters) to a `Tokenizer`
699/// object. Tokens are returned, using a `bslstl::StringRef` -- by value --
700/// that means the iterated references remain valid until the underlying
701/// input string itself is modified or destroyed. Note that all iterators
702/// are invalidated whenever the input string in the parent `Tokenizer`
703/// changes.
705#if defined(BSLS_LIBRARYFEATURES_STDCPP_LIBCSTD)
706/// Sun CC workaround: iterators must be derived from `std::iterator` to work
707/// with the native std library algorithms. However, `std::iterator` is
708/// deprecated in C++17, so do not rely on derivation unless required, to avoid
709/// deprecation warnings on modern compilers.
710 : public bsl::iterator<bsl::input_iterator_tag,
711 bslstl::StringRef,
712 int,
713 Tokenizer_Proxy,
714 const bslstl::StringRef>
715#endif // BSLS_LIBRARYFEATURES_STDCPP_LIBCSTD
716{
717
718
719 // DATA
720 const Tokenizer_Data *d_sharedData_p; // (address of) character categories
721 const char *d_cursor_p; // tail of parsed input
722 const char *d_token_p; // current token
723 const char *d_postDelim_p; // current (trailing) delimiter
724 const char *d_end_p; // one past input; 0 for `(char *)`
725 bool d_endFlag; // set `true` when at end of input
726
727 // FRIENDS
728 friend class Tokenizer;
729 friend bool operator==(const TokenizerIterator&, const TokenizerIterator&);
730 friend bool operator!=(const TokenizerIterator&, const TokenizerIterator&);
731
732 // PRIVATE CREATORS
733
734 /// Create a `TokenizerIterator` object bound to the specified sequence
735 /// of `input` characters ending at the specified `end` and the
736 /// specified delimiter and token mapper `sharedData`.
737 TokenizerIterator(const char *input,
738 const char *end,
739 const Tokenizer_Data *sharedData);
740
741 public:
742 // TYPES
744 typedef int difference_type;
747
748 /// Defines a type alias for the tag type that represents the iterator
749 /// concept this class models.
750 typedef bsl::input_iterator_tag iterator_category;
751
752 // CREATORS
753
755 /// Create a `TokenizerIterator` object having the value of the
756 /// specified `origin` iterator.
758
759 // MANIPULATORS
760
761 /// Assign to this object the value of the specified `rhs` iterator, and
762 /// return a reference providing modifiable access to this object.
764
765 /// Advance the iteration state of this object to refer to the next
766 /// token in the underlying input sequence, and return a reference
767 /// providing modifiable access to this object. The behavior is
768 /// undefined unless the iteration state of this object is initially
769 /// valid, or if the underlying input has been modified or destroyed
770 /// since this object was created.
772
773 // ACCESSORS
774
775 /// Return a reference to the non-modifiable current token (i.e.,
776 /// maximal sequence of non-delimiter characters) in the input string.
777 /// The returned reference remains valid so long as the underlying input
778 /// is not modified or destroyed -- irrespective of the state (or
779 /// existence) of this object. The behavior is undefined unless the
780 /// iteration state of this object is initially valid, or if the
781 /// underlying input has been modified or destroyed since this object
782 /// was created.
783 const bslstl::StringRef operator*() const;
784
785 /// Return a proxy object containing the non-modifiable current token
786 /// (i.e., maximal sequence of non-delimiter characters) in the input
787 /// string. The returned proxy remains valid so long as the underlying
788 /// input is not modified or destroyed -- irrespective of the state (or
789 /// existence) of this object. The behavior is undefined unless the
790 /// iteration state of this object is initially valid, or if the
791 /// underlying input has been modified or destroyed since this object
792 /// was created.
794};
795
796// FREE OPERATORS
797
798/// Return `true` if the specified `lhs` and `rhs` objects have the same
799/// value, and `false` otherwise. Two `TokenizerIterator` objects have the
800/// same value if both of them are pointing to the same token within the
801/// same tokenized string or if they both point past the tokenized string.
802/// The behaviour is undefined unless the iterators were returned by the
803/// same `Tokenizer` object, or if the underlying input has been modified or
804/// destroyed since any of those objects were created.
805bool operator==(const TokenizerIterator& lhs, const TokenizerIterator& rhs);
806
807/// Return `true` if the specified `lhs` and `rhs` objects do not have the
808/// same value, and `false` otherwise. The behaviour is undefined unless
809/// the iterators were returned by the same `Tokenizer` object, or if the
810/// underlying input has been modified or destroyed since any of those
811/// objects were created.
812bool operator!=(const TokenizerIterator& lhs, const TokenizerIterator& rhs);
813
814/// Advance the iteration state of the specified `object` to refer to the
815/// next token in the underlying input sequence, and return a copy of this
816/// object prior to advancing the iteration state. The behavior is undefined
817/// unless the iteration state of this object is initially valid, or if the
818/// underlying input has been modified or destroyed since this object was
819/// created.
821
822
823 // ===============
824 // class Tokenizer
825 // ===============
826
827/// This class provides (read-only) sequential access to tokens delimited by
828/// two user-supplied character sets consisting, respectively, of *soft* and
829/// *hard* delimiter characters. Access to the previous and current
830/// (trailing) delimiter, as well as to the current token itself, is
831/// provided efficiently via `bslstl::StringRef`.
832///
833/// See @ref bdlb_tokenizer
835
836 // DATA
837 Tokenizer_Data d_sharedData; // delimiter/token character categories
838 const char *d_input_p; // original input
839 const char *d_cursor_p; // tail of parsed input
840 const char *d_prevDelim_p; // previous delimiter
841 const char *d_token_p; // current token
842 const char *d_postDelim_p; // current (trailing) delimiter
843 const char *d_end_p; // one past end of input; 0 for `(char *)`
844 bool d_endFlag; // set `true` when cursor at end of input
845
846 private:
847 // PRIVATE MANIPULATORS
848
849 /// Rebind this object to refer to the specified sequence of `input`
850 /// characters ending at the specified `endOfInput` pointer. The state
851 /// of the tokenizer following this call is *as* *if* it had been
852 /// constructed with `input` and its current sets of *soft* and *hard*
853 /// delimiter characters. Note that the behavior is undefined if this
854 /// object is used in any way (other than to reset or destroy it) after
855 /// its underlying `input` string is modified.
856 void resetImpl(const char *input, const char *endOfInput);
857
858 private:
859 // NOT IMPLEMENTED
861 Tokenizer& operator=(const Tokenizer&) BSLS_KEYWORD_DELETED;
863
864 public:
865 // TYPES
867
868 // CREATORS
869
870 Tokenizer(const char *input, const bsl::string_view& soft);
871 Tokenizer(const bsl::string_view& input, const bsl::string_view& soft);
872 Tokenizer(const char *input,
873 const bsl::string_view& soft,
874 const bsl::string_view& hard);
875 /// Create a `Tokenizer` object bound to the specified sequence of
876 /// `input` characters having the specified set of (unique) `soft`
877 /// delimiter characters to be used to separate *tokens* (i.e., maximal
878 /// sequence of non-delimiter characters) in `input`. Optionally
879 /// specify a disjoint set of (unique) `hard` delimiter characters to be
880 /// used to explicitly terminate tokens. Delimiters within `input`
881 /// consist of a maximal sequence of one or more delimiter characters,
882 /// at most one of which may be *hard*; when there is a contiguous
883 /// sequence of delimiter characters containing two or more *hard*
884 /// delimiter characters in `input`, any intervening *soft* delimiter
885 /// characters are associated with the previous (*hard*) delimiter. Any
886 /// leading soft delimiter characters -- i.e., those preceding the first
887 /// *token* or *hard* delimiter character (referred to as the *leader*)
888 /// -- are available immediately after construction via the
889 /// `previousDelimiter` method. The behavior is undefined unless all
890 /// supplied delimiter characters are unique. Note that the behavior is
891 /// also undefined if this object is used in any way (other than to
892 /// reset or destroy it) after its underlying `input` string is
893 /// modified. Also note that the current token and (trailing) delimiter
894 /// may be accessed only while this object is in the valid state;
895 /// however, the previous delimiter (or *leader*) is always accessible.
896 /// Also note that all token and delimiter strings are returned as
897 /// references into the underlying `input` string, and hence remain
898 /// valid so long as that string is not modified or destroyed --
899 /// irrespective of the state (or even the existence) of this object.
900 /// Finally note that supplying a default constructed @ref string_view is
901 /// equivalent to supplying an empty c-string (i.e., "").
903 const bsl::string_view& soft,
904 const bsl::string_view& hard);
905
906 /// Destroy this object.
908
909 // MANIPULATORS
910
911 /// Advance the iteration state of this object to refer to the next
912 /// sequence of previous delimiter, current token, and current
913 /// (trailing) delimiter in the underlying input sequence, and return a
914 /// reference providing modifiable access to this object. The current
915 /// delimiter reference becomes the previous one. If there is another
916 /// token remaining in the input, the current token and delimiter are
917 /// updated to refer to the respective new token and (trailing)
918 /// delimiter values -- either of which (but not both) might be empty.
919 /// If there are no tokens remaining in the input, the iteration state
920 /// of this object becomes invalid. The behavior is undefined unless
921 /// the iteration state of this object is initially valid, or if the
922 /// underlying input has been modified or destroyed since this object
923 /// was most recently reset (or created).
925
926 void reset(const char *input);
927 /// Rebind this object to refer to the specified sequence of `input`
928 /// characters. The state of the tokenizer following this call is *as*
929 /// *if* it had been constructed with `input` and its current sets of
930 /// *soft* and *hard* delimiter characters. The behavior is
931 /// undefined if this object is used in any way (other than to reset or
932 /// destroy it) after its underlying `input` string is modified. Note
933 /// that supplying a default constructed @ref string_view is equivalent to
934 /// supplying an empty c-string (i.e., "").
935 void reset(const bsl::string_view& input);
936
937 // ACCESSORS
938
939 /// Return `true` if the previous delimiter (or *leader*) contains a
940 /// *soft* delimiter character, and `false` otherwise. The behavior is
941 /// undefined if the underlying input itself has been modified or
942 /// destroyed since this object was most recently reset (or created).
943 bool hasPreviousSoft() const;
944
945 /// Return `true` if the current (trailing) delimiter contains a *soft*
946 /// delimiter character, and `false` otherwise. The behavior is
947 /// undefined if the iteration state of this object is initially
948 /// invalid, or if the underlying input itself has been modified or
949 /// destroyed since this object was most recently reset (or created).
950 bool hasTrailingSoft() const;
951
952 /// Return `true` if the previous delimiter contains a *hard* delimiter
953 /// character, and `false` otherwise. The behavior is undefined if the
954 /// underlying input itself has been modified or destroyed since this
955 /// object was most recently reset (or created).
956 bool isPreviousHard() const;
957
958 /// Return `true` if the current (trailing) delimiter contains a *hard*
959 /// delimiter character, and `false` otherwise. The behavior is
960 /// undefined if the iteration state of this object is initially
961 /// invalid, or if the underlying input itself has been modified or
962 /// destroyed since this object was most recently reset (or created).
963 bool isTrailingHard() const;
964
965 /// Return `true` if the iteration state of this object is valid, and
966 /// `false` otherwise. Note that the behavior of advancing the
967 /// iteration state as well as accessing the current token or (trailing)
968 /// delimiter is undefined unless the current iteration state of this
969 /// object is valid.
970 bool isValid() const;
971
972 /// Return a reference to the non-modifiable previous delimiter (or
973 /// *leader*) in the input string. The behavior is undefined if the
974 /// underlying input has been modified or destroyed since this object
975 /// was most recently reset (or created).
977
978 /// Return a reference to the non-modifiable current token (i.e.,
979 /// maximal sequence of non-delimiter characters) in the input string.
980 /// The returned reference remains valid so long as the underlying input
981 /// is not modified or destroyed -- irrespective of the state (or
982 /// existence) of this object. The behavior is undefined unless the
983 /// iteration state of this object is initially valid, or if the
984 /// underlying input has been modified or destroyed since this object
985 /// was most recently reset (or created).
986 bslstl::StringRef token() const;
987
988
989 /// Return a reference to the non-modifiable current (trailing)
990 /// delimiter (maximal sequence of one or more delimiter characters
991 /// containing at most one *hard* delimiter character) in the input
992 /// string. The returned reference remains valid so long as the
993 /// underlying input is not modified or destroyed -- irrespective of the
994 /// state (or existence) of this object. The behavior is undefined
995 /// unless the iteration state of this object is initially valid, or if
996 /// the underlying input has been modified or destroyed since this
997 /// object was most recently reset (or created).
999
1000
1001 // iterators
1002
1003 /// Return an iterator referring to the first token in this object's
1004 /// input string (the past-the-end iterator if this object's iteration
1005 /// state is initially invalid). The returned iterator remains valid as
1006 /// long as the underlying input has not been modified or destroyed since
1007 /// this object was most recently reset (or created).
1009
1010 /// Return an iterator referring to the position beyond the last token in
1011 /// this object's input string. The returned iterator remains valid as
1012 /// long as the underlying input has not been modified or destroyed since
1013 /// this object was most recently reset (or created).
1014 iterator end() const;
1015};
1016
1017// ============================================================================
1018// INLINE DEFINITIONS
1019// ============================================================================
1020
1021 // --------------------------
1022 // class bdlb::Tokenizer_Data
1023 // --------------------------
1024// ACCESSORS
1025inline
1026int Tokenizer_Data::inputType(char character) const
1027{
1028 return d_charTypes[static_cast<unsigned char>(character)];  // cast to `unsigned char` keeps the index non-negative even where `char` is signed
1029}
1030
1031 // ---------------------------
1032 // class bdlb::Tokenizer_Proxy
1033 // ---------------------------
1034// CREATORS
1035inline
1037: d_obj(obj)
1038{
1039}
1040
1041// OPERATORS
1042inline
1044{
1045 return &d_obj;
1046}
1047
1048 // -----------------------------
1049 // class bdlb::TokenizerIterator
1050 // -----------------------------
1051// ACCESSORS
1052inline
1054{
1055 // Called on invalid iterator
1056 BSLS_REVIEW(!d_endFlag);
1057 return bslstl::StringRef(d_token_p, d_postDelim_p);
1058}
1059
1060inline
1062{
1063 // Called on invalid iterator
1064 BSLS_REVIEW(!d_endFlag);
1065 return Tokenizer_Proxy(this->operator*());
1066}
1067
1068 // ---------------------
1069 // class bdlb::Tokenizer
1070 // ---------------------
1071// ACCESSORS
1072inline
1074{
1075 return !d_endFlag;
1076}
1077
1078inline
1080{
1081 return bslstl::StringRef(d_prevDelim_p, d_token_p);
1082}
1083
1084inline
1086{
1087 // Called on invalid tokenizer
1088 BSLS_REVIEW(!d_endFlag);
1089 return bslstl::StringRef(d_token_p, d_postDelim_p);
1090}
1091
1092inline
1094{
1095 // Called on invalid tokenizer
1096 BSLS_REVIEW(!d_endFlag);
1097 return bslstl::StringRef(d_postDelim_p, d_cursor_p);
1098}
1099
1100} // close package namespace
1101
1102// FREE OPERATORS
1103inline
1105 const bdlb::TokenizerIterator& rhs)
1106{
1107 // Fast path decision
1108 if (lhs.d_endFlag != rhs.d_endFlag) {
1109 return false; // RETURN
1110 }
1111
1112 // Comparing end iterators
1113 if (lhs.d_endFlag && rhs.d_endFlag) {
1114 return true; // RETURN
1115 }
1116
1117 return lhs.d_token_p == rhs.d_token_p;
1118}
1119
1120inline
1122 const bdlb::TokenizerIterator& rhs)
1123{
1124 return !(lhs == rhs);
1125}
1126
1127inline
1129 int)
1130{
1131 bdlb::TokenizerIterator tmp(object);
1132 ++object;
1133 return tmp;
1134}
1135
1136
1137
1138#endif
1139
1140// ----------------------------------------------------------------------------
1141// Copyright 2015 Bloomberg Finance L.P.
1142//
1143// Licensed under the Apache License, Version 2.0 (the "License");
1144// you may not use this file except in compliance with the License.
1145// You may obtain a copy of the License at
1146//
1147// http://www.apache.org/licenses/LICENSE-2.0
1148//
1149// Unless required by applicable law or agreed to in writing, software
1150// distributed under the License is distributed on an "AS IS" BASIS,
1151// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1152// See the License for the specific language governing permissions and
1153// limitations under the License.
1154// ----------------------------- END-OF-FILE ----------------------------------
1155
1156/** @} */
1157/** @} */
1158/** @} */
Definition bdlb_tokenizer.h:716
friend bool operator!=(const TokenizerIterator &, const TokenizerIterator &)
Tokenizer_Proxy pointer
Definition bdlb_tokenizer.h:745
bslstl::StringRef value_type
Definition bdlb_tokenizer.h:743
TokenizerIterator(const TokenizerIterator &origin)
TokenizerIterator & operator++()
const bslstl::StringRef operator*() const
Definition bdlb_tokenizer.h:1053
friend bool operator==(const TokenizerIterator &, const TokenizerIterator &)
Tokenizer_Proxy operator->() const
Definition bdlb_tokenizer.h:1061
bsl::input_iterator_tag iterator_category
Definition bdlb_tokenizer.h:750
const bslstl::StringRef reference
Definition bdlb_tokenizer.h:746
int difference_type
Definition bdlb_tokenizer.h:744
TokenizerIterator & operator=(const TokenizerIterator &rhs)
Definition bdlb_tokenizer.h:608
int inputType(char character) const
Definition bdlb_tokenizer.h:1026
Tokenizer_Data(const bsl::string_view &softDelimiters, const bsl::string_view &hardDelimiters)
Tokenizer_Data(const bsl::string_view &softDelimiters)
Definition bdlb_tokenizer.h:661
Tokenizer_Proxy(const bsl::string_view &obj)
Create a Tokenizer_Proxy object with a copy of the specified obj.
Definition bdlb_tokenizer.h:1036
const bslstl::StringRef * operator->() const
Return a pointer to the object contained by the Tokenizer_Proxy.
Definition bdlb_tokenizer.h:1043
Definition bdlb_tokenizer.h:834
void reset(const bsl::string_view &input)
bool isValid() const
Definition bdlb_tokenizer.h:1073
~Tokenizer()
Destroy this object.
iterator end() const
Tokenizer(const bsl::string_view &input, const bsl::string_view &soft, const bsl::string_view &hard)
bool hasTrailingSoft() const
bslstl::StringRef token() const
Definition bdlb_tokenizer.h:1085
bslstl::StringRef trailingDelimiter() const
Definition bdlb_tokenizer.h:1093
TokenizerIterator iterator
Definition bdlb_tokenizer.h:866
bslstl::StringRef previousDelimiter() const
Definition bdlb_tokenizer.h:1079
Tokenizer(const char *input, const bsl::string_view &soft)
iterator begin() const
bool isTrailingHard() const
Tokenizer & operator++()
Tokenizer(const bsl::string_view &input, const bsl::string_view &soft)
bool hasPreviousSoft() const
Tokenizer(const char *input, const bsl::string_view &soft, const bsl::string_view &hard)
bool isPreviousHard() const
void reset(const char *input)
Definition bslstl_stringview.h:441
Definition bslstl_stringref.h:372
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
#define BSLS_KEYWORD_DELETED
Definition bsls_keyword.h:609
#define BSLS_REVIEW(X)
Definition bsls_review.h:949
Definition bdlb_algorithmworkaroundutil.h:74
bool operator!=(const BigEndianInt16 &lhs, const BigEndianInt16 &rhs)
FunctionOutputIterator< FUNCTION > & operator++(FunctionOutputIterator< FUNCTION > &iterator)
Do nothing and return specified iterator.
Definition bdlb_functionoutputiterator.h:405
bool operator==(const BigEndianInt16 &lhs, const BigEndianInt16 &rhs)
StringRefImp< char > StringRef
Definition bslstl_stringref.h:699