// bdlb_tokenizer.h                                                   -*-C++-*-
#ifndef INCLUDED_BDLB_TOKENIZER
#define INCLUDED_BDLB_TOKENIZER

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide access to user-described tokens via string references.
//
//@CLASSES:
//  bdlb::Tokenizer: lexer for tokens defined via hard and/or soft delimiters
//  bdlb::TokenizerIterator: input iterator for delimited tokens in a string
//
//@SEE_ALSO: bslstl_stringref
//
//@DESCRIPTION: This component defines a mechanism, 'bdlb::Tokenizer', that
// provides non-destructive sequential (read-only) access to tokens in a given
// input string as characterized by two disjoint sets of user-specified
// delimiter characters, each of which is supplied at construction via either a
// 'const bsl::string_view&' or (for efficiency, when only the leading
// characters of the input string may need to be parsed) a 'const char *'.
// Note that each character (including '\0') that is not explicitly designated
// as a delimiter character is assumed to be a *token* character.
//
///Soft versus Hard Delimiters
///---------------------------
// The tokenizer recognizes two distinct kinds of delimiter characters, *soft*
// and *hard*.
//
// A *soft* *delimiter* is a maximal (non-empty) sequence of soft-delimiter
// characters.  Soft delimiters, typically whitespace characters, are used to
// separate (rather than terminate) tokens, and thus never result in an empty
// token.
//
// A *hard* *delimiter* is a maximal (non-empty) sequence of delimiter
// characters containing exactly one hard-delimiter character.  Hard
// delimiters, typically printable punctuation characters such as slash ('/')
// or colon (':'), are used to terminate (rather than just separate) tokens,
// and thus a hard delimiter that is not preceded by a token character results
// in an empty token.
//
// Soft delimiters are used in applications where multiple consecutive
// delimiter characters are to be treated as just a single delimiter.  For
// example, if we want the input string '"Sticks  and stones"' to parse into a
// sequence of three non-empty tokens ["Sticks", "and", "stones"], rather than
// the four-token sequence ["Sticks", "", "and", "stones"], we would make the
// space (' ') a soft-delimiter character.
//
// Hard delimiters are used in applications where consecutive delimiter
// characters are to be treated as separate delimiters, giving rise to the
// possibility of empty tokens.  Making the slash ('/') in the standard date
// format a hard delimiter for the input string "15//9" yields the three-token
// sequence ["15", "", "9"], rather than the two-token one ["15", "9"] had it
// been made soft.
//
// All members within each respective character set are considered equivalent
// with respect to tokenization.  For example, making '/' and ':' *soft*
// *delimiter* characters on the questionably formatted date "2015/:10:/31"
// would yield the token sequence ["2015", "10", "31"], whereas making '/' and
// ':' *hard* *delimiter* characters would result in the token sequence
// ["2015", "", "10", "", "31"].  Making either of these two delimiter
// characters hard and the other soft would, in this example, yield the former
// (shorter) sequence of tokens.  The details of how soft and hard delimiters
// interact are illustrated in more detail in the following section (but also
// see, later on, the section on "Comprehensive Detailed Parsing
// Specification").
//
///The Input String to be Tokenized
///--------------------------------
// Each input string consists of an optional leading sequence of soft-delimiter
// characters called the *leader*, followed by an alternating sequence of
// tokens and delimiters (the final delimiter being optional):
//..
//  Input String:
//  +--------+---------+-------------+---...---+---------+-------------+
//  | leader | token_1 | delimiter_1 |         | token_N | delimiter_N |
//  +--------+---------+-------------+---...---+---------+-------------+
//  (optional)                                              (optional)
//..
// The tokenization of a string can also be expressed as pseudo-Posix regular
// expression notation:
//..
//   delimiter = [[:soft:]]+ | [[:soft:]]* [[:hard:]] [[:soft:]]*
//   token     = [^[:soft:][:hard:]]*
//   string    = [[:soft:]]* (token delimiter)* token?
//..
// Parsing is from left to right and is *greedy* -- i.e., the longest sequence
// satisfying the regular expression is the one that matches.  For example, let
// 's' represent the start of a soft delimiter, 'h' the start of a hard
// delimiter, '^' the start of a token, and '~' the continuation of that same
// delimiter or token.  Using '.' as a soft delimiter and '/' as a hard one,
// the string
//..
//         s~   h~  h~~  h~     s~   hh  s  h~h    h~~~        Delimiters
//
//        "..One/.if./.by./land,..two//if.by/./sea!./.."
//
//           ^~~  ^~   ^~  ^~~~~  ^~~ ^^~ ^~  ^^~~~            Tokens
//                                    |       |
//                                 (empty)  (empty)
//..
// yields the tokenization
//..
//     [One]  [if]   [by]  [land,]  [two] [] [if] [by]  [] [sea!]      Tokens
//
//  (..)   (/.)  (./.)  (./)     (..)   (/)(/)  (.)  (/.)(/)   (./..)  Delims
//..
// Notice that in the pair of hard delimiters "/./" before the token "sea!",
// the soft-delimiter character between the two hard ones binds to the earlier
// delimiter.
//
///Iterating using a 'TokenizerIterator' object (ACCESS TO TOKENS ONLY)
///--------------------------------------------------------------------
// This component provides two separate mechanisms by which a user may iterate
// over a sequence of tokens.  The first mechanism is as a *token* *range*,
// exposed by the 'TokenizerIterator' objects returned by the 'begin' and 'end'
// methods on a 'Tokenizer' object.  A 'TokenizerIterator' supports the concept
// of a standard *input* *iterator*, returning each successive token as a
// 'bslstl::StringRef', making it suitable for generic use -- e.g., in a
// range-based 'for' loop:
//..
//  void parse_1(bsl::ostream& output, const char *input)
//      // Print, to the specified 'output' stream, each whitespace-delimited
//      // token in the specified 'input' string on a separate line following
//      // a vertical bar ('|') and a hard space (' ').
//  {
//      const char softDelimiters[] = " \t\n";  // whitespace
//
//      for (bslstl::StringRef token : bdlb::Tokenizer(input, softDelimiters)) {
//          output << "| " << token << bsl::endl;
//      }
//  }
//..
// The 'parse_1' function above produces each (non-whitespace) token in the
// supplied input string on a separate line.  So, were 'parse_1' to be given a
// reference to 'bsl::cout' and the input string
//..
//  " Times  like \tthese\n  try \n \t men's\t \tsouls.\n"
//..
// we would expect
//..
//  | Times
//  | like
//  | these
//  | try
//  | men's
//  | souls.
//..
// to be displayed on 'bsl::cout'.  Note that there is no way to access the
// delimiters from a 'TokenizerIterator' directly; for that we will need to
// use the 'Tokenizer' itself as a non-standard "iterator".
//
///Iterating using a 'Tokenizer' object (ACCESS TO TOKENS AND DELIMITERS)
///----------------------------------------------------------------------
// The second mechanism, not intended for generic use, provides direct access
// to the previous and current (trailing) delimiters as well as the current
// token:
//..
//  void parse_2(bsl::ostream& output, const char *input)
//      // Print, to the specified 'output' stream the leader of the specified
//      // 'input', on a single line, followed by subsequent current token and
//      // (trailing) delimiter pairs on successive lines, each line beginning
//      // with a vertical bar ('|') followed by a tab ('\t') character.
//  {
//      const char softDelimiters[] = " ";
//      const char hardDelimiters[] = ":/";
//
//      bdlb::Tokenizer it(input, softDelimiters, hardDelimiters);
//      output << "| " << '"' << it.previousDelimiter() << '"' << "\n";
//
//      for (; it.isValid(); ++it) {
//          output << "|\t"
//                 << '"' << it.token() << '"'
//                 << "\t"
//                 << '"' << it.trailingDelimiter() << '"'
//                 << "\n";
//      }
//  }
//..
// The parse_2 function above produces the *leader* on the first line,
// followed by each *token* along with its current (trailing) delimiter on
// successive lines.  So, were 'parse_2' to be given a reference to
// 'bsl::cout' and the input string
//..
//  " I've :been: a : :bad:/ boy! / "
//..
// we would expect
//..
//  |       " "
//  |       "I've"  " :"
//  |       "been"  ": "
//  |       "a"     " : "
//  |       ""      ":"
//  |       "bad"   ":"
//  |       ""      "/ "
//  |       "boy!"  " / "
//..
// to be displayed on 'bsl::cout'.
//
///Token and Delimiter Lifetimes
///-----------------------------
// All tokens and delimiters are returned efficiently by value as
// 'bslstl::StringRef' objects, which naturally remain valid so long as the
// underlying input string remains unchanged -- irrespective of the validity
// of the 'tokenizer' or any of its dispensed token iterators.  Note, however,
// that all such token iterators are invalidated if the parent tokenizer object
// is destroyed or reset.  Note also that the previous delimiter field remains
// accessible from a 'Tokenizer' object even after it has reached the end of
// its input.  Also note that the *leader* is accessible, using the
// 'previousDelimiter' method prior to advancing the iteration state of the
// 'Tokenizer'.
//
///Comprehensive Detailed Parsing Specification
///--------------------------------------------
// This section provides a comprehensive (length-ordered) enumeration of how
// the 'bdlb::Tokenizer' performs, according to its three (non-null) character
// types:
//..
//  '.' = any *soft* delimiter character
//  '#' = any *hard* delimiter character
//  'T' = any token character
//..
// Here's how iteration progresses for various input strings.  Note that input
// strings having consecutive characters of the same category that naturally
// coalesce (i.e., behave as if they were a single character of that category)
// -- namely soft-delimiter or token characters -- are labeled with '(%)'.
// For example, consider the input ".." at the top of the [length 2] section
// below.  The table indicates, with a (%) in the first column, that the input
// acts the same as if it were a single (soft-delimiter) character (i.e., ".").
// There is only one line in this row of the table because, upon construction,
// the iterator is immediately invalid (as indicated by the right-most column).
// Now consider the "##" entry near the bottom of [length 2].  These
// (hard-delimiter) characters do not coalesce.  What's more, the iterator on
// construction is valid and produces an empty leader and empty first token.
// After advancing the tokenizer, the second line of that row shows the
// current state of iteration with the previous delimiter being a '#' as well
// as the current one.  The current token is again shown as empty.  After
// advancing the tokenizer again, we now see that the iterator is invalid, yet
// the previous delimiter (still accessible) is a '#'.
//..
//  (%) = repeat   Previous   Current   Current   Iterator
//  Input String   Delimiter   Token   Delimiter   Status
//  ============   =========  =======  =========  ========  [length 0]
//  ""             ""         na       na         invalid
//
//  ============   =========  =======  =========  ========  [length 1]
//  "."            "."        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#"            ""         ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T"            ""         "T"      ""         valid
//                 ""         na       na         invalid
//
//  ============   =========  =======  =========  ========  [length 2]
//  ".."     (%)   ".."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".#"           "."        ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".T"           "."        "T"      ""         valid
//                 ""         na       na         invalid
//
//  ------------   ---------  -------  ---------  --------
//  "#."           ""         ""       "#."       valid
//                 "#."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "##"           ""         ""       "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#T"           ""         ""       "#"        valid
//                 "#"        "T"      ""         valid
//                 ""         na       na         invalid
//
//  ------------   ---------  -------  ---------  --------
//  "T."           ""         "T"      "."        valid
//                 "."        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T#"           ""         "T"      "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "TT"     (%)   ""         "TT"     ""         valid
//                 ""         na       na         invalid
//
//  ============   =========  =======  =========  ========  [length 3]
//  "..."    (%)   "..."      na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "..#"    (%)   ".."       ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "..T"    (%)   ".."       "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".#."          "."        ""       "#."       valid
//                 "#."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".##"          "."        ""       "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".#T"          "."        ""       "#"        valid
//                 "#"        "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".T."          "."        "T"      "."        valid
//                 "."        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".T#"          "."        "T"      "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  ".TT"    (%)   "."        "TT"     ""         valid
//                 ""         na       na         invalid
//
//  ------------   ---------  -------  ---------  --------
//  "#.."    (%)   ""         ""       "#.."      valid
//                 "#.."      na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#.#"          ""         ""       "#."       valid
//                 "#."       ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#.T"          ""         ""       "#."       valid
//                 "#."       "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "##."          ""         ""       "#"        valid
//                 "#"        ""       "#."       valid
//                 "#."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "###"          ""         ""       "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "##T"          ""         ""       "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#T."          ""         ""       "#"        valid
//                 "#"        "T"      "."        valid
//                 "."        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#T#"          ""         ""       "#"        valid
//                 "#"        "T"      "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "#TT"    (%)   ""         ""       "#"        valid
//                 "#"        "TT"     ""         valid
//                 ""         na       na         invalid
//
//  ------------   ---------  -------  ---------  --------
//  "T.."    (%)   ""         "T"      ".."       valid
//                 ".."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T.#"          ""         "T"      ".#"       valid
//                 ".#"       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T.T"          ""         "T"      "."        valid
//                 "."        "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T#."          ""         "T"      "#."       valid
//                 "#."       na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T##"          ""         "T"      "#"        valid
//                 "#"        ""       "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "T#T"          ""         "T"      "#"        valid
//                 "#"        "T"      ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "TT."    (%)   ""         "TT"     "."        valid
//                 "."        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "TT#"    (%)   ""         "TT"     "#"        valid
//                 "#"        na       na         invalid
//  ------------   ---------  -------  ---------  --------
//  "TTT"    (%)   ""         "TTT"    ""         valid
//                 ""         na       na         invalid
//  ------------   ---------  -------  ---------  --------
//..
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Iterating Over Tokens Using Just *Soft* Delimiters
///- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// This example illustrates the process of splitting the input string into a
// sequence of tokens using just soft delimiters.
//
// Suppose, we have a text where words are separated with a variable number of
// spaces and we want to remove all duplicated spaces.
//
// First, we create an example character array:
//..
//  const char text1[] = "   This  is    a test.";
//..
// Then, we create a 'Tokenizer' that uses " "(space) as a soft delimiter:
//..
//  bdlb::Tokenizer tokenizer1(text1, " ");
//..
// Note that the tokenizer skips the leading soft delimiters upon
// initialization.  Next, we iterate the input character array and build the
// string without duplicated spaces:
//..
//  bsl::string result1;
//  if (tokenizer1.isValid()) {
//      result1 += tokenizer1.token();
//      ++tokenizer1;
//  }
//  while (tokenizer1.isValid()) {
//      result1 += " ";
//      result1 += tokenizer1.token();
//      ++tokenizer1;
//  }
//..
// Finally, we verify that the resulting string contains the expected result:
//..
//  const bsl::string EXPECTED1("This is a test.");
//  assert(EXPECTED1 == result1);
//..
//
///Example 2: Iterating Over Tokens Using Just *Hard* Delimiters
///- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// This example illustrates the process of splitting the input string into a
// sequence of tokens using just hard delimiters.
//
// Suppose we want to reformat a comma-separated-value file and insert the
// default value of '0' into missing columns.
//
// First, we create an example CSV line:
//..
//  const char text2[] = "Col1,Col2,Col3\n111,,133\n,222,\n311,322,\n";
//..
// Then, we create a 'Tokenizer' that uses ","(comma) and "\n"(new-line) as
// hard delimiters:
//..
//  bdlb::Tokenizer tokenizer2(text2, "", ",\n");
//..
// We use the 'trailingDelimiter' accessor to insert correct delimiter into the
// output string.  Next, we iterate the input line and insert the default
// value:
//..
//  string result2;
//  while (tokenizer2.isValid()) {
//      if (tokenizer2.token() != "") {
//          result2 += tokenizer2.token();
//      } else {
//          result2 += "0";
//      }
//      result2 += tokenizer2.trailingDelimiter();
//      ++tokenizer2;
//  }
//..
// Finally, we verify that the resulting string contains the expected result:
//..
//  const string EXPECTED2("Col1,Col2,Col3\n111,0,133\n0,222,0\n311,322,0\n");
//  assert(EXPECTED2 == result2);
//..
//
///Example 3: Iterating Over Tokens Using Both *Hard* and *Soft* Delimiters
/// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// This example illustrates the process of splitting the input string into a
// sequence of tokens using both soft and hard delimiters.
//
// Suppose, we want to extract the tokens from a file, where the fields are
// separated with a "$"(dollar-sign), but can have leading or trailing spaces.
//
// First, we create an example line:
//..
//  const char text3[] = " This $is    $   a$ test.      ";
//..
// Then, we create a 'Tokenizer' that uses "$"(dollar-sign) as a hard delimiter
// and " "(space) as a soft delimiter:
//..
//  bdlb::Tokenizer tokenizer3(text3, " ", "$");
//..
// In this example we are only extracting the tokens, and can use the iterator
// provided by the tokenizer.
//
// Next, we create an iterator and iterate over the input, extracting the
// tokens into the result string:
//..
//  string result3;
//
//  bdlb::Tokenizer::iterator it3 = tokenizer3.begin();
//
//  if (it3 != tokenizer3.end()) {
//      result3 += *it3;
//      ++it3;
//  }
//
//  while (it3 != tokenizer3.end()) {
//      result3 += " ";
//      result3 += *it3;
//      ++it3;
//  }
//..
// Finally, we verify that the resulting string contains the expected result:
//..
//  const string EXPECTED3("This is a test.");
//  assert(EXPECTED3 == result3);
//..

#include <bdlscm_version.h>

#include <bsl_string.h>

#include <bsls_assert.h>
#include <bsls_compilerfeatures.h>
#include <bsls_keyword.h>
#include <bsls_libraryfeatures.h>
#include <bsls_platform.h>
#include <bsls_review.h>

#include <bsl_iterator.h>

namespace BloombergLP {

namespace bdlb {
                        // ============================
                        // private class Tokenizer_Data
                        // ============================

class Tokenizer_Data {
    // This component-private class is used to hold delimiter information,
    // i.e., the classification of each possible 'char' value as a *token*,
    // *soft* delimiter, or *hard* delimiter character.  Each 'Tokenizer'
    // object will have, as a private data member, an object of this class,
    // and will pass the address of that member to the (private) constructor
    // of each 'TokenizerIterator' object it issues:
    //..
    //  +--------------------------------------+
    //  |   ,--------------.                   |
    //  |  ( Tokenizer_Data )                  |
    //  |   `--------------'\                  |
    //  |          |         \                 |
    //  |          |     ,----*------------.   |
    //  |          |    ( TokenizerIterator )  |
    //  |          |    /`-----------------'   |
    //  |          |   /                       |
    //  |     ,----*--o-.                      |
    //  |    ( Tokenizer )                     |
    //  |     `---------'                      |
    //  +--------------------------------------+
    //  bdlb_tokenizer
    //..

    // PRIVATE CONSTANTS
    enum {
        k_MAX_CHARS = 256  // maximum # of unique values for an 8-bit 'char'
    };

    // DATA
    char d_charTypes[k_MAX_CHARS];  // table of SOFT / HARD / TOKEN characters

  private:
    // NOT IMPLEMENTED
    // Copy construction and copy assignment are declared private and marked
    // 'BSLS_KEYWORD_DELETED'; objects of this class are not copyable.
    Tokenizer_Data(const Tokenizer_Data&) BSLS_KEYWORD_DELETED;
    Tokenizer_Data& operator=(const Tokenizer_Data&) BSLS_KEYWORD_DELETED;

  public:
    // CREATORS
    explicit Tokenizer_Data(const bsl::string_view& softDelimiters);
    Tokenizer_Data(const bsl::string_view& softDelimiters,
                   const bsl::string_view& hardDelimiters);
        // Create a 'Tokenizer_Data' object and load the 'd_charTypes' data
        // member such that it has the same value *as* *if* this (overly
        // prescriptive) algorithm were used: (I) initialize each entry in
        // the 'd_charTypes' array to a value indicating that the character
        // having that 'index' as its (e.g., ASCII) representation is a
        // *token* character; (II) then, for each character in the specified
        // 'softDelimiters' sequence, overwrite the element at the
        // corresponding index in 'd_charTypes' with a value that indicates
        // that the character is a *soft* delimiter character; (III) finally,
        // for each character in the specified 'hardDelimiters' sequence,
        // overwrite the element at the corresponding index with a distinct
        // value that indicates the character is a *hard* delimiter
        // character.  The single-argument overload takes no
        // 'hardDelimiters', so step (III) presumably designates no *hard*
        // delimiter characters in that case (confirm against the
        // implementation).  Note that duplicate delimiter characters in the
        // respective inputs are naturally ignored, and that a character that
        // appears in both sets would naturally be considered *hard*.  Also
        // note that it is entirely reasonable to state, in any public
        // interface, that the behavior is undefined unless the characters in
        // the union of the two delimiter sequences are unique.

    // ACCESSORS
    int inputType(char character) const;
        // Return the input type of the specified 'character': 0 for token,
        // 1 for soft delimiter, 2 for hard delimiter.
};

                        // =====================
                        // class Tokenizer_Proxy
                        // =====================

class Tokenizer_Proxy {
    // This class provides a proxy that holds a 'bslstl::StringRef' by value,
    // allowing 'TokenizerIterator::operator->' (which returns an object of
    // this class) to give pointer-like access to that string reference.

    // DATA
    bslstl::StringRef d_obj; // The object

  private:
    // NOT IMPLEMENTED
    // Copy assignment is declared private and marked 'BSLS_KEYWORD_DELETED';
    // a proxy cannot be re-assigned after construction.
    Tokenizer_Proxy& operator=(const Tokenizer_Proxy&) BSLS_KEYWORD_DELETED;

  public:
    // CREATORS
    Tokenizer_Proxy(const bsl::string_view &obj);
        // Create a 'Tokenizer_Proxy' object with a copy of the specified
        // 'obj'.

#ifdef BSLS_COMPILERFEATURES_SUPPORT_DEFAULTED_FUNCTIONS
    // These declarations are visible only when the compiler supports
    // defaulted functions.
    Tokenizer_Proxy(const Tokenizer_Proxy& original) = default;
        // Create a 'Tokenizer_Proxy' object having the same value as the
        // specified 'original' object.  Note that this copy constructor is
        // generated by the compiler.

    ~Tokenizer_Proxy() = default;
        // Destroy this object.
#endif

    // OPERATORS
    const bslstl::StringRef *operator->() const;
        // Return a pointer to the (non-modifiable) object contained by this
        // 'Tokenizer_Proxy'.
};

                        // =======================
                        // class TokenizerIterator
                        // =======================

class TokenizerIterator
#if defined(BSLS_LIBRARYFEATURES_STDCPP_LIBCSTD)
// Sun CC workaround: iterators must be derived from 'std::iterator' to work
// with the native std library algorithms.  However, 'std::iterator' is
// deprecated in C++17, so do not rely on derivation unless required, to avoid
// deprecation warnings on modern compilers.  The template arguments below
// mirror the member typedefs declared in the 'TYPES' section of this class.
    : public bsl::iterator<bsl::input_iterator_tag,
                           bslstl::StringRef,
                           int,
                           Tokenizer_Proxy,
                           const bslstl::StringRef>
#endif  // BSLS_LIBRARYFEATURES_STDCPP_LIBCSTD
{
    // This class provides a C++-standards-conforming input iterator over the
    // tokens in the input string supplied at construction (along with the
    // designation of *soft* and *hard* delimiter characters) to a 'Tokenizer'
    // object.  Tokens are returned -- by value -- as 'bslstl::StringRef'
    // objects, which means that the returned string references remain valid
    // until the underlying input string itself is modified or destroyed.
    // Note that all iterators are invalidated whenever the input string in
    // the parent 'Tokenizer' changes.


    // DATA
    const Tokenizer_Data *d_sharedData_p;  // (address of) character categories
    const char           *d_cursor_p;      // tail of parsed input
    const char           *d_token_p;       // current token
    const char           *d_postDelim_p;   // current (trailing) delimiter
    const char           *d_end_p;         // one past input; 0 for '(char *)'
    bool                  d_endFlag;       // set 'true' when at end of input

    // FRIENDS
    friend class Tokenizer;
    friend bool operator==(const TokenizerIterator&, const TokenizerIterator&);
    friend bool operator!=(const TokenizerIterator&, const TokenizerIterator&);

    // PRIVATE CREATORS
    TokenizerIterator(const char           *input,
                      const char           *end,
                      const Tokenizer_Data *sharedData);
        // Create a 'TokenizerIterator' object bound to the specified sequence
        // of 'input' characters ending at the specified 'end' and the
        // specified delimiter and token mapper 'sharedData'.  Note that this
        // constructor is private; 'Tokenizer' is a friend of this class.

  public:
    // TYPES
    typedef bslstl::StringRef        value_type;
    typedef int                      difference_type;
    typedef Tokenizer_Proxy          pointer;
    typedef const bslstl::StringRef  reference;
    typedef bsl::input_iterator_tag  iterator_category;
        // Defines a type alias for the tag type that represents the iterator
        // concept this class models.

    // CREATORS
    TokenizerIterator();
        // Create a default-constructed 'TokenizerIterator' object.
        // NOTE(review): the state of a default-constructed iterator is not
        // specified in this header -- confirm against the implementation
        // before relying on it.

    TokenizerIterator(const TokenizerIterator& origin);
        // Create a 'TokenizerIterator' object having the value of the
        // specified 'origin' iterator.

    // MANIPULATORS
    TokenizerIterator& operator=(const TokenizerIterator& rhs);
        // Assign to this object the value of the specified 'rhs' iterator, and
        // return a reference providing modifiable access to this object.

    TokenizerIterator& operator++();
        // Advance the iteration state of this object to refer to the next
        // token in the underlying input sequence, and return a reference
        // providing modifiable access to this object.  The behavior is
        // undefined unless the iteration state of this object is initially
        // valid and the underlying input has not been modified or destroyed
        // since this object was created.

    // ACCESSORS
    const bslstl::StringRef operator*() const;
        // Return (by value) a string reference to the non-modifiable current
        // token (i.e., maximal sequence of non-delimiter characters) in the
        // input string.  The returned string reference remains valid so long
        // as the underlying input is not modified or destroyed --
        // irrespective of the state (or existence) of this object.  The
        // behavior is undefined unless the iteration state of this object is
        // initially valid and the underlying input has not been modified or
        // destroyed since this object was created.

    Tokenizer_Proxy operator->() const;
        // Return a proxy object containing the non-modifiable current token
        // (i.e., maximal sequence of non-delimiter characters) in the input
        // string.  The returned proxy remains valid so long as the underlying
        // input is not modified or destroyed -- irrespective of the state (or
        // existence) of this object.  The behavior is undefined unless the
        // iteration state of this object is initially valid and the
        // underlying input has not been modified or destroyed since this
        // object was created.
};

// FREE OPERATORS
bool operator==(const TokenizerIterator& lhs, const TokenizerIterator& rhs);
    // Return 'true' if the specified 'lhs' and 'rhs' objects have the same
    // value, and 'false' otherwise.  Two 'TokenizerIterator' objects have the
    // same value if both of them refer to the same token within the same
    // tokenized string or if they both refer past the end of the tokenized
    // string.  The behavior is undefined unless both iterators were returned
    // by the same 'Tokenizer' object and the underlying input has been
    // neither modified nor destroyed since those iterators were created.

bool operator!=(const TokenizerIterator& lhs, const TokenizerIterator& rhs);
    // Return 'true' if the specified 'lhs' and 'rhs' objects do not have the
    // same value, and 'false' otherwise.  The behavior is undefined unless
    // both iterators were returned by the same 'Tokenizer' object and the
    // underlying input has been neither modified nor destroyed since those
    // iterators were created.

const TokenizerIterator operator++(TokenizerIterator& object, int);
    // Advance the iteration state of the specified 'object' to refer to the
    // next token in the underlying input sequence, and return a copy of
    // 'object' as it was prior to advancing the iteration state.  The
    // behavior is undefined if the iteration state of 'object' is not
    // initially valid, or if the underlying input has been modified or
    // destroyed since 'object' was created.


                            // ===============
                            // class Tokenizer
                            // ===============

class Tokenizer {
    // This class provides (read-only) sequential access to tokens delimited by
    // two user-supplied character sets consisting, respectively, of *soft* and
    // *hard* delimiter characters.  Access to the previous and current
    // (trailing) delimiter, as well as to the current token itself, is
    // provided efficiently via 'bslstl::StringRef'.

    // DATA
    Tokenizer_Data  d_sharedData;  // delimiter/token character categories
    const char     *d_input_p;     // original input
    const char     *d_cursor_p;    // tail of parsed input
    const char     *d_prevDelim_p; // start of previous delimiter
    const char     *d_token_p;     // start of current token
    const char     *d_postDelim_p; // start of current (trailing) delimiter
    const char     *d_end_p;       // one past end of input; 0 for '(char *)'
    bool            d_endFlag;     // set 'true' when cursor at end of input

  private:
    // PRIVATE MANIPULATORS
    void resetImpl(const char *input, const char *endOfInput);
        // Rebind this object to refer to the specified sequence of 'input'
        // characters ending at the specified 'endOfInput' pointer.  The state
        // of the tokenizer following this call is *as* *if* it had been
        // constructed with 'input' and its current sets of *soft* and *hard*
        // delimiter characters.  Note that the behavior is undefined if this
        // object is used in any way (other than to reset or destroy it) after
        // its underlying 'input' string is modified.

  private:
    // NOT IMPLEMENTED (copying and member post-increment are disabled)
    Tokenizer(const Tokenizer&) BSLS_KEYWORD_DELETED;
    Tokenizer& operator=(const Tokenizer&) BSLS_KEYWORD_DELETED;
    Tokenizer& operator++(int) BSLS_KEYWORD_DELETED;

  public:
    // TYPES
    typedef TokenizerIterator iterator;

    // CREATORS
    Tokenizer(const char              *input, const bsl::string_view& soft);
    Tokenizer(const bsl::string_view&  input, const bsl::string_view& soft);
    Tokenizer(const char              *input,
              const bsl::string_view&  soft,
              const bsl::string_view&  hard);
    Tokenizer(const bsl::string_view&  input,
              const bsl::string_view&  soft,
              const bsl::string_view&  hard);
        // Create a 'Tokenizer' object bound to the specified sequence of
        // 'input' characters having the specified set of (unique) 'soft'
        // delimiter characters to be used to separate *tokens* (i.e., maximal
        // sequence of non-delimiter characters) in 'input'.  Optionally
        // specify a disjoint set of (unique) 'hard' delimiter characters to be
        // used to explicitly terminate tokens.  Delimiters within 'input'
        // consist of a maximal sequence of one or more delimiter characters,
        // at most one of which may be *hard*; when there is a contiguous
        // sequence of delimiter characters containing two or more *hard*
        // delimiter characters in 'input', any intervening *soft* delimiter
        // characters are associated with the previous (*hard*) delimiter.  Any
        // leading soft delimiter characters -- i.e., those preceding the first
        // *token* or *hard* delimiter character (referred to as the *leader*)
        // -- are available immediately after construction via the
        // 'previousDelimiter' method.  The behavior is undefined unless all
        // supplied delimiter characters are unique.  Note that the behavior is
        // also undefined if this object is used in any way (other than to
        // reset or destroy it) after its underlying 'input' string is
        // modified.  Also note that the current token and (trailing) delimiter
        // may be accessed only while this object is in the valid state;
        // however, the previous delimiter (or *leader*) is always accessible.
        // Also note that all token and delimiter strings are returned as
        // references into the underlying 'input' string, and hence remain
        // valid so long as that string is not modified or destroyed --
        // irrespective of the state (or even the existence) of this object.
        // Finally note that supplying a default constructed 'string_view' is
        // equivalent to supplying an empty c-string (i.e., "").

    ~Tokenizer();
        // Destroy this object.

    // MANIPULATORS
    Tokenizer& operator++();
        // Advance the iteration state of this object to refer to the next
        // sequence of previous delimiter, current token, and current
        // (trailing) delimiter in the underlying input sequence, and return a
        // reference providing modifiable access to this object.  The current
        // delimiter reference becomes the previous one.  If there is another
        // token remaining in the input, the current token and delimiter are
        // updated to refer to the respective new token and (trailing)
        // delimiter values -- either of which (but not both) might be empty.
        // If there are no tokens remaining in the input, the iteration state
        // of this object becomes invalid.  The behavior is undefined if the
        // iteration state of this object is not initially valid, or if the
        // underlying input has been modified or destroyed since this object
        // was most recently reset (or created).

    void reset(const char *input);
    void reset(const bsl::string_view& input);
        // Rebind this object to refer to the specified sequence of 'input'
        // characters.  The state of the tokenizer following this call is *as*
        // *if* it had been constructed with 'input' and its current sets of
        // *soft* and *hard* delimiter characters.  The behavior is
        // undefined if this object is used in any way (other than to reset or
        // destroy it) after its underlying 'input' string is modified.  Note
        // that supplying a default constructed 'string_view' is equivalent to
        // supplying an empty c-string (i.e., "").

    // ACCESSORS
    bool hasPreviousSoft() const;
        // Return 'true' if the previous delimiter (or *leader*) contains a
        // *soft* delimiter character, and 'false' otherwise.  The behavior is
        // undefined if the underlying input itself has been modified or
        // destroyed since this object was most recently reset (or created).

    bool hasTrailingSoft() const;
        // Return 'true' if the current (trailing) delimiter contains a *soft*
        // delimiter character, and 'false' otherwise.  The behavior is
        // undefined if the iteration state of this object is initially
        // invalid, or if the underlying input itself has been modified or
        // destroyed since this object was most recently reset (or created).

    bool isPreviousHard() const;
        // Return 'true' if the previous delimiter contains a *hard* delimiter
        // character, and 'false' otherwise.  The behavior is undefined if the
        // underlying input itself has been modified or destroyed since this
        // object was most recently reset (or created).

    bool isTrailingHard() const;
        // Return 'true' if the current (trailing) delimiter contains a *hard*
        // delimiter character, and 'false' otherwise.  The behavior is
        // undefined if the iteration state of this object is initially
        // invalid, or if the underlying input itself has been modified or
        // destroyed since this object was most recently reset (or created).

    bool isValid() const;
        // Return 'true' if the iteration state of this object is valid, and
        // 'false' otherwise.  Note that the behavior of advancing the
        // iteration state as well as accessing the current token or (trailing)
        // delimiter is undefined unless the current iteration state of this
        // object is valid.

    bslstl::StringRef previousDelimiter() const;
        // Return a reference to the non-modifiable previous delimiter (or
        // *leader*) in the input string.  The behavior is undefined if the
        // underlying input has been modified or destroyed since this object
        // was most recently reset (or created).

    bslstl::StringRef token() const;
        // Return a reference to the non-modifiable current token (i.e.,
        // maximal sequence of non-delimiter characters) in the input string.
        // The returned reference remains valid so long as the underlying input
        // is not modified or destroyed -- irrespective of the state (or
        // existence) of this object.  The behavior is undefined if the
        // iteration state of this object is not initially valid, or if the
        // underlying input has been modified or destroyed since this object
        // was most recently reset (or created).


    bslstl::StringRef trailingDelimiter() const;
        // Return a reference to the non-modifiable current (trailing)
        // delimiter (maximal sequence of one or more delimiter characters
        // containing at most one *hard* delimiter character) in the input
        // string.  The returned reference remains valid so long as the
        // underlying input is not modified or destroyed -- irrespective of the
        // state (or existence) of this object.  The behavior is undefined if
        // the iteration state of this object is not initially valid, or if
        // the underlying input has been modified or destroyed since this
        // object was most recently reset (or created).


                         // iterators
    iterator begin() const;
        // Return an iterator referring to the first token in this object's
        // input string (the past-the-end iterator if this object's iteration
        // state is initially invalid).  The returned iterator remains valid
        // as long as the underlying input has not been modified or destroyed
        // since this object was most recently reset (or created).

    iterator end() const;
        // Return an iterator referring to the position beyond the last token
        // in this object's input string.  The returned iterator remains valid
        // as long as the underlying input has not been modified or destroyed
        // since this object was most recently reset (or created).
};

// FREE OPERATORS
const Tokenizer operator++(Tokenizer& object, int);
    // NOTE(review): No definition of this operator appears among the inline
    // definitions in this header, and 'Tokenizer' declares its copy
    // constructor as deleted; presumably this declaration exists only to
    // prevent post-increment of a 'Tokenizer' -- confirm against the
    // implementation file.

// ============================================================================
//                               INLINE DEFINITIONS
// ============================================================================

                        // --------------------------
                        // class bdlb::Tokenizer_Data
                        // --------------------------
// ACCESSORS
inline
int Tokenizer_Data::inputType(char character) const
{
    // Convert through 'unsigned char' so that the table index is never
    // negative, even where plain 'char' is a signed type.
    const unsigned char tableIndex = static_cast<unsigned char>(character);

    return d_charTypes[tableIndex];
}

                        // ---------------------------
                        // class bdlb::Tokenizer_Proxy
                        // ---------------------------
// CREATORS
inline
bdlb::Tokenizer_Proxy::Tokenizer_Proxy(const bsl::string_view &obj)
: d_obj(obj)
{
    // Initialize the held 'd_obj' from the specified 'obj'; the address of
    // 'd_obj' is what 'operator->' subsequently returns.
}

// OPERATORS
inline
const bslstl::StringRef *bdlb::Tokenizer_Proxy::operator->() const
{
    // Hand back the address of the string reference held by this proxy so
    // that member access can be chained through it.
    const bslstl::StringRef *const heldRef = &d_obj;

    return heldRef;
}

                        // -----------------------------
                        // class bdlb::TokenizerIterator
                        // -----------------------------
// ACCESSORS
inline
const bslstl::StringRef TokenizerIterator::operator*() const
{
    // Dereferencing an iterator whose end flag is set is a contract
    // violation.
    BSLS_REVIEW(!d_endFlag);

    // The current token runs from 'd_token_p' up to (but not including)
    // 'd_postDelim_p'.
    const bslstl::StringRef currentToken(d_token_p, d_postDelim_p);

    return currentToken;
}

inline
Tokenizer_Proxy TokenizerIterator::operator->() const
{
    // Member access through an iterator whose end flag is set is a contract
    // violation.
    BSLS_REVIEW(!d_endFlag);

    // Wrap the current token (as produced by 'operator*') in a proxy so that
    // the caller can apply '->' to a by-value result.
    return Tokenizer_Proxy(**this);
}

                        // ---------------------
                        // class bdlb::Tokenizer
                        // ---------------------
// ACCESSORS
inline
bool Tokenizer::isValid() const
{
    // The iteration state is valid exactly when the end flag is not set.
    const bool atEnd = d_endFlag;

    return !atEnd;
}

inline
bslstl::StringRef Tokenizer::previousDelimiter() const
{
    // The previous delimiter (or leader) occupies the half-open range
    // '[d_prevDelim_p, d_token_p)' of the input.
    const char *const delimBegin = d_prevDelim_p;
    const char *const delimEnd   = d_token_p;

    return bslstl::StringRef(delimBegin, delimEnd);
}

inline
bslstl::StringRef Tokenizer::token() const
{
    // Accessing the token of a tokenizer whose end flag is set is a contract
    // violation.
    BSLS_REVIEW(!d_endFlag);

    // The current token occupies the half-open range
    // '[d_token_p, d_postDelim_p)' of the input.
    const char *const tokenBegin = d_token_p;
    const char *const tokenEnd   = d_postDelim_p;

    return bslstl::StringRef(tokenBegin, tokenEnd);
}

inline
bslstl::StringRef Tokenizer::trailingDelimiter() const
{
    // Accessing the trailing delimiter of a tokenizer whose end flag is set
    // is a contract violation.
    BSLS_REVIEW(!d_endFlag);

    // The trailing delimiter occupies the half-open range
    // '[d_postDelim_p, d_cursor_p)' of the input.
    const char *const delimBegin = d_postDelim_p;
    const char *const delimEnd   = d_cursor_p;

    return bslstl::StringRef(delimBegin, delimEnd);
}

}  // close package namespace

// FREE OPERATORS
inline
bool bdlb::operator==(const bdlb::TokenizerIterator& lhs,
                      const bdlb::TokenizerIterator& rhs)
{
    // If either iterator has its end flag set, the two compare equal only
    // when both do; token positions are not consulted in that case.
    if (lhs.d_endFlag || rhs.d_endFlag) {
        return lhs.d_endFlag && rhs.d_endFlag;                        // RETURN
    }

    // Neither end flag is set: equality is decided by whether both refer to
    // the same token start address.
    return rhs.d_token_p == lhs.d_token_p;
}

inline
bool bdlb::operator!=(const bdlb::TokenizerIterator& lhs,
                      const bdlb::TokenizerIterator& rhs)
{
    // Defined strictly as the negation of 'operator=='.
    const bool sameValue = (lhs == rhs);

    return !sameValue;
}

inline
const bdlb::TokenizerIterator bdlb::operator++(bdlb::TokenizerIterator& object,
                                               int)
{
    // Snapshot the iterator before advancing it so that the caller receives
    // the pre-increment value.
    bdlb::TokenizerIterator preIncrement(object);

    ++object;

    return preIncrement;
}

}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2015 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------