// bdljsn_tokenizer.h                                                 -*-C++-*-
#ifndef INCLUDED_BDLJSN_TOKENIZER
#define INCLUDED_BDLJSN_TOKENIZER

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide a tokenizer for extracting JSON data from a 'streambuf'.
//
//@CLASSES:
//  bdljsn::Tokenizer: tokenizer for parsing JSON data from a 'streambuf'
//
//@SEE_ALSO: baljsn_decoder
//
//@DESCRIPTION: This component provides a class, 'bdljsn::Tokenizer', that
// traverses data stored in a 'bsl::streambuf' one node at a time and provides
// clients access to the data associated with that node, including its type and
// data value.  Client code can use the 'reset' function to associate a
// 'bsl::streambuf' containing JSON data with a tokenizer object and then call
// the 'advanceToNextToken' function to extract individual data values.
//
// This 'class' was created to be used by other components in the 'bdljsn' and
// 'baljsn' packages and in most cases clients should use the
// 'bdljsn_jsonutil', 'baljsn_decoder', or 'bdljsn_datumutil' components
// instead of using this 'class'.
//
// On malformed JSON, tokenization may fail before the end of input is
// reached, but not all such errors are detected.  In particular, this
// tokenizer does not verify that closing brackets and braces match their
// opening counterparts, so callers should perform that check themselves.
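//
// For instance, a caller can maintain its own stack of open brackets while
// advancing through the tokens and reject input whose brackets do not
// balance.  The following is a minimal sketch of such a check (the helper
// name 'hasBalancedBrackets' is hypothetical, and a production version would
// also consult 'readStatus' to distinguish end of input from a read error):
//..
//  bool hasBalancedBrackets(bdljsn::Tokenizer *tokenizer)
//  {
//      bsl::vector<bdljsn::Tokenizer::TokenType> expectedClosers;
//
//      while (0 == tokenizer->advanceToNextToken()) {
//          switch (tokenizer->tokenType()) {
//            case bdljsn::Tokenizer::e_START_OBJECT: {
//              expectedClosers.push_back(bdljsn::Tokenizer::e_END_OBJECT);
//            } break;
//            case bdljsn::Tokenizer::e_START_ARRAY: {
//              expectedClosers.push_back(bdljsn::Tokenizer::e_END_ARRAY);
//            } break;
//            case bdljsn::Tokenizer::e_END_OBJECT:
//            case bdljsn::Tokenizer::e_END_ARRAY: {
//              if (expectedClosers.empty()
//               || expectedClosers.back() != tokenizer->tokenType()) {
//                  return false;                                     // RETURN
//              }
//              expectedClosers.pop_back();
//            } break;
//            default: {
//            } break;
//          }
//      }
//      return expectedClosers.empty();
//  }
//..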
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Extracting JSON Data into an Object
///----------------------------------------------
// For this example, we will use 'bdljsn::Tokenizer' to read each node in a
// JSON document and populate a simple 'Address' object.
//
// First, we will define the JSON data that the tokenizer will traverse:
//..
//  const char *INPUT = "    {\n"
//                      "        \"street\" : \"Lexington Ave\",\n"
//                      "        \"state\" : \"New York\",\n"
//                      "        \"zipcode\" : \"10022-1331\",\n"
//                      "        \"floorCount\" : 55\n"
//                      "    }";
//..
// Next, we will populate a 'streambuf' with this data:
//..
//  bdlsb::FixedMemInStreamBuf isb(INPUT, bsl::strlen(INPUT));
//..
// Then, we will create a 'bdljsn::Tokenizer' object and associate the above
// streambuf with it:
//..
//  bdljsn::Tokenizer tokenizer;
//  tokenizer.reset(&isb);
//..
// Next, we will create an address record type and an object of that type:
//..
//  struct Address {
//      bsl::string d_street;
//      bsl::string d_state;
//      bsl::string d_zipcode;
//      int         d_floorCount;
//  } address = { "", "", "", 0 };
//..
// Then, we will traverse the JSON data one node at a time:
//..
//  // Read '{'
//
//  int rc = tokenizer.advanceToNextToken();
//  assert(!rc);
//
//  bdljsn::Tokenizer::TokenType token = tokenizer.tokenType();
//  assert(bdljsn::Tokenizer::e_START_OBJECT == token);
//
//  rc = tokenizer.advanceToNextToken();
//  assert(!rc);
//  token = tokenizer.tokenType();
//
//  // Continue reading elements until '}' is encountered
//
//  while (bdljsn::Tokenizer::e_END_OBJECT != token) {
//      assert(bdljsn::Tokenizer::e_ELEMENT_NAME == token);
//
//      // Read element name
//
//      bsl::string_view nodeValue;
//      rc = tokenizer.value(&nodeValue);
//      assert(!rc);
//
//      bsl::string elementName(nodeValue);
//
//      // Read element value
//
//      rc = tokenizer.advanceToNextToken();
//      assert(!rc);
//
//      token = tokenizer.tokenType();
//      assert(bdljsn::Tokenizer::e_ELEMENT_VALUE == token);
//
//      rc = tokenizer.value(&nodeValue);
//      assert(!rc);
//
//      // Extract the data into the appropriate 'address' field
//
//      if (elementName == "street") {
//          rc = bdljsn::StringUtil::readString(&address.d_street, nodeValue);
//          assert(!rc);
//      }
//      else if (elementName == "state") {
//          rc = bdljsn::StringUtil::readString(&address.d_state, nodeValue);
//          assert(!rc);
//      }
//      else if (elementName == "zipcode") {
//          rc = bdljsn::StringUtil::readString(&address.d_zipcode, nodeValue);
//          assert(!rc);
//      }
//      else if (elementName == "floorCount") {
//          rc = bdljsn::NumberUtil::asInt(&address.d_floorCount, nodeValue);
//          assert(!rc);
//      }
//
//      rc = tokenizer.advanceToNextToken();
//      assert(!rc);
//      token = tokenizer.tokenType();
//  }
//..
// Finally, we will verify that the 'address' aggregate has the correct values:
//..
//  assert("Lexington Ave" == address.d_street);
//  assert("New York"      == address.d_state);
//  assert("10022-1331"    == address.d_zipcode);
//  assert(55              == address.d_floorCount);
//..
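//
// Note that a tokenizer's behavior can be adjusted via its option setters
// before (or between) parses.  For instance, to have string literal tokens
// validated as UTF-8, disable the 'allowNonUtf8StringLiterals' option (which
// defaults to 'true') before supplying a 'streambuf':
//..
//  bdljsn::Tokenizer strictTokenizer;
//  strictTokenizer.setAllowNonUtf8StringLiterals(false);
//  strictTokenizer.reset(&isb);  // here, 'isb' would be a fresh 'streambuf'
//..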

#include <bdlscm_version.h>

#include <bdlma_bufferedsequentialallocator.h>

#include <bsls_alignedbuffer.h>
#include <bsls_assert.h>
#include <bsls_types.h>

#include <bsl_ios.h>
#include <bsl_streambuf.h>
#include <bsl_string.h>
#include <bsl_string_view.h>
#include <bsl_vector.h>

namespace BloombergLP {
namespace bdljsn {

                              // ===============
                              // class Tokenizer
                              // ===============

class Tokenizer {
    // This 'class' provides a mechanism for traversing JSON data stored in a
    // 'bsl::streambuf' one node at a time and allows clients to access the
    // data associated with that node, including its type and data value.

  public:
    // TYPES
    typedef bsls::Types::IntPtr IntPtr;
    typedef bsls::Types::Uint64 Uint64;

    enum TokenType {
        // This 'enum' lists all the possible token types.

        e_BEGIN = 1,      // starting token
        e_ELEMENT_NAME,   // element name
        e_START_OBJECT,   // start of an object ('{')
        e_END_OBJECT,     // end of an object   ('}')
        e_START_ARRAY,    // start of an array  ('[')
        e_END_ARRAY,      // end of an array    (']')
        e_ELEMENT_VALUE,  // element value of a simple type
        e_ERROR           // error token
#ifndef BDE_OMIT_INTERNAL_DEPRECATED
        ,
        BAEJSN_BEGIN         = e_BEGIN,
        BAEJSN_ELEMENT_NAME  = e_ELEMENT_NAME,
        BAEJSN_START_OBJECT  = e_START_OBJECT,
        BAEJSN_END_OBJECT    = e_END_OBJECT,
        BAEJSN_START_ARRAY   = e_START_ARRAY,
        BAEJSN_END_ARRAY     = e_END_ARRAY,
        BAEJSN_ELEMENT_VALUE = e_ELEMENT_VALUE,
        BAEJSN_ERROR         = e_ERROR
#endif  // BDE_OMIT_INTERNAL_DEPRECATED
    };

    enum { k_EOF = +1 };
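        // 'k_EOF' is the value returned by 'readStatus' when the end of
        // input is reached before any data could be read.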

  private:
    // PRIVATE TYPES
    enum ContextType {
        // This 'enum' lists the possible contexts that the tokenizer can be
        // in.

        e_NO_CONTEXT,                 // context stack is empty
        e_OBJECT_CONTEXT,             // object context
        e_ARRAY_CONTEXT               // array context
    };

    // One intermediate data buffer used for reading data from the stream, and
    // another for the context state stack.

    enum {
        k_BUFSIZE             = 1024 * 8,
        k_MAX_STRING_SIZE     = k_BUFSIZE - 1,

        k_CONTEXTSTACKBUFSIZE = 256
    };

    // DATA
    bsls::AlignedBuffer<k_BUFSIZE>
                        d_buffer;           // string buffer

    bsls::AlignedBuffer<k_CONTEXTSTACKBUFSIZE>
                        d_stackBuffer;      // context stack buffer

    bdlma::BufferedSequentialAllocator
                        d_allocator;        // string allocator (owned)

    bdlma::BufferedSequentialAllocator
                        d_stackAllocator;   // context stack allocator (owned)

    bsl::string         d_stringBuffer;     // string buffer

    bsl::streambuf     *d_streambuf_p;      // streambuf (held, not owned)

    bsl::size_t         d_cursor;           // current cursor

    bsl::size_t         d_valueBegin;       // cursor for beginning of value

    bsl::size_t         d_valueEnd;         // cursor for end of value

    bsl::size_t         d_valueIter;        // cursor for iterating value

    Uint64              d_readOffset;       // the offset to the end of the
                                            // current 'd_stringBuffer'
                                            // relative to the start of the
                                            // streambuf

    TokenType           d_tokenType;        // token type

    bsl::vector<char>   d_contextStack;     // context type stack

    int                 d_readStatus;       // 0 until EOF or an error is
                                            // encountered, then indicates
                                            // nature of error.  Returned by
                                            // 'readStatus'

    int                 d_bufEndStatus;     // status of last read from
                                            // '*d_streambuf_p'.  If non-zero,
                                            // copied to 'd_readStatus' on next
                                            // read attempt.

    bool                d_allowStandAloneValues;
                                            // option for allowing stand alone
                                            // values

    bool                d_allowHeterogenousArrays;
                                            // option for allowing arrays of
                                            // heterogeneous values

    bool                d_allowNonUtf8StringLiterals;
                                            // option for disabling UTF-8
                                            // validation of string literals

    bool                d_allowTrailingTopLevelComma;
                                            // option for allowing a trailing
                                            // comma after the top-level JSON
                                            // element (e.g., '{},')

    // PRIVATE MANIPULATORS
    int expandBufferForLargeValue();
        // Increase the size of the string buffer, 'd_stringBuffer', and then
        // append additional characters from the internally-held 'streambuf'
        // ('d_streambuf_p') to the end of the current sequence of characters.
        // Return 0 on success and a non-zero value otherwise.

    int extractStringValue();
        // Extract the string value starting at the current data cursor and
        // update the value begin and end pointers to refer to the begin and
        // end of the extracted string.  Return 0 on success and a non-zero
        // value otherwise.

    int moveValueCharsToStartAndReloadBuffer();
        // Move the current sequence of characters being tokenized to the
        // front of the internal string buffer, 'd_stringBuffer', and then
        // append additional characters from the internally-held 'streambuf'
        // ('d_streambuf_p') to the end of that sequence, up to a maximum
        // sequence length of 'k_BUFSIZE' characters.  Return the number of
        // bytes read from the 'streambuf'.  Note that if 0 is returned, it
        // may mean end of file or, if UTF-8 checking is enabled, that invalid
        // UTF-8 was encountered, so the caller may need to check whether a
        // UTF-8 error was recorded to tell the difference.

    ContextType popContext();
        // If the 'd_contextStack' is empty, return 'e_NO_CONTEXT', otherwise
        // pop the top context from the 'd_contextStack' stack, and return it.

    void pushContext(ContextType context);
        // Push the specified 'context' onto the 'd_contextStack' stack.

    int reloadStringBuffer();
        // Reload the string buffer with new data read from the underlying
        // 'streambuf', overwriting the current buffer contents, and update
        // the cursor to the new read location.  Return the number of bytes
        // read from the 'streambuf'.

    int skipNonWhitespaceOrTillToken();
        // Skip all characters until a whitespace or a token character is
        // encountered and position the cursor onto the first such character.
        // Return 0 on success and a non-zero value otherwise.

    int skipWhitespace();
        // Skip all whitespace characters and position the cursor onto the
        // first non-whitespace character.  Return 0 on success and a non-zero
        // value otherwise.

    // PRIVATE ACCESSOR
    ContextType context() const;
        // If the 'd_contextStack' is empty, return 'e_NO_CONTEXT', otherwise
        // return the top context from the 'd_contextStack' stack without
        // popping.

  private:
    // NOT IMPLEMENTED
    Tokenizer(const Tokenizer&);
    Tokenizer& operator=(const Tokenizer&);

  public:
    // CREATORS
    explicit Tokenizer(bslma::Allocator *basicAllocator = 0);
        // Create a 'Tokenizer' object.  Optionally specify a 'basicAllocator'
        // used to supply memory.  If 'basicAllocator' is 0, the currently
        // installed default allocator is used.

    ~Tokenizer();
        // Destroy this object.

    // MANIPULATORS
    int advanceToNextToken();
        // Move to the next token in the data stream.  Return 0 on success
        // and a non-zero value otherwise.  Each call to 'advanceToNextToken'
        // invalidates the string references returned by the 'value' accessor
        // for prior nodes.  Note that on malformed JSON, this function may,
        // but will not always, return a non-zero value before the end of the
        // token stream is reached.

    void reset(bsl::streambuf *streambuf);
        // Reset this tokenizer to read data from the specified 'streambuf'.
        // Note that the tokenizer will not be positioned on a valid node
        // until 'advanceToNextToken' is called.  Also note that this function
        // does not change the values of the 'allowStandAloneValues',
        // 'allowHeterogenousArrays', 'allowNonUtf8StringLiterals', or
        // 'allowTrailingTopLevelComma' options.

    int resetStreamBufGetPointer();
        // Reset the get pointer of the 'streambuf' held by this object to
        // refer to the byte following the last processed byte, if the held
        // 'streambuf' supports seeking; otherwise return an error and leave
        // this object unchanged.  Return 0 on success, and a non-zero value
        // otherwise.  Note that after a successful call, users can read data
        // from the 'streambuf' that was specified during 'reset' starting
        // from where this object stopped processing.  Also note that this
        // call marks the end of processing for this object; no other method
        // should be invoked on this object until 'reset' is called with a
        // new 'streambuf'.

    void setAllowHeterogenousArrays(bool value);
        // Set the 'allowHeterogenousArrays' option to the specified 'value'.
        // If the 'allowHeterogenousArrays' value is 'true' this tokenizer will
        // successfully tokenize heterogeneous values within an array.  If the
        // option's value is 'false' then the tokenizer will return an error
        // for arrays having heterogeneous values.  By default, the value of
        // the 'allowHeterogenousArrays' option is 'true'.

    void setAllowNonUtf8StringLiterals(bool value);
        // Set the 'allowNonUtf8StringLiterals' option to the specified
        // 'value'.  If the 'allowNonUtf8StringLiterals' value is 'false' this
        // tokenizer will check string literal tokens for invalid UTF-8, enter
        // an error mode if it encounters a string literal token that has any
        // content that is not UTF-8, and fail to advance to subsequent tokens
        // until 'reset' is called.  By default, the value of the
        // 'allowNonUtf8StringLiterals' option is 'true'.

    void setAllowStandAloneValues(bool value);
        // Set the 'allowStandAloneValues' option to the specified 'value'.  If
        // the 'allowStandAloneValues' value is 'true' this tokenizer will
        // successfully tokenize JSON values (strings and numbers).  If the
        // option's value is 'false' then the tokenizer will only tokenize
        // complete JSON documents (JSON objects and arrays) and return an
        // error for stand alone JSON values.  By default, the value of the
        // 'allowStandAloneValues' option is 'true'.

    void setAllowTrailingTopLevelComma(bool value);
        // Set the 'allowTrailingTopLevelComma' option to the specified
        // 'value'.  If the 'allowTrailingTopLevelComma' value is 'true' this
        // tokenizer will successfully tokenize JSON values where a comma
        // follows the top-level JSON element.  If the option's value is
        // 'false' then the tokenizer will reject documents with a trailing
        // comma, such as '{},'.  By default, the value of the
        // 'allowTrailingTopLevelComma' option is 'true' for backwards
        // compatibility.  Note that a document without any JSON elements is
        // invalid whether or not it contains commas.

    // ACCESSORS
    bool allowHeterogenousArrays() const;
        // Return the value of the 'allowHeterogenousArrays' option of this
        // tokenizer.

    bool allowNonUtf8StringLiterals() const;
        // Return the value of the 'allowNonUtf8StringLiterals' option of this
        // tokenizer.

    bool allowStandAloneValues() const;
        // Return the value of the 'allowStandAloneValues' option of this
        // tokenizer.

    bool allowTrailingTopLevelComma() const;
        // Return the value of the 'allowTrailingTopLevelComma' option of this
        // tokenizer.

    bsls::Types::Uint64 currentPosition() const;
        // Return the offset of the current octet being tokenized in the stream
        // supplied to 'reset' or, if an error occurred, the position at which
        // tokenization failed.  Note that this
        // operation is intended to provide additional information in the case
        // of an error.

    bsls::Types::Uint64 readOffset() const;
        // Return the last read position relative to when 'reset' was called.
        // Note that 'readOffset() >= currentPosition()' -- the 'readOffset' is
        // the offset of the last octet read from the stream supplied to
        // 'reset', and is at or beyond the current position being tokenized.

    int readStatus() const;
        // Return the status of the last call to 'reloadStringBuffer()':
        //: o 0 if 'reloadStringBuffer()' has not been called or if a token was
        //:   successfully read.
        //:
        //: o 'k_EOF' (which is positive) if no data could be read before
        //:   reaching EOF.
        //:
        //: o a negative value if the 'allowNonUtf8StringLiterals' option is
        //:   'false' and a UTF-8 error occurred.  The specific value returned
        //:   will be one of the enumerators of the
        //:   'bdlde::Utf8Util::ErrorStatus' 'enum' type indicating the nature
        //:   of the UTF-8 error.

    TokenType tokenType() const;
        // Return the token type of the current token.

    int value(bsl::string_view *data) const;
        // Load into the specified 'data' the value of the current token if
        // that token's type is 'e_ELEMENT_NAME' or 'e_ELEMENT_VALUE', and
        // leave 'data' unmodified otherwise.  Return 0 on success and a
        // non-zero value otherwise.  Note that the returned 'data' is only
        // valid until the next manipulator call on this object.
};

// ============================================================================
//                            INLINE DEFINITIONS
// ============================================================================

// PRIVATE MANIPULATORS
inline
Tokenizer::ContextType Tokenizer::popContext()
{
    ContextType ret = e_NO_CONTEXT;

    if (!d_contextStack.empty()) {
        ret = static_cast<ContextType>(d_contextStack.back());
        d_contextStack.pop_back();
    }

    return ret;
}

inline
void Tokenizer::pushContext(ContextType context)
{
    d_contextStack.push_back(static_cast<char>(context));
}

// PRIVATE ACCESSOR
inline
Tokenizer::ContextType Tokenizer::context() const
{
    return d_contextStack.empty()
               ? e_NO_CONTEXT
               : static_cast<ContextType>(d_contextStack.back());
}

// CREATORS
inline
Tokenizer::Tokenizer(bslma::Allocator *basicAllocator)
: d_allocator(d_buffer.buffer(), k_BUFSIZE, basicAllocator)
, d_stackAllocator(d_stackBuffer.buffer(),
                   k_CONTEXTSTACKBUFSIZE,
                   basicAllocator)
, d_stringBuffer(&d_allocator)
, d_streambuf_p(0)
, d_cursor(0)
, d_valueBegin(0)
, d_valueEnd(0)
, d_valueIter(0)
, d_readOffset(0)
, d_tokenType(e_BEGIN)
, d_contextStack(200, &d_stackAllocator)
, d_readStatus(0)
, d_bufEndStatus(0)
, d_allowStandAloneValues(true)
, d_allowHeterogenousArrays(true)
, d_allowNonUtf8StringLiterals(true)
, d_allowTrailingTopLevelComma(true)
{
    d_stringBuffer.reserve(k_MAX_STRING_SIZE);
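
    // Note: 'd_contextStack' was constructed above with 200 (arbitrary)
    // elements and is cleared here, presumably to pre-allocate its capacity
    // from 'd_stackBuffer' (via 'd_stackAllocator') so that typical nesting
    // depths require no further allocation; 'clear' does not release the
    // vector's capacity.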
    d_contextStack.clear();
    pushContext(e_NO_CONTEXT);
}

inline
Tokenizer::~Tokenizer()
{
}

// MANIPULATORS
inline
void Tokenizer::reset(bsl::streambuf *streambuf)
{
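    // Note that the 'allow*' option members are deliberately left unchanged,
    // per the contract of this function.
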
    d_streambuf_p  = streambuf;
    d_stringBuffer.clear();
    d_cursor       = 0;
    d_valueBegin   = 0;
    d_valueEnd     = 0;
    d_valueIter    = 0;
    d_readOffset   = 0;
    d_tokenType    = e_BEGIN;
    d_readStatus   = 0;
    d_bufEndStatus = 0;

    d_contextStack.clear();
    pushContext(e_NO_CONTEXT);
}

inline
void Tokenizer::setAllowHeterogenousArrays(bool value)
{
    d_allowHeterogenousArrays = value;
}

inline
void Tokenizer::setAllowNonUtf8StringLiterals(bool value)
{
    d_allowNonUtf8StringLiterals = value;
}

inline
void Tokenizer::setAllowStandAloneValues(bool value)
{
    d_allowStandAloneValues = value;
}

inline
void Tokenizer::setAllowTrailingTopLevelComma(bool value)
{
    d_allowTrailingTopLevelComma = value;
}

// ACCESSORS
inline
bool Tokenizer::allowHeterogenousArrays() const
{
    return d_allowHeterogenousArrays;
}

inline
bool Tokenizer::allowNonUtf8StringLiterals() const
{
    return d_allowNonUtf8StringLiterals;
}

inline
bool Tokenizer::allowStandAloneValues() const
{
    return d_allowStandAloneValues;
}

inline
bool Tokenizer::allowTrailingTopLevelComma() const
{
    return d_allowTrailingTopLevelComma;
}

inline
bsls::Types::Uint64 Tokenizer::currentPosition() const
{
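    // 'd_readOffset' is the stream offset just past the last octet read into
    // 'd_stringBuffer', so subtracting the buffer's size yields the stream
    // offset of the buffer's first octet, and adding 'd_cursor' yields the
    // offset of the octet currently being tokenized.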
    return d_readOffset - d_stringBuffer.size() + d_cursor;
}

inline
bsls::Types::Uint64 Tokenizer::readOffset() const
{
    return d_readOffset;
}

inline
int Tokenizer::readStatus() const
{
    return d_readStatus;
}

inline
Tokenizer::TokenType Tokenizer::tokenType() const
{
    return d_tokenType;
}

}  // close package namespace
}  // close enterprise namespace

#endif  // INCLUDED_BDLJSN_TOKENIZER

// ----------------------------------------------------------------------------
// Copyright 2022 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------