// bdljsn_stringutil.h                                                -*-C++-*-
#ifndef INCLUDED_BDLJSN_STRINGUTIL
#define INCLUDED_BDLJSN_STRINGUTIL

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide utility functions for JSON strings.
//
//@CLASSES:
//  bdljsn::StringUtil: namespace for utility functions on JSON strings
//
//@DESCRIPTION: This component defines a utility 'struct',
// 'bdljsn::StringUtil', that is a namespace for functions that convert
// arbitrary UTF-8 codepoint sequences to JSON strings and vice versa.  The
// rules for these conversions are outlined below in {JSON Strings} and
// detailed in: https://www.rfc-editor.org/rfc/rfc8259#section-7 (RFC8259)
//
// This utility provides two key functions:
//
//: o 'writeString': Given an arbitrary UTF-8 codepoint sequence, generate a
//:   JSON string representing the same codepoints.
//:
//: o 'readString': Given a JSON string (e.g., the output of 'writeString'),
//:   generate the equivalent sequence of UTF-8 code points.
//
// When using these functions, a UTF-8 codepoint sequence is always preserved
// on the round trip to JSON string and back; however, since there are
// equivalent allowed representations of a JSON string, the converse is not
// guaranteed.
//
///JSON Strings
///------------
// JSON strings consist of UTF-8 codepoints surrounded by double quotes (i.e.,
// '\"').  Within those double quotes certain characters *must* be escaped
// (i.e., replaced with some alternative, multi-byte representation).  Those
// characters are:
//
//: o quotation marks
//: o backslashes (a.k.a., a "reverse solidus")
//: o the "control characters" in the range 'U+0000' to 'U+001F' (inclusive).
//
// Each of the above characters can be escaped by replacing it with the six
// byte sequence consisting of:
//
//:   o a backslash,
//:   o a lower-case 'u', and
//:   o the Unicode value expressed as four hexadecimal digits.
//
// For example, the character that rings the console bell is represented as
// '\u0007'.  Note that the hexadecimal digits can use upper or lower case
// letters but the lead 'u' character must be lower case.  See {Strictness}.
//
// Eight of the characters that must be escaped can be alternatively
// represented by special, 2-byte sequences:
//..
//  +---------+-----------------+---------------+---------------+
//  | Unicode | Description     | 6-byte escape | 2-byte escape |
//  +---------+-----------------+---------------+---------------+
//  | U+0022  | quotation mark  | \u0022        |  \"           |
//  | U+005C  | backslash       | \u005c        |  \\           |
//  | U+002F  | slash           | \u002f        |  \/           |
//  | U+0008  | backspace       | \u0008        |  \b           |
//  | U+000C  | form feed       | \u000C        |  \f           |
//  | U+000A  | line feed       | \u000A        |  \n           |
//  | U+000D  | carriage return | \u000D        |  \r           |
//  | U+0009  | tab             | \u0009        |  \t           |
//  +---------+-----------------+---------------+---------------+
//..
// Note that the above set is similar to but not identical to the set of two
// byte 'char' literals supported by C++.  For example, '\0' (null) and '\a'
// (bell) are not included above.
//
///Guarantees: Arbitrary UTF-8 to JSON String
/// - - - - - - - - - - - - - - - - - - - - -
//: o No UTF-8 characters in the *Basic* *Multilingual* *Plane* are escaped
//:   unless they are in the set that *must* be escaped.
//:
//: o When a character must be escaped, the 6-byte (hexadecimal) representation
//:   is used only if no 2-byte escape exists.
//:
//: o When a 6-byte (hexadecimal) representation is used, hexadecimal letters
//:   are in upper case.
//:
//: o All UTF-8 characters outside of the *Basic* *Multilingual* *Plane*
//:   are represented by two, adjacent 6-byte hexadecimal escape
//:   sequences.  For details, see:
//:   https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
//
///Strictness
///----------
// By default, the 'bdljsn::StringUtil' read and write methods strictly follow
// the RFC8259 standard.  Variances from those rules are expressed using
// 'bdljsn::StringUtil::Flags', an 'enum' of flag values that can be set in the
// optional 'flags' parameter of the decoding methods.  Multiple flags can be
// bitwise set in 'flags'; however, currently, just one variance flag is
// defined.
//
///Example Variance
/// - - - - - - - -
// RFC8259 specifies that the 6-byte Unicode escape sequence start with a
// backslash, '\', and lower-case 'u'.  However, if the
// 'bdljsn::StringUtil::e_ACCEPT_CAPITAL_UNICODE_ESCAPE' flag is set, an
// upper-case 'U' is accepted as well.  Thus, both '\u0007' and '\U0007' would
// be interpreted as the BELL character.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Encoding and Decoding a JSON String
/// - - - - - - - - - - - - - - - - - - - - - - -
// First, we initialize a string with a valid sequence of UTF-8 codepoints.
//..
//  bsl::string initial("Does the name \"Ivan Pavlov\" ring a bell\a?\n");
//  assert(bdlde::Utf8Util::isValid(initial));
//..
// Notice that, as required by C++ syntax, several characters are represented
// by their two-character escape sequence: double quote (twice), bell, and
// newline.
//
// Then, we examine the string as output:
//..
//  bsl::cout << initial << bsl::endl;
//..
// and observe:
//..
//  Does the name "Ivan Pavlov" ring a bell?
//
//..
// Notice that the backslash characters (having served their purpose of giving
// special meaning to the subsequent character) are not shown.  The BELL and
// NEWLINE characters are output but are not visible.
//
// Now, we generate the JSON string equivalent of the 'initial' string.
//..
//  bsl::ostringstream oss;
//
//  int rcEncode = bdljsn::StringUtil::writeString(oss, initial);
//  assert(0 == rcEncode);
//
//  bsl::string  jsonCompatibleString = oss.str();
//  bsl::cout << jsonCompatibleString << bsl::endl;
//..
// and observe how the 'initial' string is represented for JSON:
//..
//  "Does the name \"Ivan Pavlov\" ring a bell\u0007?\n"
//..
// Notice that:
//: o The entire string is delimited by double quotes.
//: o The interior double quotes and new line are represented by two character
//:   escape sequences (as they were in the C++ string literal).
//: o Since JSON does not have a two character escape sequence for the BELL
//:   character, '\u0007', the 6-byte Unicode representation is used.
//
// Finally, we convert the 'jsonCompatibleString' back to its original content:
//..
//  bsl::string fromJsonString;
//  const int   rcDecode = bdljsn::StringUtil::readString(
//                                                       &fromJsonString,
//                                                       jsonCompatibleString);
//  assert(0       == rcDecode);
//  assert(initial == fromJsonString);
//
//  bsl::cout << fromJsonString << bsl::endl;
//..
// and observe (again):
//..
//  Does the name "Ivan Pavlov" ring a bell?
//
//..

#include <bdlscm_version.h>

#include <bsls_assert.h>

#include <bsl_ostream.h>
#include <bsl_string.h>
#include <bsl_string_view.h>

namespace BloombergLP {
namespace bdljsn {

                             // =================
                             // struct StringUtil
                             // =================

struct StringUtil {
    // This 'struct' provides a namespace for utility functions that convert
    // arbitrary UTF-8 sequences into JSON strings and vice versa.  See {JSON
    // Strings} in {DESCRIPTION} for details of these transformations.

  public:
    // TYPES
    enum Flags {
        // Flag values that can be bitwise set in the optional 'flags'
        // parameter of the decoding methods to request variances from the
        // RFC8259 rules (see {Strictness}).
        e_NONE                          = 0
            // no variance requested; the default 'flags' value of the
            // decoding methods
      , e_ACCEPT_CAPITAL_UNICODE_ESCAPE = 1 << 0
            // accept an upper-case 'U' as well as a lower-case 'u' as the
            // lead character of a 6-byte Unicode escape (see {Example
            // Variance})
    };

    // CLASS METHODS
    static int readString(bsl::string             *value,
                          const bsl::string_view&  string,
                          int                      flags = e_NONE);
        // Load to the specified 'value' the UTF-8 codepoint sequence
        // equivalent to the specified (JSON) 'string' (see {JSON Strings}).
        // Return 0 on success and a non-zero value otherwise.  Optionally
        // specify 'flags' to request variances from certain rules of JSON
        // decoding (see {Strictness}).  Note that 'string' is expected to
        // include its delimiting double quote characters, so a 'string'
        // having fewer than two characters is an error; the contents between
        // the delimiters are decoded as if by 'readUnquotedString'.

    static int readUnquotedString(bsl::string             *value,
                                  const bsl::string_view&  string,
                                  int                      flags = e_NONE);
        // Load to the specified 'value' the UTF-8 codepoint sequence
        // equivalent to the specified 'string', that is JSON-compliant absent
        // the leading and trailing double quote characters (see {JSON
        // Strings}).  Return 0 on success and a non-zero value otherwise.
        // Optionally specify 'flags' to request variances from certain rules
        // of JSON decoding (see {Strictness}).

    static int writeString(bsl::ostream&           stream,
                           const bsl::string_view& string);
        // Write to the specified 'stream' a JSON-compliant string that is
        // equivalent to the specified 'string', an arbitrary UTF-8 codepoint
        // sequence.  Return 0 on success and a non-zero value otherwise.  The
        // operation fails if 'string' is not a sequence of UTF-8 codepoints or
        // if there is an error writing to 'stream'.  See {Guarantees:
        // Arbitrary UTF-8 to JSON String} for further details.
};

// ============================================================================
//                             INLINE DEFINITIONS
// ============================================================================

                             // -----------------
                             // struct StringUtil
                             // -----------------

// CLASS METHODS
inline
int StringUtil::readString(bsl::string             *value,
                           const bsl::string_view&  string,
                           int                      flags)
{
    // Load to the specified 'value' the decoded contents of the specified
    // double-quote-delimited JSON 'string', forwarding the specified 'flags'
    // to 'readUnquotedString'.  Return 0 on success, and a non-zero value if
    // 'string' is not delimited by double quotes or if decoding its contents
    // fails.

    BSLS_ASSERT(value);

    // A quoted JSON string holds, at minimum, the opening and closing double
    // quote characters.

    if (2 > string.size()) {
        return -1;                                                    // RETURN
    }

    // *Both* delimiters are required.  Reject the input if *either* end is
    // not a double quote; otherwise an input such as '"abc' would be accepted
    // and its final character silently discarded below.

    if ('"' != string[0] || '"' != string[string.size() - 1]) {
        return -1;                                                    // RETURN
    }

    // Strip the two delimiters and decode what lies between them.

    const bsl::string_view contents = string.substr(1, string.size() - 2);
    return readUnquotedString(value, contents, flags);
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2022 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------