// bdlde_quotedprintabledecoder.h -*-C++-*- #ifndef INCLUDED_BDLDE_QUOTEDPRINTABLEDECODER #define INCLUDED_BDLDE_QUOTEDPRINTABLEDECODER #include <bsls_ident.h> BSLS_IDENT("$Id: $") //@PURPOSE: Provide automata converting to and from Quoted-Printable encodings. // //@CLASSES: // bdlde::QuotedPrintableDecoder: automata for Quoted-Printable decoding // //@SEE_ALSO: bdlde_quotedprintableencoder // //@DESCRIPTION: This component provides a template class (parameterized // separately on both input and output iterators) that can be used to decode // byte sequences of arbitrary length from the Quoted Printable representation // described in Section 6.7 "Quoted-Printable Content Transfer Encoding" of RFC // 2045, "Multipurpose Internet Mail Extensions (MIME) Part One: Format of // Internet Message Bodies." // // Each instance of the decoder retains the state of the conversion from one // supplied input to the next, enabling the processing of segmented input -- // i.e., processing resumes where it left off with the next invocation on new // input. Instance methods are provided for the decoder to (1) assert the end // of input, (2) determine whether the input so far is currently acceptable, // and (3) indicate whether a non-recoverable error has occurred. // ///Quoted-Printable Decoding ///------------------------- // (In the following, all rules mentioned refer to those listed in the encoder // section above.) // // The decoding process for this encoding scheme involves: // //: 1 transforming any encoded character triplets back into their original //: representation (rule #1 and rule #4). //: //: 2 literally writing out characters that have not been changed (rule #2). //: //: 3 deleting any trailing whitespace at the end of an encoded line (rule #3). // //: 4 removing the soft line breaks including the '=' prefix (i.e., //: concatenating broken sentences) (rule #5). // // The standard imposes a maximum of 76 characters exclusive of CRLF; however, // the decoder implemented in this component will handle lines of arbitrary // length. // // The decoder also provides support for 2 error-reporting modes: the strict // mode and the relaxed mode (configurable at construction). A strict-mode // decoder stops decoding at the first offending character encountered, while a // relaxed-mode decoder would continue decoding to the end of the input, // allowing straight pass-through of character sets that cannot be interpreted. // // The following kinds of errors can be encountered during decoding, listed in // order of decreasing order of precedence: //.. // E1. BAD_DATA //.. // An '=' character is not followed by either two uppercase hexadecimal digits, // or a soft line break -- e.g., //.. // '=4=' (only one hexadecimal) // '=K3' (K3 is not a hexadecimal number) // '=1f' (lower case f is a literally encoded character) //.. // // Note that: // //: 1 In the relaxed error-reporting mode of this implementation, lowercase //: hexadecimal digits are treated as valid numerals. //: //: 2 E1 can be caused by a missing or corrupted numeric, a corrupted character //: disguised as an '=', or an accidental insertion of a '=' that does not //: belong. //: //: 3 The case where a seemingly valid character is found in place of a missing //: numeric cannot be detected, e.g., '=4F' where 'F' is actually a literally //: encoded character. //: //: 4 An erroneous occurrence of a '=' character preceding 2 seemingly valid //: hexadecimal numerics is also undetectable, e.g., '=4F' where '=' was //: actually a 't' corrupted during transmission. //.. // E2. BAD_LINEBREAK //.. // A '\r' is not followed by a '\n'. In the relaxed mode, each stand-alone // '\r' or '\n' will be copied straight through to the output. For soft line // breaks, whitespace is ignored between the '=' character and the CRLF as they // are to be treated and removed as transport padding. //.. // E3. BAD_LINELENTH //.. // An encoded line exceeds the specified maximum line length with missing soft // line breaks. (Because input of flexible line lengths is allowed in this // implementation, this error is not detected or reported.) // // In the relaxed-mode, errors of the types E1 and E2 would be copied straight // to output and type E3 ignored. Decoded lines will be broken even when a // bare CRLF is encountered in this mode. Users can still be alerted to the // the unreported errors as offending characters are copied straight through to // the output stream, which can be observed. // // The 'isError' method is used to detect the above anomalies, while for the // 'convert' method, a 'numIn' output parameter (indicating the number of input // characters consumed) or possibly the iterator itself (for iterators with // reference-semantics) identifies the offending character. // ///Usage ///- - - // TBD #include <bdlscm_version.h> #include <bsl_cstring.h> #include <bsl_queue.h> #include <bsl_vector.h> namespace BloombergLP { namespace bdlde { class QuotedPrintableDecoder { // This class implements a mechanism capable of converting data of // arbitrary length from its corresponding Quoted-Printable representation. // PRIVATE TYPES enum { // Symbolic state values. e_ERROR_STATE = -1, // input is irreparably invalid e_INPUT_STATE = 0, // general input state e_SAW_EQUAL_STATE = 1, // need two hexadecimal values or CR LF e_SAW_WS_STATE = 2, // saw a whitespace e_NEED_HEX_STATE = 3, // need one hexadecimal value e_NEED_SOFT_LF_STATE = 4, // need soft new line e_NEED_HARD_LF_STATE = 5, // need soft new line e_DONE_STATE = 6 // any additional input is an error }; public: enum EquivalenceClasses { // This enumeration type enumerates the input equivalence classes. // Separate enums are given to variants resulting from different modes // of operation to eliminate an extra step of mode checking inside the // main decoding loop. // Regular character - copy straight to output e_RC_ = 0, // strict mode e_RC, // relaxed mode // Hexadecimal digit - numeral only when preceded by // '='; otherwise a regular character e_HX_, // strict mode e_HX, // relaxed mode // '=' - wait for more input e_EQ_, // strict mode e_EQ, // relaxed mode // Whitespace - buffer; wait for more input e_WS_, // strict mode e_WS, // relaxed mode // Carriage return e_CR_, // strict mode - wait for further input e_CR, // relaxed mode - wait for further input // Line Feed Strict mode // ------------ e_LC_, // CRLF_MODE - decode to "\r\n" if preceded by // '\r'; report error otherwise e_LL_, // LF_MODE - decode to '\n' if preceded by // '\r' report error otherwise Relaxed mode // ------------ e_LC, // CRLF_MODE - decode to "\r\n" if preceded by // '\r'; ignore otherwise e_LL, // LF_MODE - decode to "\n" if preceded by // '\r'; ignore otherwise // Unrecognized char - halt and report error e_UC_, // strict mode - Ignore and halt decoding e_UC // relaxed mode - Ignore but continue decoding #ifndef BDE_OMIT_INTERNAL_DEPRECATED , BDEDE_RC_ = e_RC_ , BDEDE_RC = e_RC , BDEDE_HX_ = e_HX_ , BDEDE_HX = e_HX , BDEDE_EQ_ = e_EQ_ , BDEDE_EQ = e_EQ , BDEDE_WS_ = e_WS_ , BDEDE_WS = e_WS , BDEDE_CR_ = e_CR_ , BDEDE_CR = e_CR , BDEDE_LC_ = e_LC_ , BDEDE_LL_ = e_LL_ , BDEDE_LC = e_LC , BDEDE_LL = e_LL , BDEDE_UC_ = e_UC_ , BDEDE_UC = e_UC #endif // BDE_OMIT_INTERNAL_DEPRECATED }; enum LineBreakMode { // Configuration governing how line breaks are decoded. e_CRLF_MODE, // "\r\n" are decoded to "\r\n". e_LF_MODE // "\r\n" are decoded to "\n". #ifndef BDE_OMIT_INTERNAL_DEPRECATED , BDEDE_CRLF_MODE = e_CRLF_MODE , BDEDE_LF_MODE = e_LF_MODE #endif // BDE_OMIT_INTERNAL_DEPRECATED }; // CLASS DATA static const char s_componentName[]; // Name of component used when reporting errors. static const bool s_defaultUnrecognizedIsErrorFlag; // Default error reporting mode static const char *s_defaultEquivClassStrict_p; // Default map of 'unsigned char' to equivalence class for strict mode static const char *s_defaultEquivClassCRLF_p; // Default map of 'unsigned char' to equivalence class for CRLF line // break mode static const unsigned char *const s_decodingMap_p; // Character map used for converting an ASCII character to the // hexadecimal value it is representing. static const int s_defaultMaxLineLength; // Default maximum line length static const char* s_lineBreakModeName[]; // Names of line break mode // INSTANCE DATA bool d_unrecognizedIsErrorFlag; // If true, fail on "bad" characters LineBreakMode d_lineBreakMode; // Line break mode int d_state; // TBD doc char d_buffer[90]; // TBD doc int d_bufferLength; // TBD doc char d_hexBuffer; // TBD doc int d_outputLength; // Total number of output characters char *d_equivClass_p; // Map of 'unsigned char' to input equivalence class; // dynamically allocated because there is no default // complete configuration. private: // NOT IMPLEMENTED QuotedPrintableDecoder(const QuotedPrintableDecoder&); QuotedPrintableDecoder& operator=(const QuotedPrintableDecoder&); public: // CLASS METHODS static const char* lineBreakModeToAscii(LineBreakMode mode); // Return the ASCII string describing the specified 'mode' governing // the decoding of hard linebreaks ("\r\n"). The behavior is undefined // unless 'mode' is either e_CRLF_MODE or e_LF_MODE. // CREATORS explicit QuotedPrintableDecoder( bool detectError, QuotedPrintableDecoder::LineBreakMode lineBreakMode = QuotedPrintableDecoder::e_CRLF_MODE); // Create a Quoted-Printable decoder in the initial state, set to the // strict or relaxed error-reporting mode according to whether the // specified 'detectError' flag is 'true' or 'false', respectively, and // also configured to the specified 'lineBreakMode'. The behavior is // undefined unless 'lineBreakMode' is either e_CRLF_MODE or // e_LF_MODE. Note that the decoder reports errors in the strict // mode and output offending characters in the relaxed mode. Hard line // breaks ("\r\n") are decoded to "\r\n" in e_CRLF_MODE (default) // and to '\n' in e_LF_MODE. ~QuotedPrintableDecoder(); // Destroy this object. // MANIPULATORS int convert(char *out, int *numOut, int *numIn, const char *begin, const char *end, int maxNumOut = -1); // Append to the buffer addressed by the specified 'out' all pending // output (if there is any) up to the optionally specified 'maxNumOut' // limit (default is negative, meaning no limit) and, when there is no // pending output and 'maxNumOut' is still not reached, begin to // consume and decode a sequence of input characters starting at the // specified 'begin' position, up to but not including the specified // 'end' position, writing any resulting output in the specified // 'output' buffer up to the (cumulative) 'maxNumOut' limit. If // 'maxNumOut' limit is reached, no further input will be consumed. // Load into the specified 'numOut' and 'numIn' the number of output // bytes produced and input bytes consumed, respectively. Return a // non-negative value on success and a negative value otherwise. A // successful return status indicates the number of characters that // would be output if 'endConvert' were called with no output limit // immediately upon exit from this method. These bytes are also // available for output if this method is called with a sufficiently // large 'maxNumOut'. Note that calling this method after 'endConvert' // has been invoked without an intervening 'reset' call will place this // instance in an error state, and return an error status. Note also // that it is recommended that after all calls to 'convert' are // finished, the 'endConvert' method be called to complete the decoding // of any unprocessed input characters (e.g., whitespace). int endConvert(char *out, int *numOut, int maxNumOut = -1); // Terminate encoding for this decoder; write any retained output // (e.g., from a previous call to 'convert' with a non-zero 'maxNumOut' // argument) to the specified 'out' buffer. Optionally specify the // 'maxNumOut' limit on the number of bytes to output; if 'maxNumOut' // is negative, no limit is imposed. Load into the specified 'numOut' // the number of output bytes produced. Return 0 on success with no // pending output, the positive number of bytes (if any) that would be // output if 'endConvert' were called with no output limit immediately // upon exit from this method, and a negative value otherwise. Any // retained bytes are available on a subsequent call to 'endConvert'. // Once this method is called, no additional input may be supplied // without an intervening call to 'reset'; once this method returns a // zero status, a subsequent call will place this decoder in the error // state, and return an error status. void reset(); // Reset this decoder to its initial state (i.e., as if no input had // been consumed). // ACCESSORS bool isAccepting() const; // Return 'true' if the input read so far by this decoder is considered // syntactically complete and all resulting output has been emitted; // return 'false' otherwise. Note that there must not be any // unprocessed characters accumulated in the input buffer of this // decoder. bool isDone() const; // Return 'true' if this decoder is in the done state (i.e., // 'endConvert' has been called and any additional input will result in // an error), and if there is no pending output; return 'false' // otherwise. bool isError() const; // Return 'true' if this decoder has encountered an irrecoverable error // and 'false' otherwise. An irrecoverable error is one for which // there is no subsequent possibility of achieving an "acceptable" // result (as defined by the 'isAccepting' method). bool isInitialState() const; // Return 'true' if this decoder is in the initial state (i.e., as if // no input had been consumed) and 'false' otherwise. bool isMaximal() const; // Return 'true' if the input to this decoder is maximal (i.e., the // input contains an end-of-input sentinel, signaling that no further // input should be expected). *Always* returns 'false' for // Quoted-Printable decoders since the encoding scheme does not specify // an end-of-input sentinel. bool isUnrecognizedAnError() const; // Return 'true' if this decoder is currently configured to detect an // error when an unrecognizable encoding is encountered, and 'false' // otherwise. LineBreakMode lineBreakMode() const; // Return the line break mode specified for this decoder. int numOutputPending() const; // Return the number of output bytes retained by this decoder and not // emitted because 'maxNumOut' has been reached. int outputLength() const; // Return the total length of the output emitted by this decoder // (possibly after several calls to the 'convert' or the 'input' // methods) since its initial construction or the latest 'reset'. }; // ============================================================================ // INLINE DEFINITIONS // ============================================================================ // CLASS METHODS inline const char* QuotedPrintableDecoder::lineBreakModeToAscii( LineBreakMode mode) { return s_lineBreakModeName[mode]; } // CREATORS inline QuotedPrintableDecoder::QuotedPrintableDecoder( bool unrecognizedIsErrorFlag, QuotedPrintableDecoder::LineBreakMode lineBreakMode) : d_unrecognizedIsErrorFlag(unrecognizedIsErrorFlag) , d_lineBreakMode(lineBreakMode) , d_state(e_INPUT_STATE) , d_bufferLength(0) , d_outputLength(0) { if (unrecognizedIsErrorFlag) { // Strict mode d_equivClass_p = const_cast<char*>(s_defaultEquivClassStrict_p); } else { if (lineBreakMode == e_CRLF_MODE) { d_equivClass_p = const_cast<char*>(s_defaultEquivClassCRLF_p); } else { // First copy the map of equivalence classes for the // e_CRLF_MODE to the strict error-report mode. int len = sizeof(*s_defaultEquivClassCRLF_p) * 256; d_equivClass_p = new char[len]; bsl::memcpy(d_equivClass_p, s_defaultEquivClassCRLF_p, len); d_equivClass_p['\n'] = e_LL; // output '\n' instead if preceded // by '='. } } } // MANIPULATORS inline void QuotedPrintableDecoder::reset() { d_state = e_INPUT_STATE; d_outputLength = 0; d_bufferLength = 0; } // ACCESSORS inline bool QuotedPrintableDecoder::isAccepting() const { return e_INPUT_STATE == d_state || e_DONE_STATE == d_state; } inline bool QuotedPrintableDecoder::isDone() const { return e_DONE_STATE == d_state && 0 == d_bufferLength; } inline bool QuotedPrintableDecoder::isError() const { return e_ERROR_STATE == d_state; } inline bool QuotedPrintableDecoder::isInitialState() const { return e_INPUT_STATE == d_state && 0 == d_outputLength; } inline bool QuotedPrintableDecoder::isMaximal() const { return false; } inline bool QuotedPrintableDecoder::isUnrecognizedAnError() const { return d_unrecognizedIsErrorFlag; } inline QuotedPrintableDecoder::LineBreakMode QuotedPrintableDecoder::lineBreakMode() const { return d_lineBreakMode; } inline int QuotedPrintableDecoder::numOutputPending() const { return d_bufferLength; } inline int QuotedPrintableDecoder::outputLength() const { return d_outputLength; } } // close package namespace } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Copyright 2015 Bloomberg Finance L.P. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------