// bdlde_utf8checkinginstreambufwrapper.h -*-C++-*- #ifndef INCLUDED_BDLDE_UTF8CHECKINGINSTREAMBUFWRAPPER #define INCLUDED_BDLDE_UTF8CHECKINGINSTREAMBUFWRAPPER #include <bsls_ident.h> BSLS_IDENT("$Id: $") //@PURPOSE: Provide a stream buffer wrapper for validating UTF-8 input. // //@CLASSES: // bdlde::Utf8CheckingInStreamBufWrapper: wraps input streambuf, checks UTF-8 // //@SEE_ALSO: bsl_streambuf // //@DESCRIPTION: This component provides a mechanism, // 'bdlde::Utf8CheckingInStreamBufWrapper', that inherits from // 'bsl::streambuf', and that holds and wraps another 'streambuf'. It forwards // input through the held streambuf, checking for invalid UTF-8. The wrapping // object does not support output, only input. All normal input functions are // supported. If the held 'streambuf' supports seeking, seeks are supported, // though not forward seeks, and 'pubseekoff(0, bsl::ios_base::cur)' is // supported whether the wrapped 'streambuf' supports seeking or not. // // Input is buffered, the buffer cannot be changed -- 'pubsetbuf' is a no-op. // // The client is normally recommended to use this object by reading from it // until it behaves as though it has reached the end of input, and then call // 'errorStatus' to see if a UTF-8 error happened, and if so, then call // 'pubseekoff(0, bsl::ios_base::cur)' to find the position of the beginning of // the invalid UTF-8 code point. // ///Positioning at the Start ///------------------------ // When starting to read, the wrapped 'streambuf' must be positioned at the // beginning of a UTF-8 code point, or the end of data, otherwise, the wrapper // will interpret the first byte read as incorrect UTF-8. // ///Behavior of Reads ///----------------- // If incorrect UTF-8 exists in the data stream, reads will succeed until // reaching the start of the incorrect code point, after which reads will // behave as though the end of data were reached. All data returned by reads // will be valid UTF-8. Reads of limited length that end before the end of // data may return incomplete, truncated portions of valid UTF-8 code points. // In that case, following reads will return the remainder of the same valid // UTF-8 code point. // ///'errorStatus' ///------------- // The 'errorStatus' accessor is not a virtual function and is not inherited // from 'streambuf'. // // If invalid UTF-8 is encountered while reading, input will succeed right up // to the beginning of the invalid code point, at which point the object will // behave as though it has reached the end of data, with the object positioned // to exactly the start of the invalid code point. 'errorStatus' will reflect // the nature of the UTF-8 error. // // If a seek error occurs, 'errorStatus' will change to 'k_SEEK_FAIL' and // subsequent reads and relative seeks will fail, including // 'pubseekoff(0, bsl::ios_base::cur)'. A 'reset' or an absolute seek to the // start of data will reset 'errorStatus' to 0 and the object will recover to // being able to perform input and relative seeks. // // UTF-8 errors can be recovered from by calling 'reset' or by seeking at least // one byte backward. Note that 'pubseekoff(0, bsl::ios_base::cur)' after a // UTF-8 error will return the object's position without changing the error // state. Note that an absolute seek to the beginning of data will not recover // unless it amounts to a seek at least one byte backward. // // If input has reached invalid UTf-8, 'errorStatus()' will be negative, and // one of the values from 'bdlde::Utf8Util::ErrorStatus'. // // The class method 'toAscii' can be called to translate any value returned by // 'errorStatus()' to a human-readable string. // ///Seeking ///------- // The wrapped 'streambuf' must either support seeking or always return a // negative value when a seek attempt is made. // // Forward seeks and seeks relative to the end of data are not supported. // // If the wrapped 'streambuf' does not support seeking, // 'pubseekoff(0, bsl::ios_base::cur)' will still work on the wrapper and will // return the offset relative to the input position when the wrapper was bound // to the held 'streambuf', without changing the error state. // // Seeks can fail for a number of reasons (see 'seekoff'), and if that happens, // the object will enter a "failed seek state", having no valid position, and // will no longer be able to do input or do relative seeks until recovering by // either doing an absolute seek to 0 or by having 'reset' called. When the // object is in a failed seek state, 'errorStatus()' will equal 'k_SEEK_FAIL'. // ///Valid State ///----------- // If the object has been bound via 'reset' to a held 'streambuf' and is not in // a failed seek state, the object is in a valid state. // ///Usage ///----- // ///Example 1: Detecting invalid UTF-8 read from a 'streambuf': /// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Suppose one has a 'streambuf', 'myStreamBuf' containing UTF-8 that one wants // to read, checking that it is valid UTF-8. // // First, create a 'Utf8CheckingInStreamBufWrapper' that will wrap // 'myStreamBuf': //.. // typedef bdlde::Utf8CheckingInStreamBufWrapper Obj; // Obj wrapper; // wrapper.reset(&myStreamBuf); //.. // Then, read the data from the 'wrapper' 'streambuf' until it stops yielding // data. //.. // std::string s; // bsl::streamsize len = 0, bytesRead; // do { // enum { k_READ_CHUNK = 10 }; // // s.resize(len + k_READ_CHUNK); // // bytesRead = wrapper.sgetn(&s[len], k_READ_CHUNK); // // assert(0 <= bytesRead); // assert(bytesRead <= k_READ_CHUNK); // // s.resize((len += bytesRead)); // } while (0 < bytesRead); // // assert(wrapper.pubseekoff(0, bsl::ios_base::cur) == Obj::pos_type(len)); //.. // Next, use the 'errorStatus' accessor and 'pubseekoff' manipulator to see // what, if anything, went wrong and where. //.. // const int es = wrapper.errorStatus(); // // if (0 == es) { // cout << "No errors occurred.\n"; // } // else if (es < 0) { // cout << "Incorrect UTF-8 encountered " << Obj::toAscii(es) << // " at offset " << wrapper.pubseekoff(0, bsl::ios_base::cur) << endl; // } // else { // cout << "Non-UTF-8 error " << Obj::toAscii(es) << endl; // } //.. // Now, we observe the output: //.. // Incorrect UTF-8 encountered UNEXPECTED_CONTINUATION_OCTET at offset 79 //.. // Finally, we observe that all the data from 'myStreamBuf' up to offset 79 // was read into 's', and that it's all correct UTF-8. //.. // assert(len == s.end() - s.begin()); // assert(bdlde::Utf8Util::isValid(&s[0], len)); //.. #include <bdlscm_version.h> #include <bslma_allocator.h> #include <bslma_usesbslmaallocator.h> #include <bslmf_nestedtraitdeclaration.h> #include <bsls_keyword.h> #include <bsls_types.h> #include <bsl_ios.h> // 'streamsize' #include <bsl_locale.h> #include <bsl_streambuf.h> // 'char_type', 'int_type', 'pos_type', 'off_type', // 'traits_type' are within the 'bsl::streambuf' // class namespace BloombergLP { namespace bdlde { // ==================================== // class Utf8CheckingInStreamBufWrapper // ==================================== class Utf8CheckingInStreamBufWrapper : public bsl::streambuf { // This 'class' inherits from 'bsl::streambuf', and holds and wraps another // 'streambuf'. It forwards input through the held streambuf, and checks // for invalid UTF-8. The wrapping object does not support ouput, only // input. If the held 'streambuf' supports seeking, seeks are supported, // though not forward seeks, and 'pubseekoff(0, bsl::ios_base::cur)' is // supported whether the wrapped 'streambuf' supports seeking or not. // PRIVATE TYPES typedef bsls::Types::IntPtr IntPtr; // A signed integral type the size of // a pointer enum { k_PBACK_BUF_SIZE = 8, // size of putback buffer k_BUF_SIZE = 8 * 1024 // input buffer size }; public: // PUBLIC TYPES enum { k_SEEK_FAIL = +1 // seek failure }; private: // DATA bsl::streambuf *d_heldStreamBuf_p; // the 'streambuf' that this object // wraps around, which is held, not // owned. int d_errorStatus; // The error status of this object. //: o A value from //: 'Utf8Util::ErrorStatus' if a //: UTF-8 error has occurred. Note //: that these are all -ve values. //: //: o 'k_SEEK_FAIL' (positive) //: if a seek error has occurred //: //: o 0 if no error has occured, //: including if end of file has //: been reached int d_bufEndStatus; // status at the end of the buffer, // which may not have been reached // yet char_type *d_buf_p; // input buffer char d_pBackBuf[k_PBACK_BUF_SIZE]; // for putback mode (see above) char_type *d_savedEback_p; // only used in putback-mode, the // saved value of non-putback mode // 'eback' from the base class (note // that when we enter putback-mode, // 'eback() == gptr()', so it's not // necessary to have a // 'd_savedGptr_p') char_type *d_savedEgptr_p; // only used in putback-mode, the // saved value of non-putback mode // 'egptr' from the base class pos_type d_offset; // in non-putback mode, the offset of // 'eback()', in putback mode, the // offset of 'egptr()' bool d_seekable; // 'true' if held 'streambuf' is // seekable and 'false' otherwise bool d_putBackMode; // 'true' if we're in putback mode, // 'false' if normal input bslma::Allocator *d_allocator_p; // used for allocation of 'd_buf_p' public: // TRAITS BSLMF_NESTED_TRAIT_DECLARATION(Utf8CheckingInStreamBufWrapper, bslma::UsesBslmaAllocator); private: // PRIVATE MANIPULATOR pos_type setSeekFailure(bsl::ios_base::openmode mode); // Set the state of this object to the failed seek state and return a // negative position, if the held 'streambuf' is seekable, pass the // specified 'mode' to a seek to the beginning of the file. protected: // PROTECTED MANIPULATORS // implementation functions // The following member functions are virtual and protected. They are part // of the implementation and are called by other functions in this class or // by functions in the base class. These functions have no corresponding // public member functions that call them. int_type overflow(int_type = traits_type::eof()) BSLS_KEYWORD_OVERRIDE; // Unconditionally return 'traits_type::eof()'. The optionally // specified argument is ignored. bsl::streamsize showmanyc() BSLS_KEYWORD_OVERRIDE; // Return the number of bytes that are guaranteed that can be read // before 'underflow' returns 'eof'. If the object is not in a valid // state, -1 will be returned. Note that often, the actual number of // bytes that can be read will be much greater than the value returned // by this function. int_type underflow() BSLS_KEYWORD_OVERRIDE; // Replenish the input buffer with data obtained from the held // 'streambuf', and return the next byte of input (or 'eof' if no input // is available). This function assumes that either the input buffer // is empty or that the end of it has been reached. If this object is // not in a valid state, 'eof' will be returned. // functions forwarded to by // corresponding public functions // The following protected virtual functions all have corresponding public // methods in the base class that forward to them. void imbue(const bsl::locale& locale) BSLS_KEYWORD_OVERRIDE; // If 'sb' is the name of the 'streambuf' held by this object, set 'sb' // to the specified 'locale' as though 'sb.pubimbue(locale)' had been // called. If this object does not hold a 'streambuf', this method has // no effect. Note that this function is forwarded to by the public // method 'pubimbue' in the base class. int_type pbackfail(int_type c = traits_type::eof()) BSLS_KEYWORD_OVERRIDE; // Back up input one byte. Return the byte at the new position, or // 'eof' with the state of this object unchanged on failure. If the // optionally specified 'c' is not 'eof', substitute 'c' for the // previous byte and return that value. If 'c' is 'eof', do not // substitute it for the previous byte and return the byte was there, // or if the previous byte is unknown, fail. If values of 'c' that are // not 'eof' are specified, this function will succeed for at least 8 // successive calls, possibly many more times. The behavior is // undefined unless 'c' is either 'eof' or a value representable as a // 'char_type'. Note that this is forwarded to with a 'char_type' // passed to 'c' by the public method 'sputbackc' in the base class, // and called with 'eof' passed to 'c' by the public method 'sungetc' // in the base class. pos_type seekoff(off_type offset, bsl::ios_base::seekdir whence, bsl::ios_base::openmode mode) BSLS_KEYWORD_OVERRIDE; // Move the position associated with this object according to the // specified 'offset' and 'whence': // //: o If 'whence' is 'bsl::ios_base::beg', set the position to 'offset' //: bytes from the beginning. //: o If 'whence' is 'bsl::ios_base::cur', advance the position by //: 'offset' bytes (note that 'offset' is signed). //: o 'whence == bsl::ios_base::end' is unsupported and a seek fail //: will result. // // A seek can fail if // //: o the object was already in a failed seek state and the seek was //: not an absolute seek to the beginning, //: //: o the object is not bound to a held 'streambuf', //: //: o 'whence' is not 'bsl::ios_base::beg' or 'bsl::ios_base::cur', //: //: o the destination is negative, //: //: o the destination is forward of the current position, or //: //: o a seek on the held 'streambuf' is necessary and that 'streambuf' //: does not support seeking, // // which will put the object into a 'failed seek state'. When the // object is in a failed seek state, 'errorStatus()' will equal // 'k_SEEK_FAIL' and the object will no longer have a valid position, // meaning that input and relative seeks will fail, until the object is // made to recover by either calling 'reset' or an absolute seek to // position 0. // // If a seek is performed on the held 'streambuf', the specified 'mode' // will be propagated to it. The behavior is undefined unless // 'bsl::ios_base::in' is set in 'mode'. Note that this function is // forwarded to by the public method 'pubseekoff' in the base class. // // 'seekoff(0, bsl::ios_base::cur, mode)' is permissible whether the // held 'streambuf' is seekable or not and will never result in a seek // on the held 'streambuf', returning the position in terms of the held // 'streambuf' if that 'streambuf' is seekable and returning the // position relative to when the held 'streambuf' was bound to this // object otherwise. // // Some non-zero seeks will be performed without a seek on the held // 'streambuf', but there is no simple way for the client to predict // when this will be the case. pos_type seekpos(pos_type offset, bsl::ios_base::openmode mode) BSLS_KEYWORD_OVERRIDE; // Set the position of this object to the specified absolute 'offset'. // If a seek on the held 'streambuf' occurs, the specified 'mode' is // passed to it. This function delegates to // 'seekoff(offset, bsl::ios_base::beg, mode)', see that function for // further detail. The behavior is undefined unless // 'bsl::ios_base::in' is set in 'mode'. Note that this function is // forwarded to by the public method 'pubseekpos' in the base class. bsl::streamsize xsgetn(char *buffer, bsl::streamsize numBytes) BSLS_KEYWORD_OVERRIDE; // Read up to the specified 'numBytes' characters from this object to // the specified 'buffer' and return the number of characters // successfully read. A return value of 0 means that either a UTF-8 // error or end of file has been encountered ('errorStatus' must be // called to distinguish between the two), but a non-zero return value // less than 'numBytes' will usually be returned when neither end of // file nor a UTF-8 error has been encountered. The behavior is // undefined unless '4 <= numBytes'. Note that this function is // forwarded to by the public method 'sgetn' in the base class. bsl::streamsize xsputn(const char *, bsl::streamsize ) BSLS_KEYWORD_OVERRIDE; // Output function, not supported in this input-only implementation; // stubbed out, arguments ignored, returns 0. Note that this function // is forwarded to by 'sputn' in the base class. private: // NOT IMPLEMENTED Utf8CheckingInStreamBufWrapper(const Utf8CheckingInStreamBufWrapper&) BSLS_KEYWORD_DELETED; Utf8CheckingInStreamBufWrapper& operator=( const Utf8CheckingInStreamBufWrapper&) BSLS_KEYWORD_DELETED; public: // CLASS METHODS static const char *toAscii(int errorStatus); // Return a description of the specified 'errorStatus'. Note that // 'errorStatus' is either: //: o 'k_SEEK_FAIL' //: //: o A value from 'Utf8Util::ErrorStatus', which are all negative, in //: the case of invalid UTF-8. //: //: o 0 if no errors have occurred, in which case 'NO_ERROR' will be //: returned. Note that this includes the case where end of file has //: been reached without any error occurring. //: //: o If 'errorStatus' is an invalid value, "(* unrecognized value *)" //: will be returned. // CREATORS Utf8CheckingInStreamBufWrapper(); explicit Utf8CheckingInStreamBufWrapper(bslma::Allocator *basicAllocator); // Create a 'Utf8StreamBufInputWrapper' object having no associated // 'streambuf'. Optionally specify a 'basicAllocator' used to supply // memory. If 'basicAllocator' is 0 or not specified, the currently // installed default allocator is used. explicit Utf8CheckingInStreamBufWrapper(bsl::streambuf *streamBuf, bslma::Allocator *basicAllocator = 0); // Create a 'Utf8StreamBufInputWrapper' associated with the specified // 'streamBuf'. Optionally specify a 'basicAllocator' used to supply // memory. If 'basicAllocator' is 0, the currently installed default // allocator is used. ~Utf8CheckingInStreamBufWrapper(); // Destroy this object. // MANIPULATOR void reset(bsl::streambuf *streamBuf); // Associate this object with the specified 'streamBuf', releasing any // previously held 'streambuf'. // ACCESSORS int errorStatus() const; // Return the current error mode of this object. This will be either 0 // (no errors or end of data), 'k_SEEK_FAIL', which is positive, or a // value from 'Utf8Util::ErrorStatus', which are all negative. bool isValid() const; // Return 'true' if this wrapper currently holds a 'streambuf' and is // not in a failed seek state. }; // ============================================================================ // INLINE DEFINITIONS // ============================================================================ // ACCESSORS inline int Utf8CheckingInStreamBufWrapper::errorStatus() const { return d_errorStatus; } inline bool Utf8CheckingInStreamBufWrapper::isValid() const { return d_heldStreamBuf_p && k_SEEK_FAIL != d_errorStatus; } } // close package namespace } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Note that this implementation is derived from 'bdls::FdStreamBuf' which is // based on STLPort's implementation of 'filebuf', with copyright notice as // follows: // // Adapted to bde from STLport, 2009 // 'bdls::FdStreamBuf' from 'bsl::filebuf' // 'bdls::FdStreamBuf_FileHandler' from 'bsl::_Filebuf_base' // // Copyright (c) 1999 // Silicon Graphics Computer Systems, Inc. // // Copyright (c) 1999 // Boris Fomitchev // // This material is provided "as is", with absolutely no warranty expressed // or implied. Any use is at your own risk. // // Permission to use or copy this software for any purpose is hereby granted // without fee, provided the above notices are retained on all copies. // Permission to modify the code and to distribute modified code is granted, // provided the above notices are retained, and a notice that the code was // modified is included with the above copyright notice. // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- // Copyright 2020 Bloomberg Finance L.P. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------