// bdlsta_linefit.h -*-C++-*- #ifndef INCLUDED_BDLSTA_LINEFIT #define INCLUDED_BDLSTA_LINEFIT #include <bsls_ident.h> BSLS_IDENT("$Id: $") // BDE_VERIFY pragma: -LL01 // Link is just too long //@PURPOSE: Online algorithm for computing the least squares regression line. // //@CLASSES: // bdlsta::LineFit: online calculation of least squares regression line // //@DESCRIPTION: This component provides a mechanism, 'bdlsta::LineFit', that // provides online calculation of the least squares line fit. Online // algorithms process the data in one pass, while maintaining accuracy. The // online algorithm used is developed in the implementation notes (it is // similar to the Welford online algorithm for computing variance). The // formulae for line fit are taken from: // https://en.wikipedia.org/wiki/Simple_linear_regression#Fitting_the_regression_line // // Note that the behavior is undefined if there are less than 2 data points, or // if all the X's (dependent variable) are the same. // ///Usage ///----- // This section illustrates intended use of this component. // ///Example 1: Calculating line fit, variance, and mean ///- - - - - - - - - - - - - - - - // This example shows how to accumulate a set of values, and calculate the // line fit parameters, variance, and mean. // // First, we create example input and instantiate the appropriate mechanism: //.. // double inputX[] = { 1.0, 2.0, 4.0, 5.0 }; // double inputY[] = { 1.0, 2.0, 4.0, 4.5 }; // bdlsta::LineFit lineFit; //.. // Then, we invoke the 'add' routine to accumulate the data: //.. // for(int i = 0; i < 4; ++i) { // lineFit.add(inputX[i], inputY[i]); // } //.. // Finally, we assert that the alpha, beta, variance, and mean are what we // expect: //.. // double alpha, beta; // ASSERT(4 == lineFit.count()); // ASSERT(3.0 == lineFit.xMean()); // ASSERT(1e-3 > fabs(2.875 - lineFit.yMean())); // ASSERT(1e-3 > fabs(3.33333 - lineFit.variance())); // ASSERT(0 == lineFit.fitIfValid(&alpha, &beta)); // ASSERT(1e-3 > fabs(0.175 - alpha)); // ASSERT(1e-3 > fabs(0.9 - beta )); //.. // BDE_VERIFY pragma: +LL01 #include <bdlscm_version.h> #include <bsl_cmath.h> #include <bsls_assert.h> #include <bsls_review.h> namespace BloombergLP { namespace bdlsta { // ============= // class LineFit // ============= class LineFit { // This class provides an efficient online algorithm for calculating linear // square line fit. The class also calculates the mean for the X's and // Y's, and variance for the X's. These are byproducts of calculating the // line fit. The online algorithm is detailed in the implementation notes. private: // DATA int d_count; // Number of data points. double d_xMean; // Mean of X's. double d_xSum; // Sum of X's. double d_ySum; // Sum of Y's. double d_M2; // 2nd moment double d_xySum; // Sum of Xi*Yi public: // CONSTANTS enum { e_SUCCESS = 0, e_INADEQUATE_DATA = -1 }; // CREATORS LineFit(); // Create an empty 'LineFit' object. // MANIPULATORS void add(double xValue, double yValue); // Add the specified '(xValue, yValue)' point to the data set. // ACCESSORS int count() const; // Returns the number of elements in the data set. void fit(double *alpha, double *beta) const; // Calculate line fit coefficients 'Y = Alpha + Beta * X', and populate // the specified 'alpha' (intercept) and 'beta' (slope). The behavior // is undefined if '2 > count' or all X's are identical. int fitIfValid(double *alpha, double *beta) const; // Calculate line fit coefficients 'Y = Alpha + Beta * X', and populate // the specified 'alpha' (intercept) and 'beta' (slope). Return 0 on // success, and non-zero otherwise. The computations is unsuccessful // if '2 > count' or all X's are identical. double variance() const; // Return the variance of the data set X's. The behavior is undefined // unless '2 <= count'. int varianceIfValid(double *result) const; // Load into the specified 'result', the variance of the data set X's. // Return 0 on success, and a non-zero value otherwise. Specifically, // 'e_INADEQUATE_DATA' is returned if '2 > count'. double xMean() const; // Return the mean of the data set X's. The behavior is undefined // unless '1 <= count'. int xMeanIfValid(double *result) const; // Load into the specified 'result', the mean of the data set X's. // Return 0 on success, and a non-zero value otherwise. Specifically, // 'e_INADEQUATE_DATA' is returned if '1 > count'. double yMean() const; // Return the mean of the data set Y's. The behavior is undefined // unless '1 <= count'. int yMeanIfValid(double *result) const; // Load into the specified 'result', the mean of the data set Y's. // Return 0 on success, and a non-zero value otherwise. Specifically, // 'e_INADEQUATE_DATA' is returned if '1 > count'. }; // ============================================================================ // INLINE DEFINITIONS // ============================================================================ // --------------------- // class bdlsta::LineFit // --------------------- // CREATORS inline LineFit::LineFit() : d_count(0) , d_xMean(0.0) , d_xSum(0.0) , d_ySum(0.0) , d_M2(0.0) , d_xySum(0.0) { } // MANIPULATORS inline void LineFit::add(double xValue, double yValue) { const double delta = xValue - d_xMean; ++d_count; d_xSum += xValue; d_ySum += yValue; d_xMean = d_xSum / static_cast<double>(d_count); const double delta2 = xValue - d_xMean; d_M2 += delta * delta2; d_xySum += xValue * yValue; } // ACCESSORS inline int LineFit::count() const { return d_count; } inline void LineFit::fit(double *alpha, double *beta) const { BSLS_ASSERT(2 <= d_count && 0.0 != d_M2); const double n = static_cast<double>(d_count); double tmpBeta = (d_xySum - d_xSum * d_ySum / n) / d_M2; *beta = tmpBeta; *alpha = (d_ySum - d_xSum * tmpBeta) / n; } inline int LineFit::fitIfValid(double *alpha, double *beta) const { if (2 > d_count || 0.0 == d_M2) { return e_INADEQUATE_DATA; // RETURN } const double n = static_cast<double>(d_count); double tmpBeta = (d_xySum - d_xSum * d_ySum / n) / d_M2; *beta = tmpBeta; *alpha = (d_ySum - d_xSum * tmpBeta) / n; return 0; } inline double LineFit::variance() const { BSLS_ASSERT(2 <= d_count); return d_M2 / (d_count - 1); } inline int LineFit::varianceIfValid(double *result) const { if (2 > d_count) { return e_INADEQUATE_DATA; // RETURN } *result = variance(); return 0; } inline double LineFit::xMean() const { BSLS_ASSERT(1 <= d_count); return d_xSum / static_cast<double>(d_count); } inline int LineFit::xMeanIfValid(double *result) const { if (1 > d_count) { return e_INADEQUATE_DATA; // RETURN } *result = xMean(); return 0; } inline double LineFit::yMean() const { BSLS_ASSERT(1 <= d_count); return d_ySum / static_cast<double>(d_count); } inline int LineFit::yMeanIfValid(double *result) const { if (1 > d_count) { return e_INADEQUATE_DATA; // RETURN } *result = yMean(); return 0; } } // close package namespace } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Copyright 2017 Bloomberg Finance L.P. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------