Coverage Report - org.apache.commons.math.stat.regression.SimpleRegression

Classes in this File Line Coverage Branch Coverage Complexity
SimpleRegression
100% 
100% 
1.542

 1  
 /*
 2  
  * Copyright 2003-2004 The Apache Software Foundation.
 3  
  *
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  *
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  *
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.math.stat.regression;
 18  
 import java.io.Serializable;
 19  
 
 20  
 import org.apache.commons.math.MathException;
 21  
 import org.apache.commons.math.distribution.DistributionFactory;
 22  
 import org.apache.commons.math.distribution.TDistribution;
 23  
 
 24  
 /**
 25  
  * Estimates an ordinary least squares regression model
 26  
  * with one independent variable.
 27  
  * <p>
 28  
  * <code> y = intercept + slope * x  </code>
 29  
  * <p>
 30  
  * Standard errors for <code>intercept</code> and <code>slope</code> are 
 31  
  * available as well as ANOVA, r-square and Pearson's r statistics.
 32  
  * <p>
 33  
  * Observations (x,y pairs) can be added to the model one at a time or they 
 34  
  * can be provided in a 2-dimensional array.  The observations are not stored
 35  
  * in memory, so there is no limit to the number of observations that can be
 36  
  * added to the model. 
 37  
  * <p>
 38  
  * <strong>Usage Notes</strong>: <ul>
 39  
  * <li> When there are fewer than two observations in the model, or when
 40  
  * there is no variation in the x values (i.e. all x values are the same) 
 41  
  * all statistics return <code>NaN</code>. At least two observations with
 42  
  * different x coordinates are requred to estimate a bivariate regression 
 43  
  * model.
 44  
  * </li>
 45  
  * <li> getters for the statistics always compute values based on the current
 46  
  * set of observations -- i.e., you can get statistics, then add more data
 47  
  * and get updated statistics without using a new instance.  There is no 
 48  
  * "compute" method that updates all statistics.  Each of the getters performs
 49  
  * the necessary computations to return the requested statistic.</li>
 50  
  * </ul>
 51  
  *
 52  
  * @version $Revision$ $Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) $
 53  
  */
 54  
 public class SimpleRegression implements Serializable {
 55  
 
 56  
     /** Serializable version identifier */
 57  
     static final long serialVersionUID = -3004689053607543335L;
 58  
 
 59  
     /** sum of x values */
 60  20
     private double sumX = 0d;
 61  
 
 62  
     /** total variation in x (sum of squared deviations from xbar) */
 63  20
     private double sumXX = 0d;
 64  
 
 65  
     /** sum of y values */
 66  20
     private double sumY = 0d;
 67  
 
 68  
     /** total variation in y (sum of squared deviations from ybar) */
 69  20
     private double sumYY = 0d;
 70  
 
 71  
     /** sum of products */
 72  20
     private double sumXY = 0d;
 73  
 
 74  
     /** number of observations */
 75  20
     private long n = 0;
 76  
 
 77  
     /** mean of accumulated x values, used in updating formulas */
 78  20
     private double xbar = 0;
 79  
 
 80  
     /** mean of accumulated y values, used in updating formulas */
 81  20
     private double ybar = 0;
 82  
 
 83  
     // ---------------------Public methods--------------------------------------
 84  
 
 85  
     /**
 86  
      * Create an empty SimpleRegression instance
 87  
      */
 88  
     public SimpleRegression() {
 89  20
         super();
 90  20
     }
 91  
     
 92  
     /**
 93  
      * Adds the observation (x,y) to the regression data set.
 94  
      * <p>
 95  
      * Uses updating formulas for means and sums of squares defined in 
 96  
      * "Algorithms for Computing the Sample Variance: Analysis and
 97  
      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 
 98  
      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
 99  
      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
 100  
      *
 101  
      *
 102  
      * @param x independent variable value
 103  
      * @param y dependent variable value
 104  
      */
 105  
     public void addData(double x, double y) {
 106  886
         if (n == 0) {
 107  22
             xbar = x;
 108  22
             ybar = y;
 109  
         } else {
 110  864
             double dx = x - xbar;
 111  864
             double dy = y - ybar;
 112  864
             sumXX += dx * dx * (double) n / (double) (n + 1.0);
 113  864
             sumYY += dy * dy * (double) n / (double) (n + 1.0);
 114  864
             sumXY += dx * dy * (double) n / (double) (n + 1.0);
 115  864
             xbar += dx / (double) (n + 1.0);
 116  864
             ybar += dy / (double) (n + 1.0);
 117  
         }
 118  886
         sumX += x;
 119  886
         sumY += y;
 120  886
         n++;
 121  886
     }
 122  
 
 123  
     /**
 124  
      * Adds the observations represented by the elements in 
 125  
      * <code>data</code>.
 126  
      * <p>
 127  
      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
 128  
      * <code>(data[1][0],data[1][1])</code>, etc. 
 129  
      * <p> 
 130  
      * This method does not replace data that has already been added.  The
 131  
      * observations represented by <code>data</code> are added to the existing
 132  
      * dataset.
 133  
      * <p> 
 134  
      * To replace all data, use <code>clear()</code> before adding the new 
 135  
      * data.
 136  
      * 
 137  
      * @param data array of observations to be added
 138  
      */
 139  
     public void addData(double[][] data) {
 140  216
         for (int i = 0; i < data.length; i++) {
 141  204
             addData(data[i][0], data[i][1]);
 142  
         }
 143  12
     }
 144  
 
 145  
     /**
 146  
      * Clears all data from the model.
 147  
      */
 148  
     public void clear() {
 149  2
         sumX = 0d;
 150  2
         sumXX = 0d;
 151  2
         sumY = 0d;
 152  2
         sumYY = 0d;
 153  2
         sumXY = 0d;
 154  2
         n = 0;
 155  2
     }
 156  
 
 157  
     /**
 158  
      * Returns the number of observations that have been added to the model.
 159  
      *
 160  
      * @return n number of observations that have been added.
 161  
      */
 162  
     public long getN() {
 163  10
         return n;
 164  
     }
 165  
 
 166  
     /**
 167  
      * Returns the "predicted" <code>y</code> value associated with the 
 168  
      * supplied <code>x</code> value,  based on the data that has been
 169  
      * added to the model when this method is activated.
 170  
      * <p>
 171  
      * <code> predict(x) = intercept + slope * x </code>
 172  
      * <p>
 173  
      * <strong>Preconditions</strong>: <ul>
 174  
      * <li>At least two observations (with at least two different x values)
 175  
      * must have been added before invoking this method. If this method is 
 176  
      * invoked before a model can be estimated, <code>Double,NaN</code> is
 177  
      * returned.
 178  
      * </li></ul>
 179  
      *
 180  
      * @param x input <code>x</code> value
 181  
      * @return predicted <code>y</code> value
 182  
      */
 183  
     public double predict(double x) {
 184  10
         double b1 = getSlope();
 185  10
         return getIntercept(b1) + b1 * x;
 186  
     }
 187  
 
 188  
     /**
 189  
      * Returns the intercept of the estimated regression line.
 190  
      * <p>
 191  
      * The least squares estimate of the intercept is computed using the 
 192  
      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
 193  
      * The intercept is sometimes denoted b0. 
 194  
      * <p>
 195  
      * <strong>Preconditions</strong>: <ul>
 196  
      * <li>At least two observations (with at least two different x values)
 197  
      * must have been added before invoking this method. If this method is 
 198  
      * invoked before a model can be estimated, <code>Double,NaN</code> is
 199  
      * returned.
 200  
      * </li></ul>
 201  
      *
 202  
      * @return the intercept of the regression line
 203  
      */
 204  
     public double getIntercept() {
 205  8
         return getIntercept(getSlope());
 206  
     }
 207  
 
 208  
     /**
 209  
     * Returns the slope of the estimated regression line.  
 210  
     * <p>
 211  
     * The least squares estimate of the slope is computed using the 
 212  
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
 213  
     * The slope is sometimes denoted b1. 
 214  
     * <p>
 215  
     * <strong>Preconditions</strong>: <ul>
 216  
     * <li>At least two observations (with at least two different x values)
 217  
     * must have been added before invoking this method. If this method is 
 218  
     * invoked before a model can be estimated, <code>Double.NaN</code> is
 219  
     * returned.
 220  
     * </li></ul>
 221  
     *
 222  
     * @return the slope of the regression line
 223  
     */
 224  
     public double getSlope() {
 225  118
         if (n < 2) {
 226  14
             return Double.NaN; //not enough data 
 227  
         }
 228  104
         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
 229  14
             return Double.NaN; //not enough variation in x
 230  
         }
 231  90
         return sumXY / sumXX;
 232  
     }
 233  
 
 234  
     /**
 235  
      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
 236  
      * sum of squared errors</a> (SSE) associated with the regression 
 237  
      * model.
 238  
      * <p>
 239  
      * <strong>Preconditions</strong>: <ul>
 240  
      * <li>At least two observations (with at least two different x values)
 241  
      * must have been added before invoking this method. If this method is 
 242  
      * invoked before a model can be estimated, <code>Double,NaN</code> is
 243  
      * returned.
 244  
      * </li></ul>
 245  
      *
 246  
      * @return sum of squared errors associated with the regression model
 247  
      */
 248  
     public double getSumSquaredErrors() {
 249  48
         return getSumSquaredErrors(getSlope());
 250  
     }
 251  
 
 252  
     /**
 253  
      * Returns the sum of squared deviations of the y values about their mean.
 254  
      * <p>
 255  
      * This is defined as SSTO 
 256  
      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
 257  
      * <p>
 258  
      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
 259  
      *
 260  
      * @return sum of squared deviations of y values
 261  
      */
 262  
     public double getTotalSumSquares() {
 263  26
         if (n < 2) {
 264  6
             return Double.NaN;
 265  
         }
 266  20
         return sumYY;
 267  
     }
 268  
 
 269  
     /**
 270  
      * Returns the sum of squared deviations of the predicted y values about 
 271  
      * their mean (which equals the mean of y).
 272  
      * <p>
 273  
      * This is usually abbreviated SSR or SSM.  It is defined as SSM 
 274  
      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
 275  
      * <p>
 276  
      * <strong>Preconditions</strong>: <ul>
 277  
      * <li>At least two observations (with at least two different x values)
 278  
      * must have been added before invoking this method. If this method is 
 279  
      * invoked before a model can be estimated, <code>Double.NaN</code> is
 280  
      * returned.
 281  
      * </li></ul>
 282  
      *
 283  
      * @return sum of squared deviations of predicted y values
 284  
      */
 285  
     public double getRegressionSumSquares() {
 286  8
         return getRegressionSumSquares(getSlope());
 287  
     }
 288  
 
 289  
     /**
 290  
      * Returns the sum of squared errors divided by the degrees of freedom,
 291  
      * usually abbreviated MSE. 
 292  
      * <p>
 293  
      * If there are fewer than <strong>three</strong> data pairs in the model,
 294  
      * or if there is no variation in <code>x</code>, this returns 
 295  
      * <code>Double.NaN</code>.
 296  
      *
 297  
      * @return sum of squared deviations of y values
 298  
      */
 299  
     public double getMeanSquareError() {
 300  58
         if (n < 3) {
 301  18
             return Double.NaN;
 302  
         }
 303  40
         return getSumSquaredErrors() / (double) (n - 2);
 304  
     }
 305  
 
 306  
     /**
 307  
      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
 308  
      * Pearson's product moment correlation coefficient</a>,
 309  
      * usually denoted r. 
 310  
      * <p>
 311  
      * <strong>Preconditions</strong>: <ul>
 312  
      * <li>At least two observations (with at least two different x values)
 313  
      * must have been added before invoking this method. If this method is 
 314  
      * invoked before a model can be estimated, <code>Double,NaN</code> is
 315  
      * returned.
 316  
      * </li></ul>
 317  
      *
 318  
      * @return Pearson's r
 319  
      */
 320  
     public double getR() {
 321  8
         double b1 = getSlope();
 322  8
         double result = Math.sqrt(getRSquare(b1));
 323  8
         if (b1 < 0) {
 324  2
             result = -result;
 325  
         }
 326  8
         return result;
 327  
     }
 328  
 
 329  
     /** 
 330  
      * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
 331  
      * coefficient of determination</a>,
 332  
      * usually denoted r-square. 
 333  
      * <p>
 334  
      * <strong>Preconditions</strong>: <ul>
 335  
      * <li>At least two observations (with at least two different x values)
 336  
      * must have been added before invoking this method. If this method is 
 337  
      * invoked before a model can be estimated, <code>Double,NaN</code> is
 338  
      * returned.
 339  
      * </li></ul>
 340  
      *
 341  
      * @return r-square
 342  
      */
 343  
     public double getRSquare() {
 344  12
         return getRSquare(getSlope());
 345  
     }
 346  
 
 347  
     /**
 348  
      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
 349  
      * standard error of the intercept estimate</a>, 
 350  
      * usually denoted s(b0). 
 351  
      * <p>
 352  
      * If there are fewer that <strong>three</strong> observations in the 
 353  
      * model, or if there is no variation in x, this returns 
 354  
      * <code>Double.NaN</code>.
 355  
      *
 356  
      * @return standard error associated with intercept estimate
 357  
      */
 358  
     public double getInterceptStdErr() {
 359  14
         return Math.sqrt(
 360  
             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
 361  
     }
 362  
 
 363  
     /**
 364  
      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
 365  
      * error of the slope estimate</a>,
 366  
      * usually denoted s(b1). 
 367  
      * <p>
 368  
      * If there are fewer that <strong>three</strong> data pairs in the model,
 369  
      * or if there is no variation in x, this returns <code>Double.NaN</code>.
 370  
      *
 371  
      * @return standard error associated with slope estimate
 372  
      */
 373  
     public double getSlopeStdErr() {
 374  34
         return Math.sqrt(getMeanSquareError() / sumXX);
 375  
     }
 376  
 
 377  
     /**
 378  
      * Returns the half-width of a 95% confidence interval for the slope
 379  
      * estimate.
 380  
      * <p>
 381  
      * The 95% confidence interval is 
 382  
      * <p>
 383  
      * <code>(getSlope() - getSlopeConfidenceInterval(), 
 384  
      * getSlope() + getSlopeConfidenceInterval())</code>
 385  
      * <p>
 386  
      * If there are fewer that <strong>three</strong> observations in the 
 387  
      * model, or if there is no variation in x, this returns 
 388  
      * <code>Double.NaN</code>.
 389  
      * <p>
 390  
      * <strong>Usage Note</strong>:<br>
 391  
      * The validity of this statistic depends on the assumption that the 
 392  
      * observations included in the model are drawn from a
 393  
      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
 394  
      * Bivariate Normal Distribution</a>.
 395  
      *
 396  
      * @return half-width of 95% confidence interval for the slope estimate
 397  
      * 
 398  
      * @throws MathException if the confidence interval can not be computed.
 399  
      */
 400  
     public double getSlopeConfidenceInterval() throws MathException {
 401  6
         return getSlopeConfidenceInterval(0.05d);
 402  
     }
 403  
 
 404  
     /**
 405  
      * Returns the half-width of a (100-100*alpha)% confidence interval for 
 406  
      * the slope estimate.
 407  
      * <p>
 408  
      * The (100-100*alpha)% confidence interval is 
 409  
      * <p>
 410  
      * <code>(getSlope() - getSlopeConfidenceInterval(), 
 411  
      * getSlope() + getSlopeConfidenceInterval())</code>
 412  
      * <p>
 413  
      * To request, for example, a 99% confidence interval, use 
 414  
      * <code>alpha = .01</code>
 415  
      * <p>
 416  
      * <strong>Usage Note</strong>:<br>
 417  
      * The validity of this statistic depends on the assumption that the 
 418  
      * observations included in the model are drawn from a
 419  
      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
 420  
      * Bivariate Normal Distribution</a>.
 421  
      * <p>
 422  
      * <strong> Preconditions:</strong><ul>
 423  
      * <li>If there are fewer that <strong>three</strong> observations in the 
 424  
      * model, or if there is no variation in x, this returns 
 425  
      * <code>Double.NaN</code>. 
 426  
      * </li>
 427  
      * <li><code>(0 < alpha < 1)</code>; otherwise an 
 428  
      * <code>IllegalArgumentException</code> is thrown.
 429  
      * </li></ul>    
 430  
      *
 431  
      * @param alpha the desired significance level 
 432  
      * @return half-width of 95% confidence interval for the slope estimate
 433  
      * @throws MathException if the confidence interval can not be computed.
 434  
      */
 435  
     public double getSlopeConfidenceInterval(double alpha)
 436  
         throws MathException {
 437  10
         if (alpha >= 1 || alpha <= 0) {
 438  2
             throw new IllegalArgumentException();
 439  
         }
 440  8
         return getSlopeStdErr() *
 441  
             getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);
 442  
     }
 443  
 
 444  
     /**
 445  
      * Returns the significance level of the slope (equiv) correlation. 
 446  
      * <p>
 447  
      * Specifically, the returned value is the smallest <code>alpha</code>
 448  
      * such that the slope confidence interval with significance level
 449  
      * equal to <code>alpha</code> does not include <code>0</code>.
 450  
      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
 451  
      * <p>
 452  
      * <strong>Usage Note</strong>:<br>
 453  
      * The validity of this statistic depends on the assumption that the 
 454  
      * observations included in the model are drawn from a
 455  
      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
 456  
      * Bivariate Normal Distribution</a>.
 457  
      * <p>
 458  
      * If there are fewer that <strong>three</strong> observations in the 
 459  
      * model, or if there is no variation in x, this returns 
 460  
      * <code>Double.NaN</code>.
 461  
      *
 462  
      * @return significance level for slope/correlation
 463  
      * @throws MathException if the significance level can not be computed.
 464  
      */
 465  
     public double getSignificance() throws MathException {
 466  12
         return 2d* (1.0 - getTDistribution().cumulativeProbability(
 467  
                     Math.abs(getSlope()) / getSlopeStdErr()));
 468  
     }
 469  
 
 470  
     // ---------------------Private methods-----------------------------------
 471  
 
 472  
     /**
 473  
     * Returns the intercept of the estimated regression line, given the slope.
 474  
     * <p>
 475  
     * Will return <code>NaN</code> if slope is <code>NaN</code>.
 476  
     *
 477  
     * @param slope current slope
 478  
     * @return the intercept of the regression line
 479  
     */
 480  
     private double getIntercept(double slope) {
 481  18
         return (sumY - slope * sumX) / ((double) n);
 482  
     }
 483  
 
 484  
     /**
 485  
      * Returns the sum of squared errors associated with the regression 
 486  
      * model, using the slope of the regression line. 
 487  
      * <p> 
 488  
      * Returns NaN if the slope is NaN.
 489  
      * 
 490  
      * @param b1 current slope
 491  
      * @return sum of squared errors associated with the regression model
 492  
      */
 493  
     private double getSumSquaredErrors(double b1) {
 494  68
         return sumYY - sumXY * sumXY / sumXX;
 495  
     }
 496  
 
 497  
     /** 
 498  
      * Computes r-square from the slope.
 499  
      * <p>
 500  
      * will return NaN if slope is Nan.
 501  
      *
 502  
      * @param b1 current slope
 503  
      * @return r-square
 504  
      */
 505  
     private double getRSquare(double b1) {
 506  20
         double ssto = getTotalSumSquares();
 507  20
         return (ssto - getSumSquaredErrors(b1)) / ssto;
 508  
     }
 509  
 
 510  
     /**
 511  
      * Computes SSR from b1.
 512  
      * 
 513  
      * @param slope regression slope estimate
 514  
      * @return sum of squared deviations of predicted y values
 515  
      */
 516  
     private double getRegressionSumSquares(double slope) {
 517  8
         return slope * slope * sumXX;
 518  
     }
 519  
 
 520  
     /**
 521  
      * Uses distribution framework to get a t distribution instance 
 522  
      * with df = n - 2
 523  
      *
 524  
      * @return t distribution with df = n - 2
 525  
      */
 526  
     private TDistribution getTDistribution() {
 527  20
         return DistributionFactory.newInstance().createTDistribution(n - 2);
 528  
     }
 529  
 }