Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||||||
SimpleRegression |
|
| 1.5416666666666667;1.542 |
1 | /* |
|
2 | * Copyright 2003-2004 The Apache Software Foundation. |
|
3 | * |
|
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
|
5 | * you may not use this file except in compliance with the License. |
|
6 | * You may obtain a copy of the License at |
|
7 | * |
|
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 | * |
|
10 | * Unless required by applicable law or agreed to in writing, software |
|
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
|
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
13 | * See the License for the specific language governing permissions and |
|
14 | * limitations under the License. |
|
15 | */ |
|
16 | ||
17 | package org.apache.commons.math.stat.regression; |
|
18 | import java.io.Serializable; |
|
19 | ||
20 | import org.apache.commons.math.MathException; |
|
21 | import org.apache.commons.math.distribution.DistributionFactory; |
|
22 | import org.apache.commons.math.distribution.TDistribution; |
|
23 | ||
24 | /** |
|
25 | * Estimates an ordinary least squares regression model |
|
26 | * with one independent variable. |
|
27 | * <p> |
|
28 | * <code> y = intercept + slope * x </code> |
|
29 | * <p> |
|
30 | * Standard errors for <code>intercept</code> and <code>slope</code> are |
|
31 | * available as well as ANOVA, r-square and Pearson's r statistics. |
|
32 | * <p> |
|
33 | * Observations (x,y pairs) can be added to the model one at a time or they |
|
34 | * can be provided in a 2-dimensional array. The observations are not stored |
|
35 | * in memory, so there is no limit to the number of observations that can be |
|
36 | * added to the model. |
|
37 | * <p> |
|
38 | * <strong>Usage Notes</strong>: <ul> |
|
39 | * <li> When there are fewer than two observations in the model, or when |
|
40 | * there is no variation in the x values (i.e. all x values are the same) |
|
41 | * all statistics return <code>NaN</code>. At least two observations with |
|
42 | * different x coordinates are requred to estimate a bivariate regression |
|
43 | * model. |
|
44 | * </li> |
|
45 | * <li> getters for the statistics always compute values based on the current |
|
46 | * set of observations -- i.e., you can get statistics, then add more data |
|
47 | * and get updated statistics without using a new instance. There is no |
|
48 | * "compute" method that updates all statistics. Each of the getters performs |
|
49 | * the necessary computations to return the requested statistic.</li> |
|
50 | * </ul> |
|
51 | * |
|
52 | * @version $Revision$ $Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) $ |
|
53 | */ |
|
54 | public class SimpleRegression implements Serializable { |
|
55 | ||
56 | /** Serializable version identifier */ |
|
57 | static final long serialVersionUID = -3004689053607543335L; |
|
58 | ||
59 | /** sum of x values */ |
|
60 | 20 | private double sumX = 0d; |
61 | ||
62 | /** total variation in x (sum of squared deviations from xbar) */ |
|
63 | 20 | private double sumXX = 0d; |
64 | ||
65 | /** sum of y values */ |
|
66 | 20 | private double sumY = 0d; |
67 | ||
68 | /** total variation in y (sum of squared deviations from ybar) */ |
|
69 | 20 | private double sumYY = 0d; |
70 | ||
71 | /** sum of products */ |
|
72 | 20 | private double sumXY = 0d; |
73 | ||
74 | /** number of observations */ |
|
75 | 20 | private long n = 0; |
76 | ||
77 | /** mean of accumulated x values, used in updating formulas */ |
|
78 | 20 | private double xbar = 0; |
79 | ||
80 | /** mean of accumulated y values, used in updating formulas */ |
|
81 | 20 | private double ybar = 0; |
82 | ||
83 | // ---------------------Public methods-------------------------------------- |
|
84 | ||
85 | /** |
|
86 | * Create an empty SimpleRegression instance |
|
87 | */ |
|
88 | public SimpleRegression() { |
|
89 | 20 | super(); |
90 | 20 | } |
91 | ||
92 | /** |
|
93 | * Adds the observation (x,y) to the regression data set. |
|
94 | * <p> |
|
95 | * Uses updating formulas for means and sums of squares defined in |
|
96 | * "Algorithms for Computing the Sample Variance: Analysis and |
|
97 | * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. |
|
98 | * 1983, American Statistician, vol. 37, pp. 242-247, referenced in |
|
99 | * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985 |
|
100 | * |
|
101 | * |
|
102 | * @param x independent variable value |
|
103 | * @param y dependent variable value |
|
104 | */ |
|
105 | public void addData(double x, double y) { |
|
106 | 886 | if (n == 0) { |
107 | 22 | xbar = x; |
108 | 22 | ybar = y; |
109 | } else { |
|
110 | 864 | double dx = x - xbar; |
111 | 864 | double dy = y - ybar; |
112 | 864 | sumXX += dx * dx * (double) n / (double) (n + 1.0); |
113 | 864 | sumYY += dy * dy * (double) n / (double) (n + 1.0); |
114 | 864 | sumXY += dx * dy * (double) n / (double) (n + 1.0); |
115 | 864 | xbar += dx / (double) (n + 1.0); |
116 | 864 | ybar += dy / (double) (n + 1.0); |
117 | } |
|
118 | 886 | sumX += x; |
119 | 886 | sumY += y; |
120 | 886 | n++; |
121 | 886 | } |
122 | ||
123 | /** |
|
124 | * Adds the observations represented by the elements in |
|
125 | * <code>data</code>. |
|
126 | * <p> |
|
127 | * <code>(data[0][0],data[0][1])</code> will be the first observation, then |
|
128 | * <code>(data[1][0],data[1][1])</code>, etc. |
|
129 | * <p> |
|
130 | * This method does not replace data that has already been added. The |
|
131 | * observations represented by <code>data</code> are added to the existing |
|
132 | * dataset. |
|
133 | * <p> |
|
134 | * To replace all data, use <code>clear()</code> before adding the new |
|
135 | * data. |
|
136 | * |
|
137 | * @param data array of observations to be added |
|
138 | */ |
|
139 | public void addData(double[][] data) { |
|
140 | 216 | for (int i = 0; i < data.length; i++) { |
141 | 204 | addData(data[i][0], data[i][1]); |
142 | } |
|
143 | 12 | } |
144 | ||
145 | /** |
|
146 | * Clears all data from the model. |
|
147 | */ |
|
148 | public void clear() { |
|
149 | 2 | sumX = 0d; |
150 | 2 | sumXX = 0d; |
151 | 2 | sumY = 0d; |
152 | 2 | sumYY = 0d; |
153 | 2 | sumXY = 0d; |
154 | 2 | n = 0; |
155 | 2 | } |
156 | ||
157 | /** |
|
158 | * Returns the number of observations that have been added to the model. |
|
159 | * |
|
160 | * @return n number of observations that have been added. |
|
161 | */ |
|
162 | public long getN() { |
|
163 | 10 | return n; |
164 | } |
|
165 | ||
166 | /** |
|
167 | * Returns the "predicted" <code>y</code> value associated with the |
|
168 | * supplied <code>x</code> value, based on the data that has been |
|
169 | * added to the model when this method is activated. |
|
170 | * <p> |
|
171 | * <code> predict(x) = intercept + slope * x </code> |
|
172 | * <p> |
|
173 | * <strong>Preconditions</strong>: <ul> |
|
174 | * <li>At least two observations (with at least two different x values) |
|
175 | * must have been added before invoking this method. If this method is |
|
176 | * invoked before a model can be estimated, <code>Double,NaN</code> is |
|
177 | * returned. |
|
178 | * </li></ul> |
|
179 | * |
|
180 | * @param x input <code>x</code> value |
|
181 | * @return predicted <code>y</code> value |
|
182 | */ |
|
183 | public double predict(double x) { |
|
184 | 10 | double b1 = getSlope(); |
185 | 10 | return getIntercept(b1) + b1 * x; |
186 | } |
|
187 | ||
188 | /** |
|
189 | * Returns the intercept of the estimated regression line. |
|
190 | * <p> |
|
191 | * The least squares estimate of the intercept is computed using the |
|
192 | * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. |
|
193 | * The intercept is sometimes denoted b0. |
|
194 | * <p> |
|
195 | * <strong>Preconditions</strong>: <ul> |
|
196 | * <li>At least two observations (with at least two different x values) |
|
197 | * must have been added before invoking this method. If this method is |
|
198 | * invoked before a model can be estimated, <code>Double,NaN</code> is |
|
199 | * returned. |
|
200 | * </li></ul> |
|
201 | * |
|
202 | * @return the intercept of the regression line |
|
203 | */ |
|
204 | public double getIntercept() { |
|
205 | 8 | return getIntercept(getSlope()); |
206 | } |
|
207 | ||
208 | /** |
|
209 | * Returns the slope of the estimated regression line. |
|
210 | * <p> |
|
211 | * The least squares estimate of the slope is computed using the |
|
212 | * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. |
|
213 | * The slope is sometimes denoted b1. |
|
214 | * <p> |
|
215 | * <strong>Preconditions</strong>: <ul> |
|
216 | * <li>At least two observations (with at least two different x values) |
|
217 | * must have been added before invoking this method. If this method is |
|
218 | * invoked before a model can be estimated, <code>Double.NaN</code> is |
|
219 | * returned. |
|
220 | * </li></ul> |
|
221 | * |
|
222 | * @return the slope of the regression line |
|
223 | */ |
|
224 | public double getSlope() { |
|
225 | 118 | if (n < 2) { |
226 | 14 | return Double.NaN; //not enough data |
227 | } |
|
228 | 104 | if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) { |
229 | 14 | return Double.NaN; //not enough variation in x |
230 | } |
|
231 | 90 | return sumXY / sumXX; |
232 | } |
|
233 | ||
234 | /** |
|
235 | * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> |
|
236 | * sum of squared errors</a> (SSE) associated with the regression |
|
237 | * model. |
|
238 | * <p> |
|
239 | * <strong>Preconditions</strong>: <ul> |
|
240 | * <li>At least two observations (with at least two different x values) |
|
241 | * must have been added before invoking this method. If this method is |
|
242 | * invoked before a model can be estimated, <code>Double,NaN</code> is |
|
243 | * returned. |
|
244 | * </li></ul> |
|
245 | * |
|
246 | * @return sum of squared errors associated with the regression model |
|
247 | */ |
|
248 | public double getSumSquaredErrors() { |
|
249 | 48 | return getSumSquaredErrors(getSlope()); |
250 | } |
|
251 | ||
252 | /** |
|
253 | * Returns the sum of squared deviations of the y values about their mean. |
|
254 | * <p> |
|
255 | * This is defined as SSTO |
|
256 | * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>. |
|
257 | * <p> |
|
258 | * If <code>n < 2</code>, this returns <code>Double.NaN</code>. |
|
259 | * |
|
260 | * @return sum of squared deviations of y values |
|
261 | */ |
|
262 | public double getTotalSumSquares() { |
|
263 | 26 | if (n < 2) { |
264 | 6 | return Double.NaN; |
265 | } |
|
266 | 20 | return sumYY; |
267 | } |
|
268 | ||
269 | /** |
|
270 | * Returns the sum of squared deviations of the predicted y values about |
|
271 | * their mean (which equals the mean of y). |
|
272 | * <p> |
|
273 | * This is usually abbreviated SSR or SSM. It is defined as SSM |
|
274 | * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a> |
|
275 | * <p> |
|
276 | * <strong>Preconditions</strong>: <ul> |
|
277 | * <li>At least two observations (with at least two different x values) |
|
278 | * must have been added before invoking this method. If this method is |
|
279 | * invoked before a model can be estimated, <code>Double.NaN</code> is |
|
280 | * returned. |
|
281 | * </li></ul> |
|
282 | * |
|
283 | * @return sum of squared deviations of predicted y values |
|
284 | */ |
|
285 | public double getRegressionSumSquares() { |
|
286 | 8 | return getRegressionSumSquares(getSlope()); |
287 | } |
|
288 | ||
289 | /** |
|
290 | * Returns the sum of squared errors divided by the degrees of freedom, |
|
291 | * usually abbreviated MSE. |
|
292 | * <p> |
|
293 | * If there are fewer than <strong>three</strong> data pairs in the model, |
|
294 | * or if there is no variation in <code>x</code>, this returns |
|
295 | * <code>Double.NaN</code>. |
|
296 | * |
|
297 | * @return sum of squared deviations of y values |
|
298 | */ |
|
299 | public double getMeanSquareError() { |
|
300 | 58 | if (n < 3) { |
301 | 18 | return Double.NaN; |
302 | } |
|
303 | 40 | return getSumSquaredErrors() / (double) (n - 2); |
304 | } |
|
305 | ||
306 | /** |
|
307 | * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> |
|
308 | * Pearson's product moment correlation coefficient</a>, |
|
309 | * usually denoted r. |
|
310 | * <p> |
|
311 | * <strong>Preconditions</strong>: <ul> |
|
312 | * <li>At least two observations (with at least two different x values) |
|
313 | * must have been added before invoking this method. If this method is |
|
314 | * invoked before a model can be estimated, <code>Double,NaN</code> is |
|
315 | * returned. |
|
316 | * </li></ul> |
|
317 | * |
|
318 | * @return Pearson's r |
|
319 | */ |
|
320 | public double getR() { |
|
321 | 8 | double b1 = getSlope(); |
322 | 8 | double result = Math.sqrt(getRSquare(b1)); |
323 | 8 | if (b1 < 0) { |
324 | 2 | result = -result; |
325 | } |
|
326 | 8 | return result; |
327 | } |
|
328 | ||
329 | /** |
|
330 | * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> |
|
331 | * coefficient of determination</a>, |
|
332 | * usually denoted r-square. |
|
333 | * <p> |
|
334 | * <strong>Preconditions</strong>: <ul> |
|
335 | * <li>At least two observations (with at least two different x values) |
|
336 | * must have been added before invoking this method. If this method is |
|
337 | * invoked before a model can be estimated, <code>Double,NaN</code> is |
|
338 | * returned. |
|
339 | * </li></ul> |
|
340 | * |
|
341 | * @return r-square |
|
342 | */ |
|
343 | public double getRSquare() { |
|
344 | 12 | return getRSquare(getSlope()); |
345 | } |
|
346 | ||
347 | /** |
|
348 | * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> |
|
349 | * standard error of the intercept estimate</a>, |
|
350 | * usually denoted s(b0). |
|
351 | * <p> |
|
352 | * If there are fewer that <strong>three</strong> observations in the |
|
353 | * model, or if there is no variation in x, this returns |
|
354 | * <code>Double.NaN</code>. |
|
355 | * |
|
356 | * @return standard error associated with intercept estimate |
|
357 | */ |
|
358 | public double getInterceptStdErr() { |
|
359 | 14 | return Math.sqrt( |
360 | getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); |
|
361 | } |
|
362 | ||
363 | /** |
|
364 | * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard |
|
365 | * error of the slope estimate</a>, |
|
366 | * usually denoted s(b1). |
|
367 | * <p> |
|
368 | * If there are fewer that <strong>three</strong> data pairs in the model, |
|
369 | * or if there is no variation in x, this returns <code>Double.NaN</code>. |
|
370 | * |
|
371 | * @return standard error associated with slope estimate |
|
372 | */ |
|
373 | public double getSlopeStdErr() { |
|
374 | 34 | return Math.sqrt(getMeanSquareError() / sumXX); |
375 | } |
|
376 | ||
377 | /** |
|
378 | * Returns the half-width of a 95% confidence interval for the slope |
|
379 | * estimate. |
|
380 | * <p> |
|
381 | * The 95% confidence interval is |
|
382 | * <p> |
|
383 | * <code>(getSlope() - getSlopeConfidenceInterval(), |
|
384 | * getSlope() + getSlopeConfidenceInterval())</code> |
|
385 | * <p> |
|
386 | * If there are fewer that <strong>three</strong> observations in the |
|
387 | * model, or if there is no variation in x, this returns |
|
388 | * <code>Double.NaN</code>. |
|
389 | * <p> |
|
390 | * <strong>Usage Note</strong>:<br> |
|
391 | * The validity of this statistic depends on the assumption that the |
|
392 | * observations included in the model are drawn from a |
|
393 | * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> |
|
394 | * Bivariate Normal Distribution</a>. |
|
395 | * |
|
396 | * @return half-width of 95% confidence interval for the slope estimate |
|
397 | * |
|
398 | * @throws MathException if the confidence interval can not be computed. |
|
399 | */ |
|
400 | public double getSlopeConfidenceInterval() throws MathException { |
|
401 | 6 | return getSlopeConfidenceInterval(0.05d); |
402 | } |
|
403 | ||
404 | /** |
|
405 | * Returns the half-width of a (100-100*alpha)% confidence interval for |
|
406 | * the slope estimate. |
|
407 | * <p> |
|
408 | * The (100-100*alpha)% confidence interval is |
|
409 | * <p> |
|
410 | * <code>(getSlope() - getSlopeConfidenceInterval(), |
|
411 | * getSlope() + getSlopeConfidenceInterval())</code> |
|
412 | * <p> |
|
413 | * To request, for example, a 99% confidence interval, use |
|
414 | * <code>alpha = .01</code> |
|
415 | * <p> |
|
416 | * <strong>Usage Note</strong>:<br> |
|
417 | * The validity of this statistic depends on the assumption that the |
|
418 | * observations included in the model are drawn from a |
|
419 | * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> |
|
420 | * Bivariate Normal Distribution</a>. |
|
421 | * <p> |
|
422 | * <strong> Preconditions:</strong><ul> |
|
423 | * <li>If there are fewer that <strong>three</strong> observations in the |
|
424 | * model, or if there is no variation in x, this returns |
|
425 | * <code>Double.NaN</code>. |
|
426 | * </li> |
|
427 | * <li><code>(0 < alpha < 1)</code>; otherwise an |
|
428 | * <code>IllegalArgumentException</code> is thrown. |
|
429 | * </li></ul> |
|
430 | * |
|
431 | * @param alpha the desired significance level |
|
432 | * @return half-width of 95% confidence interval for the slope estimate |
|
433 | * @throws MathException if the confidence interval can not be computed. |
|
434 | */ |
|
435 | public double getSlopeConfidenceInterval(double alpha) |
|
436 | throws MathException { |
|
437 | 10 | if (alpha >= 1 || alpha <= 0) { |
438 | 2 | throw new IllegalArgumentException(); |
439 | } |
|
440 | 8 | return getSlopeStdErr() * |
441 | getTDistribution().inverseCumulativeProbability(1d - alpha / 2d); |
|
442 | } |
|
443 | ||
444 | /** |
|
445 | * Returns the significance level of the slope (equiv) correlation. |
|
446 | * <p> |
|
447 | * Specifically, the returned value is the smallest <code>alpha</code> |
|
448 | * such that the slope confidence interval with significance level |
|
449 | * equal to <code>alpha</code> does not include <code>0</code>. |
|
450 | * On regression output, this is often denoted <code>Prob(|t| > 0)</code> |
|
451 | * <p> |
|
452 | * <strong>Usage Note</strong>:<br> |
|
453 | * The validity of this statistic depends on the assumption that the |
|
454 | * observations included in the model are drawn from a |
|
455 | * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> |
|
456 | * Bivariate Normal Distribution</a>. |
|
457 | * <p> |
|
458 | * If there are fewer that <strong>three</strong> observations in the |
|
459 | * model, or if there is no variation in x, this returns |
|
460 | * <code>Double.NaN</code>. |
|
461 | * |
|
462 | * @return significance level for slope/correlation |
|
463 | * @throws MathException if the significance level can not be computed. |
|
464 | */ |
|
465 | public double getSignificance() throws MathException { |
|
466 | 12 | return 2d* (1.0 - getTDistribution().cumulativeProbability( |
467 | Math.abs(getSlope()) / getSlopeStdErr())); |
|
468 | } |
|
469 | ||
470 | // ---------------------Private methods----------------------------------- |
|
471 | ||
472 | /** |
|
473 | * Returns the intercept of the estimated regression line, given the slope. |
|
474 | * <p> |
|
475 | * Will return <code>NaN</code> if slope is <code>NaN</code>. |
|
476 | * |
|
477 | * @param slope current slope |
|
478 | * @return the intercept of the regression line |
|
479 | */ |
|
480 | private double getIntercept(double slope) { |
|
481 | 18 | return (sumY - slope * sumX) / ((double) n); |
482 | } |
|
483 | ||
484 | /** |
|
485 | * Returns the sum of squared errors associated with the regression |
|
486 | * model, using the slope of the regression line. |
|
487 | * <p> |
|
488 | * Returns NaN if the slope is NaN. |
|
489 | * |
|
490 | * @param b1 current slope |
|
491 | * @return sum of squared errors associated with the regression model |
|
492 | */ |
|
493 | private double getSumSquaredErrors(double b1) { |
|
494 | 68 | return sumYY - sumXY * sumXY / sumXX; |
495 | } |
|
496 | ||
497 | /** |
|
498 | * Computes r-square from the slope. |
|
499 | * <p> |
|
500 | * will return NaN if slope is Nan. |
|
501 | * |
|
502 | * @param b1 current slope |
|
503 | * @return r-square |
|
504 | */ |
|
505 | private double getRSquare(double b1) { |
|
506 | 20 | double ssto = getTotalSumSquares(); |
507 | 20 | return (ssto - getSumSquaredErrors(b1)) / ssto; |
508 | } |
|
509 | ||
510 | /** |
|
511 | * Computes SSR from b1. |
|
512 | * |
|
513 | * @param slope regression slope estimate |
|
514 | * @return sum of squared deviations of predicted y values |
|
515 | */ |
|
516 | private double getRegressionSumSquares(double slope) { |
|
517 | 8 | return slope * slope * sumXX; |
518 | } |
|
519 | ||
520 | /** |
|
521 | * Uses distribution framework to get a t distribution instance |
|
522 | * with df = n - 2 |
|
523 | * |
|
524 | * @return t distribution with df = n - 2 |
|
525 | */ |
|
526 | private TDistribution getTDistribution() { |
|
527 | 20 | return DistributionFactory.newInstance().createTDistribution(n - 2); |
528 | } |
|
529 | } |