Coverage Report

Coverage Report - org.apache.commons.math.stat.regression.SimpleRegression

Classes in this File Line Coverage Branch Coverage Complexity

SimpleRegression
100%
100%
1.542

1
/*

2
* Copyright 2003-2004 The Apache Software Foundation.

3
*

4
* Licensed under the Apache License, Version 2.0 (the "License");

5
* you may not use this file except in compliance with the License.

6
* You may obtain a copy of the License at

7
*

8
* http://www.apache.org/licenses/LICENSE-2.0

9
*

10
* Unless required by applicable law or agreed to in writing, software

11
* distributed under the License is distributed on an "AS IS" BASIS,

12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13
* See the License for the specific language governing permissions and

14
* limitations under the License.

15
*/

16

17
package org.apache.commons.math.stat.regression;

18
import java.io.Serializable;

19

20
import org.apache.commons.math.MathException;

21
import org.apache.commons.math.distribution.DistributionFactory;

22
import org.apache.commons.math.distribution.TDistribution;

23

24
/**

25
* Estimates an ordinary least squares regression model

26
* with one independent variable.

27
* 

28
* <code> y = intercept + slope * x </code>

29
* 

30
* Standard errors for <code>intercept</code> and <code>slope</code> are

31
* available as well as ANOVA, r-square and Pearson's r statistics.

32
* 

33
* Observations (x,y pairs) can be added to the model one at a time or they

34
* can be provided in a 2-dimensional array. The observations are not stored

35
* in memory, so there is no limit to the number of observations that can be

36
* added to the model.

37
* 

38
* Usage Notes: <ul>

39
* <li> When there are fewer than two observations in the model, or when

40
* there is no variation in the x values (i.e. all x values are the same)

41
* all statistics return <code>NaN</code>. At least two observations with

42
* different x coordinates are required to estimate a bivariate regression

43
* model.

44
* </li>

45
* <li> getters for the statistics always compute values based on the current

46
* set of observations -- i.e., you can get statistics, then add more data

47
* and get updated statistics without using a new instance. There is no

48
* "compute" method that updates all statistics. Each of the getters performs

49
* the necessary computations to return the requested statistic.</li>

50
* </ul>

51
*

52
* @version $Revision$ $Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) $

53
*/

54
public class SimpleRegression implements Serializable {

55

56
/** Serializable version identifier */

57
static final long serialVersionUID = -3004689053607543335L;

58

59
/** sum of x values */

60 20
private double sumX = 0d;

61

62
/** total variation in x (sum of squared deviations from xbar) */

63 20
private double sumXX = 0d;

64

65
/** sum of y values */

66 20
private double sumY = 0d;

67

68
/** total variation in y (sum of squared deviations from ybar) */

69 20
private double sumYY = 0d;

70

71
/** sum of products */

72 20
private double sumXY = 0d;

73

74
/** number of observations */

75 20
private long n = 0;

76

77
/** mean of accumulated x values, used in updating formulas */

78 20
private double xbar = 0;

79

80
/** mean of accumulated y values, used in updating formulas */

81 20
private double ybar = 0;

82

83
// ---------------------Public methods--------------------------------------

84

85
/**

86
* Create an empty SimpleRegression instance

87
*/

88
public SimpleRegression() {

89 20
super();

90 20
}

91

92
/**

93
* Adds the observation (x,y) to the regression data set.

94
* 

95
* Uses updating formulas for means and sums of squares defined in

96
* "Algorithms for Computing the Sample Variance: Analysis and

97
* Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.

98
* 1983, American Statistician, vol. 37, pp. 242-247, referenced in

99
* Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985

100
*

101
*

102
* @param x independent variable value

103
* @param y dependent variable value

104
*/

105
public void addData(double x, double y) {

106 886
if (n == 0) {

107 22
xbar = x;

108 22
ybar = y;

109
} else {

110 864
double dx = x - xbar;

111 864
double dy = y - ybar;

112 864
sumXX += dx * dx * (double) n / (double) (n + 1.0);

113 864
sumYY += dy * dy * (double) n / (double) (n + 1.0);

114 864
sumXY += dx * dy * (double) n / (double) (n + 1.0);

115 864
xbar += dx / (double) (n + 1.0);

116 864
ybar += dy / (double) (n + 1.0);

117
}

118 886
sumX += x;

119 886
sumY += y;

120 886
n++;

121 886
}

122

123
/**

124
* Adds the observations represented by the elements in

125
* <code>data</code>.

126
* 

127
* <code>(data[0][0],data[0][1])</code> will be the first observation, then

128
* <code>(data[1][0],data[1][1])</code>, etc.

129
* 

130
* This method does not replace data that has already been added. The

131
* observations represented by <code>data</code> are added to the existing

132
* dataset.

133
* 

134
* To replace all data, use <code>clear()</code> before adding the new

135
* data.

136
*

137
* @param data array of observations to be added

138
*/

139
public void addData(double[][] data) {

140 216
for (int i = 0; i < data.length; i++) {

141 204
addData(data[i][0], data[i][1]);

142
}

143 12
}

144

145
/**

146
* Clears all data from the model.

147
*/

148
public void clear() {

149 2
sumX = 0d;

150 2
sumXX = 0d;

151 2
sumY = 0d;

152 2
sumYY = 0d;

153 2
sumXY = 0d;

154 2
n = 0;

155 2
}

156

157
/**

158
* Returns the number of observations that have been added to the model.

159
*

160
* @return n number of observations that have been added.

161
*/

162
public long getN() {

163 10
return n;

164
}

165

166
/**

167
* Returns the "predicted" <code>y</code> value associated with the

168
* supplied <code>x</code> value, based on the data that has been

169
* added to the model at the time this method is invoked.

170
* 

171
* <code> predict(x) = intercept + slope * x </code>

172
* 

173
* Preconditions: <ul>

174
* <li>At least two observations (with at least two different x values)

175
* must have been added before invoking this method. If this method is

176
* invoked before a model can be estimated, <code>Double.NaN</code> is

177
* returned.

178
* </li></ul>

179
*

180
* @param x input <code>x</code> value

181
* @return predicted <code>y</code> value

182
*/

183
public double predict(double x) {

184 10
double b1 = getSlope();

185 10
return getIntercept(b1) + b1 * x;

186
}

187

188
/**

189
* Returns the intercept of the estimated regression line.

190
* 

191
* The least squares estimate of the intercept is computed using the

192
* <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.

193
* The intercept is sometimes denoted b0.

194
* 

195
* Preconditions: <ul>

196
* <li>At least two observations (with at least two different x values)

197
* must have been added before invoking this method. If this method is

198
* invoked before a model can be estimated, <code>Double.NaN</code> is

199
* returned.

200
* </li></ul>

201
*

202
* @return the intercept of the regression line

203
*/

204
public double getIntercept() {

205 8
return getIntercept(getSlope());

206
}

207

208
/**

209
* Returns the slope of the estimated regression line.

210
* 

211
* The least squares estimate of the slope is computed using the

212
* <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.

213
* The slope is sometimes denoted b1.

214
* 

215
* Preconditions: <ul>

216
* <li>At least two observations (with at least two different x values)

217
* must have been added before invoking this method. If this method is

218
* invoked before a model can be estimated, <code>Double.NaN</code> is

219
* returned.

220
* </li></ul>

221
*

222
* @return the slope of the regression line

223
*/

224
public double getSlope() {

225 118
if (n < 2) {

226 14
return Double.NaN; //not enough data

227
}

228 104
if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {

229 14
return Double.NaN; //not enough variation in x

230
}

231 90
return sumXY / sumXX;

232
}

233

234
/**

235
* Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">

236
* sum of squared errors</a> (SSE) associated with the regression

237
* model.

238
* 

239
* Preconditions: <ul>

240
* <li>At least two observations (with at least two different x values)

241
* must have been added before invoking this method. If this method is

242
* invoked before a model can be estimated, <code>Double.NaN</code> is

243
* returned.

244
* </li></ul>

245
*

246
* @return sum of squared errors associated with the regression model

247
*/

248
public double getSumSquaredErrors() {

249 48
return getSumSquaredErrors(getSlope());

250
}

251

252
/**

253
* Returns the sum of squared deviations of the y values about their mean.

254
* 

255
* This is defined as SSTO

256
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.

257
* 

258
* If <code>n < 2</code>, this returns <code>Double.NaN</code>.

259
*

260
* @return sum of squared deviations of y values

261
*/

262
public double getTotalSumSquares() {

263 26
if (n < 2) {

264 6
return Double.NaN;

265
}

266 20
return sumYY;

267
}

268

269
/**

270
* Returns the sum of squared deviations of the predicted y values about

271
* their mean (which equals the mean of y).

272
* 

273
* This is usually abbreviated SSR or SSM. It is defined as SSM

274
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>

275
* 

276
* Preconditions: <ul>

277
* <li>At least two observations (with at least two different x values)

278
* must have been added before invoking this method. If this method is

279
* invoked before a model can be estimated, <code>Double.NaN</code> is

280
* returned.

281
* </li></ul>

282
*

283
* @return sum of squared deviations of predicted y values

284
*/

285
public double getRegressionSumSquares() {

286 8
return getRegressionSumSquares(getSlope());

287
}

288

289
/**

290
* Returns the sum of squared errors divided by the degrees of freedom,

291
* usually abbreviated MSE.

292
* 

293
* If there are fewer than three data pairs in the model,

294
* or if there is no variation in <code>x</code>, this returns

295
* <code>Double.NaN</code>.

296
*

297
* @return mean square error (sum of squared errors divided by <code>n - 2</code>)

298
*/

299
public double getMeanSquareError() {

300 58
if (n < 3) {

301 18
return Double.NaN;

302
}

303 40
return getSumSquaredErrors() / (double) (n - 2);

304
}

305

306
/**

307
* Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">

308
* Pearson's product moment correlation coefficient</a>,

309
* usually denoted r.

310
* 

311
* Preconditions: <ul>

312
* <li>At least two observations (with at least two different x values)

313
* must have been added before invoking this method. If this method is

314
* invoked before a model can be estimated, <code>Double.NaN</code> is

315
* returned.

316
* </li></ul>

317
*

318
* @return Pearson's r

319
*/

320
public double getR() {

321 8
double b1 = getSlope();

322 8
double result = Math.sqrt(getRSquare(b1));

323 8
if (b1 < 0) {

324 2
result = -result;

325
}

326 8
return result;

327
}

328

329
/**

330
* Returns the <a href="http://www.xycoon.com/coefficient1.htm">

331
* coefficient of determination</a>,

332
* usually denoted r-square.

333
* 

334
* Preconditions: <ul>

335
* <li>At least two observations (with at least two different x values)

336
* must have been added before invoking this method. If this method is

337
* invoked before a model can be estimated, <code>Double.NaN</code> is

338
* returned.

339
* </li></ul>

340
*

341
* @return r-square

342
*/

343
public double getRSquare() {

344 12
return getRSquare(getSlope());

345
}

346

347
/**

348
* Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">

349
* standard error of the intercept estimate</a>,

350
* usually denoted s(b0).

351
* 

352
* If there are fewer than three observations in the

353
* model, or if there is no variation in x, this returns

354
* <code>Double.NaN</code>.

355
*

356
* @return standard error associated with intercept estimate

357
*/

358
public double getInterceptStdErr() {

359 14
return Math.sqrt(

360
getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));

361
}

362

363
/**

364
* Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard

365
* error of the slope estimate</a>,

366
* usually denoted s(b1).

367
* 

368
* If there are fewer than three data pairs in the model,

369
* or if there is no variation in x, this returns <code>Double.NaN</code>.

370
*

371
* @return standard error associated with slope estimate

372
*/

373
public double getSlopeStdErr() {

374 34
return Math.sqrt(getMeanSquareError() / sumXX);

375
}

376

377
/**

378
* Returns the half-width of a 95% confidence interval for the slope

379
* estimate.

380
* 

381
* The 95% confidence interval is

382
* 

383
* <code>(getSlope() - getSlopeConfidenceInterval(),

384
* getSlope() + getSlopeConfidenceInterval())</code>

385
* 

386
* If there are fewer than three observations in the

387
* model, or if there is no variation in x, this returns

388
* <code>Double.NaN</code>.

389
* 

390
* Usage Note: 

391
* The validity of this statistic depends on the assumption that the

392
* observations included in the model are drawn from a

393
* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">

394
* Bivariate Normal Distribution</a>.

395
*

396
* @return half-width of 95% confidence interval for the slope estimate

397
*

398
* @throws MathException if the confidence interval can not be computed.

399
*/

400
public double getSlopeConfidenceInterval() throws MathException {

401 6
return getSlopeConfidenceInterval(0.05d);

402
}

403

404
/**

405
* Returns the half-width of a (100-100*alpha)% confidence interval for

406
* the slope estimate.

407
* 

408
* The (100-100*alpha)% confidence interval is

409
* 

410
* <code>(getSlope() - getSlopeConfidenceInterval(alpha),

411
* getSlope() + getSlopeConfidenceInterval(alpha))</code>

412
* 

413
* To request, for example, a 99% confidence interval, use

414
* <code>alpha = .01</code>

415
* 

416
* Usage Note: 

417
* The validity of this statistic depends on the assumption that the

418
* observations included in the model are drawn from a

419
* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">

420
* Bivariate Normal Distribution</a>.

421
* 

422
* Preconditions:<ul>

423
* <li>If there are fewer than three observations in the

424
* model, or if there is no variation in x, this returns

425
* <code>Double.NaN</code>.

426
* </li>

427
* <li><code>(0 < alpha < 1)</code>; otherwise an

428
* <code>IllegalArgumentException</code> is thrown.

429
* </li></ul>

430
*

431
* @param alpha the desired significance level

432
* @return half-width of the (100-100*alpha)% confidence interval for the slope estimate

433
* @throws MathException if the confidence interval can not be computed.

434
*/

435
public double getSlopeConfidenceInterval(double alpha)

436
throws MathException {

437 10
if (alpha >= 1 || alpha <= 0) {

438 2
throw new IllegalArgumentException();

439
}

440 8
return getSlopeStdErr() *

441
getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);

442
}

443

444
/**

445
* Returns the significance level of the slope (equivalently, of the correlation).

446
* 

447
* Specifically, the returned value is the smallest <code>alpha</code>

448
* such that the slope confidence interval with significance level

449
* equal to <code>alpha</code> does not include <code>0</code>.

450
* On regression output, this is often denoted <code>Prob(|t| > 0)</code>

451
* 

452
* Usage Note: 

453
* The validity of this statistic depends on the assumption that the

454
* observations included in the model are drawn from a

455
* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">

456
* Bivariate Normal Distribution</a>.

457
* 

458
* If there are fewer than three observations in the

459
* model, or if there is no variation in x, this returns

460
* <code>Double.NaN</code>.

461
*

462
* @return significance level for slope/correlation

463
* @throws MathException if the significance level can not be computed.

464
*/

465
public double getSignificance() throws MathException {

466 12
return 2d* (1.0 - getTDistribution().cumulativeProbability(

467
Math.abs(getSlope()) / getSlopeStdErr()));

468
}

469

470
// ---------------------Private methods-----------------------------------

471

472
/**

473
* Returns the intercept of the estimated regression line, given the slope.

474
* 

475
* Will return <code>NaN</code> if slope is <code>NaN</code>.

476
*

477
* @param slope current slope

478
* @return the intercept of the regression line

479
*/

480
private double getIntercept(double slope) {

481 18
return (sumY - slope * sumX) / ((double) n);

482
}

483

484
/**

485
* Returns the sum of squared errors associated with the regression

486
* model, using the slope of the regression line.

487
* 

488
* Returns NaN if the slope is NaN.

489
*

490
* @param b1 current slope

491
* @return sum of squared errors associated with the regression model

492
*/

493
private double getSumSquaredErrors(double b1) {

494 68
return sumYY - sumXY * sumXY / sumXX;

495
}

496

497
/**

498
* Computes r-square from the slope.

499
* 

500
* Will return NaN if slope is NaN.

501
*

502
* @param b1 current slope

503
* @return r-square

504
*/

505
private double getRSquare(double b1) {

506 20
double ssto = getTotalSumSquares();

507 20
return (ssto - getSumSquaredErrors(b1)) / ssto;

508
}

509

510
/**

511
* Computes SSR from b1.

512
*

513
* @param slope regression slope estimate

514
* @return sum of squared deviations of predicted y values

515
*/

516
private double getRegressionSumSquares(double slope) {

517 8
return slope * slope * sumXX;

518
}

519

520
/**

521
* Uses distribution framework to get a t distribution instance

522
* with df = n - 2

523
*

524
* @return t distribution with df = n - 2

525
*/

526
private TDistribution getTDistribution() {

527 20
return DistributionFactory.newInstance().createTDistribution(n - 2);

528
}

529
}

1		/*
2		* Copyright 2003-2004 The Apache Software Foundation.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		package org.apache.commons.math.stat.regression;
18		import java.io.Serializable;
19
20		import org.apache.commons.math.MathException;
21		import org.apache.commons.math.distribution.DistributionFactory;
22		import org.apache.commons.math.distribution.TDistribution;
23
24		/**
25		* Estimates an ordinary least squares regression model
26		* with one independent variable.
27		* <p>
28		* <code> y = intercept + slope * x </code>
29		* <p>
30		* Standard errors for <code>intercept</code> and <code>slope</code> are
31		* available as well as ANOVA, r-square and Pearson's r statistics.
32		* <p>
33		* Observations (x,y pairs) can be added to the model one at a time or they
34		* can be provided in a 2-dimensional array. The observations are not stored
35		* in memory, so there is no limit to the number of observations that can be
36		* added to the model.
37		* <p>
38		* <strong>Usage Notes</strong>: <ul>
39		* <li> When there are fewer than two observations in the model, or when
40		* there is no variation in the x values (i.e. all x values are the same)
41		* all statistics return <code>NaN</code>. At least two observations with
42		* different x coordinates are required to estimate a bivariate regression
43		* model.
44		* </li>
45		* <li> getters for the statistics always compute values based on the current
46		* set of observations -- i.e., you can get statistics, then add more data
47		* and get updated statistics without using a new instance. There is no
48		* "compute" method that updates all statistics. Each of the getters performs
49		* the necessary computations to return the requested statistic.</li>
50		* </ul>
51		*
52		* @version $Revision$ $Date: 2005-02-26 05:11:52 -0800 (Sat, 26 Feb 2005) $
53		*/
54		public class SimpleRegression implements Serializable {
55
56		/** Serializable version identifier */
57		static final long serialVersionUID = -3004689053607543335L;
58
59		/** sum of x values */
60	20	private double sumX = 0d;
61
62		/** total variation in x (sum of squared deviations from xbar) */
63	20	private double sumXX = 0d;
64
65		/** sum of y values */
66	20	private double sumY = 0d;
67
68		/** total variation in y (sum of squared deviations from ybar) */
69	20	private double sumYY = 0d;
70
71		/** sum of products */
72	20	private double sumXY = 0d;
73
74		/** number of observations */
75	20	private long n = 0;
76
77		/** mean of accumulated x values, used in updating formulas */
78	20	private double xbar = 0;
79
80		/** mean of accumulated y values, used in updating formulas */
81	20	private double ybar = 0;
82
83		// ---------------------Public methods--------------------------------------
84
85		/**
86		* Create an empty SimpleRegression instance
87		*/
88		public SimpleRegression() {
89	20	super();
90	20	}
91
92		/**
93		* Adds the observation (x,y) to the regression data set.
94		* <p>
95		* Uses updating formulas for means and sums of squares defined in
96		* "Algorithms for Computing the Sample Variance: Analysis and
97		* Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
98		* 1983, American Statistician, vol. 37, pp. 242-247, referenced in
99		* Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
100		*
101		*
102		* @param x independent variable value
103		* @param y dependent variable value
104		*/
105		public void addData(double x, double y) {
106	886	if (n == 0) {
107	22	xbar = x;
108	22	ybar = y;
109		} else {
110	864	double dx = x - xbar;
111	864	double dy = y - ybar;
112	864	sumXX += dx * dx * (double) n / (double) (n + 1.0);
113	864	sumYY += dy * dy * (double) n / (double) (n + 1.0);
114	864	sumXY += dx * dy * (double) n / (double) (n + 1.0);
115	864	xbar += dx / (double) (n + 1.0);
116	864	ybar += dy / (double) (n + 1.0);
117		}
118	886	sumX += x;
119	886	sumY += y;
120	886	n++;
121	886	}
122
123		/**
124		* Adds the observations represented by the elements in
125		* <code>data</code>.
126		* <p>
127		* <code>(data[0][0],data[0][1])</code> will be the first observation, then
128		* <code>(data[1][0],data[1][1])</code>, etc.
129		* <p>
130		* This method does not replace data that has already been added. The
131		* observations represented by <code>data</code> are added to the existing
132		* dataset.
133		* <p>
134		* To replace all data, use <code>clear()</code> before adding the new
135		* data.
136		*
137		* @param data array of observations to be added
138		*/
139		public void addData(double[][] data) {
140	216	for (int i = 0; i < data.length; i++) {
141	204	addData(data[i][0], data[i][1]);
142		}
143	12	}
144
145		/**
146		* Clears all data from the model.
147		*/
148		public void clear() {
149	2	sumX = 0d;
150	2	sumXX = 0d;
151	2	sumY = 0d;
152	2	sumYY = 0d;
153	2	sumXY = 0d;
154	2	n = 0;
155	2	}
156
157		/**
158		* Returns the number of observations that have been added to the model.
159		*
160		* @return n number of observations that have been added.
161		*/
162		public long getN() {
163	10	return n;
164		}
165
166		/**
167		* Returns the "predicted" <code>y</code> value associated with the
168		* supplied <code>x</code> value, based on the data that has been
169		* added to the model at the time this method is invoked.
170		* <p>
171		* <code> predict(x) = intercept + slope * x </code>
172		* <p>
173		* <strong>Preconditions</strong>: <ul>
174		* <li>At least two observations (with at least two different x values)
175		* must have been added before invoking this method. If this method is
176		* invoked before a model can be estimated, <code>Double.NaN</code> is
177		* returned.
178		* </li></ul>
179		*
180		* @param x input <code>x</code> value
181		* @return predicted <code>y</code> value
182		*/
183		public double predict(double x) {
184	10	double b1 = getSlope();
185	10	return getIntercept(b1) + b1 * x;
186		}
187
188		/**
189		* Returns the intercept of the estimated regression line.
190		* <p>
191		* The least squares estimate of the intercept is computed using the
192		* <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
193		* The intercept is sometimes denoted b0.
194		* <p>
195		* <strong>Preconditions</strong>: <ul>
196		* <li>At least two observations (with at least two different x values)
197		* must have been added before invoking this method. If this method is
198		* invoked before a model can be estimated, <code>Double.NaN</code> is
199		* returned.
200		* </li></ul>
201		*
202		* @return the intercept of the regression line
203		*/
204		public double getIntercept() {
205	8	return getIntercept(getSlope());
206		}
207
208		/**
209		* Returns the slope of the estimated regression line.
210		* <p>
211		* The least squares estimate of the slope is computed using the
212		* <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
213		* The slope is sometimes denoted b1.
214		* <p>
215		* <strong>Preconditions</strong>: <ul>
216		* <li>At least two observations (with at least two different x values)
217		* must have been added before invoking this method. If this method is
218		* invoked before a model can be estimated, <code>Double.NaN</code> is
219		* returned.
220		* </li></ul>
221		*
222		* @return the slope of the regression line
223		*/
224		public double getSlope() {
225	118	if (n < 2) {
226	14	return Double.NaN; //not enough data
227		}
228	104	if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
229	14	return Double.NaN; //not enough variation in x
230		}
231	90	return sumXY / sumXX;
232		}
233
234		/**
235		* Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
236		* sum of squared errors</a> (SSE) associated with the regression
237		* model.
238		* <p>
239		* <strong>Preconditions</strong>: <ul>
240		* <li>At least two observations (with at least two different x values)
241		* must have been added before invoking this method. If this method is
242		* invoked before a model can be estimated, <code>Double.NaN</code> is
243		* returned.
244		* </li></ul>
245		*
246		* @return sum of squared errors associated with the regression model
247		*/
248		public double getSumSquaredErrors() {
249	48	return getSumSquaredErrors(getSlope());
250		}
251
252		/**
253		* Returns the sum of squared deviations of the y values about their mean.
254		* <p>
255		* This is defined as SSTO
256		* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
257		* <p>
258		* If <code>n < 2</code>, this returns <code>Double.NaN</code>.
259		*
260		* @return sum of squared deviations of y values
261		*/
262		public double getTotalSumSquares() {
263	26	if (n < 2) {
264	6	return Double.NaN;
265		}
266	20	return sumYY;
267		}
268
269		/**
270		* Returns the sum of squared deviations of the predicted y values about
271		* their mean (which equals the mean of y).
272		* <p>
273		* This is usually abbreviated SSR or SSM. It is defined as SSM
274		* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
275		* <p>
276		* <strong>Preconditions</strong>: <ul>
277		* <li>At least two observations (with at least two different x values)
278		* must have been added before invoking this method. If this method is
279		* invoked before a model can be estimated, <code>Double.NaN</code> is
280		* returned.
281		* </li></ul>
282		*
283		* @return sum of squared deviations of predicted y values
284		*/
285		public double getRegressionSumSquares() {
286	8	return getRegressionSumSquares(getSlope());
287		}
288
289		/**
290		* Returns the sum of squared errors divided by the degrees of freedom,
291		* usually abbreviated MSE.
292		* <p>
293		* If there are fewer than <strong>three</strong> data pairs in the model,
294		* or if there is no variation in <code>x</code>, this returns
295		* <code>Double.NaN</code>.
296		*
297		* @return mean square error (sum of squared errors divided by <code>n - 2</code>)
298		*/
299		public double getMeanSquareError() {
300	58	if (n < 3) {
301	18	return Double.NaN;
302		}
303	40	return getSumSquaredErrors() / (double) (n - 2);
304		}
305
306		/**
307		* Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
308		* Pearson's product moment correlation coefficient</a>,
309		* usually denoted r.
310		* <p>
311		* <strong>Preconditions</strong>: <ul>
312		* <li>At least two observations (with at least two different x values)
313		* must have been added before invoking this method. If this method is
314		* invoked before a model can be estimated, <code>Double.NaN</code> is
315		* returned.
316		* </li></ul>
317		*
318		* @return Pearson's r
319		*/
320		public double getR() {
321	8	double b1 = getSlope();
322	8	double result = Math.sqrt(getRSquare(b1));
323	8	if (b1 < 0) {
324	2	result = -result;
325		}
326	8	return result;
327		}
328
329		/**
330		* Returns the <a href="http://www.xycoon.com/coefficient1.htm">
331		* coefficient of determination</a>,
332		* usually denoted r-square.
333		* <p>
334		* <strong>Preconditions</strong>: <ul>
335		* <li>At least two observations (with at least two different x values)
336		* must have been added before invoking this method. If this method is
337		* invoked before a model can be estimated, <code>Double.NaN</code> is
338		* returned.
339		* </li></ul>
340		*
341		* @return r-square
342		*/
343		public double getRSquare() {
344	12	return getRSquare(getSlope());
345		}
346
347		/**
348		* Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
349		* standard error of the intercept estimate</a>,
350		* usually denoted s(b0).
351		* <p>
352		* If there are fewer than <strong>three</strong> observations in the
353		* model, or if there is no variation in x, this returns
354		* <code>Double.NaN</code>.
355		*
356		* @return standard error associated with intercept estimate
357		*/
358		public double getInterceptStdErr() {
359	14	return Math.sqrt(
360		getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
361		}
362
363		/**
364		* Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
365		* error of the slope estimate</a>,
366		* usually denoted s(b1).
367		* <p>
368		* If there are fewer than <strong>three</strong> data pairs in the model,
369		* or if there is no variation in x, this returns <code>Double.NaN</code>.
370		*
371		* @return standard error associated with slope estimate
372		*/
373		public double getSlopeStdErr() {
374	34	return Math.sqrt(getMeanSquareError() / sumXX);
375		}
376
377		/**
378		* Returns the half-width of a 95% confidence interval for the slope
379		* estimate.
380		* <p>
381		* The 95% confidence interval is
382		* <p>
383		* <code>(getSlope() - getSlopeConfidenceInterval(),
384		* getSlope() + getSlopeConfidenceInterval())</code>
385		* <p>
386		* If there are fewer than <strong>three</strong> observations in the
387		* model, or if there is no variation in x, this returns
388		* <code>Double.NaN</code>.
389		* <p>
390		* <strong>Usage Note</strong>:<br>
391		* The validity of this statistic depends on the assumption that the
392		* observations included in the model are drawn from a
393		* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
394		* Bivariate Normal Distribution</a>.
395		*
396		* @return half-width of 95% confidence interval for the slope estimate
397		*
398		* @throws MathException if the confidence interval can not be computed.
399		*/
400		public double getSlopeConfidenceInterval() throws MathException {
401	6	return getSlopeConfidenceInterval(0.05d);
402		}
403
404		/**
405		* Returns the half-width of a (100-100*alpha)% confidence interval for
406		* the slope estimate.
407		* <p>
408		* The (100-100*alpha)% confidence interval is
409		* <p>
410		* <code>(getSlope() - getSlopeConfidenceInterval(),
411		* getSlope() + getSlopeConfidenceInterval())</code>
412		* <p>
413		* To request, for example, a 99% confidence interval, use
414		* <code>alpha = .01</code>
415		* <p>
416		* <strong>Usage Note</strong>:<br>
417		* The validity of this statistic depends on the assumption that the
418		* observations included in the model are drawn from a
419		* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
420		* Bivariate Normal Distribution</a>.
421		* <p>
422		* <strong> Preconditions:</strong><ul>
423		* <li>If there are fewer that <strong>three</strong> observations in the
424		* model, or if there is no variation in x, this returns
425		* <code>Double.NaN</code>.
426		* </li>
427		* <li><code>(0 < alpha < 1)</code>; otherwise an
428		* <code>IllegalArgumentException</code> is thrown.
429		* </li></ul>
430		*
431		* @param alpha the desired significance level
432		* @return half-width of 95% confidence interval for the slope estimate
433		* @throws MathException if the confidence interval can not be computed.
434		*/
435		public double getSlopeConfidenceInterval(double alpha)
436		throws MathException {
437	10	if (alpha >= 1 \|\| alpha <= 0) {
438	2	throw new IllegalArgumentException();
439		}
440	8	return getSlopeStdErr() *
441		getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);
442		}
443
444		/**
445		* Returns the significance level of the slope (equiv) correlation.
446		* <p>
447		* Specifically, the returned value is the smallest <code>alpha</code>
448		* such that the slope confidence interval with significance level
449		* equal to <code>alpha</code> does not include <code>0</code>.
450		* On regression output, this is often denoted <code>Prob(\|t\| > 0)</code>
451		* <p>
452		* <strong>Usage Note</strong>:<br>
453		* The validity of this statistic depends on the assumption that the
454		* observations included in the model are drawn from a
455		* <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
456		* Bivariate Normal Distribution</a>.
457		* <p>
458		* If there are fewer that <strong>three</strong> observations in the
459		* model, or if there is no variation in x, this returns
460		* <code>Double.NaN</code>.
461		*
462		* @return significance level for slope/correlation
463		* @throws MathException if the significance level can not be computed.
464		*/
465		public double getSignificance() throws MathException {
466	12	return 2d* (1.0 - getTDistribution().cumulativeProbability(
467		Math.abs(getSlope()) / getSlopeStdErr()));
468		}
469
470		// ---------------------Private methods-----------------------------------
471
472		/**
473		* Returns the intercept of the estimated regression line, given the slope.
474		* <p>
475		* Will return <code>NaN</code> if slope is <code>NaN</code>.
476		*
477		* @param slope current slope
478		* @return the intercept of the regression line
479		*/
480		private double getIntercept(double slope) {
481	18	return (sumY - slope * sumX) / ((double) n);
482		}
483
484		/**
485		* Returns the sum of squared errors associated with the regression
486		* model, using the slope of the regression line.
487		* <p>
488		* Returns NaN if the slope is NaN.
489		*
490		* @param b1 current slope
491		* @return sum of squared errors associated with the regression model
492		*/
493		private double getSumSquaredErrors(double b1) {
494	68	return sumYY - sumXY * sumXY / sumXX;
495		}
496
497		/**
498		* Computes r-square from the slope.
499		* <p>
500		* will return NaN if slope is Nan.
501		*
502		* @param b1 current slope
503		* @return r-square
504		*/
505		private double getRSquare(double b1) {
506	20	double ssto = getTotalSumSquares();
507	20	return (ssto - getSumSquaredErrors(b1)) / ssto;
508		}
509
510		/**
511		* Computes SSR from b1.
512		*
513		* @param slope regression slope estimate
514		* @return sum of squared deviations of predicted y values
515		*/
516		private double getRegressionSumSquares(double slope) {
517	8	return slope * slope * sumXX;
518		}
519
520		/**
521		* Uses distribution framework to get a t distribution instance
522		* with df = n - 2
523		*
524		* @return t distribution with df = n - 2
525		*/
526		private TDistribution getTDistribution() {
527	20	return DistributionFactory.newInstance().createTDistribution(n - 2);
528		}
529		}