Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||||||
EmpiricalDistributionImpl |
|
| 2.6363636363636362;2.636 |
1 | /* |
|
2 | * Copyright 2003-2004 The Apache Software Foundation. |
|
3 | * |
|
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
|
5 | * you may not use this file except in compliance with the License. |
|
6 | * You may obtain a copy of the License at |
|
7 | * |
|
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 | * |
|
10 | * Unless required by applicable law or agreed to in writing, software |
|
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
|
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
13 | * See the License for the specific language governing permissions and |
|
14 | * limitations under the License. |
|
15 | */ |
|
16 | ||
17 | package org.apache.commons.math.random; |
|
18 | ||
19 | import java.io.Serializable; |
|
20 | import java.io.BufferedReader; |
|
21 | import java.io.FileReader; |
|
22 | import java.io.File; |
|
23 | import java.io.IOException; |
|
24 | import java.io.InputStreamReader; |
|
25 | import java.net.URL; |
|
26 | import java.util.ArrayList; |
|
27 | import java.util.List; |
|
28 | ||
29 | import org.apache.commons.math.stat.descriptive.SummaryStatistics; |
|
30 | import org.apache.commons.math.stat.descriptive.StatisticalSummary; |
|
31 | ||
32 | /** |
|
33 | * Implements <code>EmpiricalDistribution</code> interface. This implementation |
|
34 | * uses what amounts to the |
|
35 | * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html"> |
|
36 | * Variable Kernel Method</a> with Gaussian smoothing:<p> |
|
37 | * <strong>Digesting the input file</strong> |
|
38 | * <ol><li>Pass the file once to compute min and max.</li> |
|
39 | * <li>Divide the range from min-max into <code>binCount</code> "bins."</li> |
|
40 | * <li>Pass the data file again, computing bin counts and univariate |
|
41 | * statistics (mean, std dev.) for each of the bins </li> |
|
42 | * <li>Divide the interval (0,1) into subintervals associated with the bins, |
|
43 | * with the length of a bin's subinterval proportional to its count.</li></ol> |
|
44 | * <strong>Generating random values from the distribution</strong><ol> |
|
45 | * <li>Generate a uniformly distributed value in (0,1) </li> |
|
46 | * <li>Select the subinterval to which the value belongs. |
|
47 | * <li>Generate a random Gaussian value with mean = mean of the associated |
|
48 | * bin and std dev = std dev of associated bin.</li></ol></p><p> |
|
49 | *<strong>USAGE NOTES:</strong><ul> |
|
50 | *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb |
|
51 | * is to set the bin count to approximately the length of the input file divided |
|
52 | * by 10. </li> |
|
53 | *<li>The input file <i>must</i> be a plain text file containing one valid numeric |
|
54 | * entry per line.</li> |
|
55 | * </ul></p> |
|
56 | * |
|
57 | * @version $Revision$ $Date: 2005-09-03 17:38:15 -0700 (Sat, 03 Sep 2005) $ |
|
58 | */ |
|
59 | 44008 | public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution { |
60 | ||
61 | /** Serializable version identifier */ |
|
62 | static final long serialVersionUID = -6773236347582113490L; |
|
63 | ||
64 | /** List of SummaryStatistics objects characterizing the bins */ |
|
65 | 48 | private ArrayList binStats = null; |
66 | ||
67 | /** Sample statistics */ |
|
68 | 48 | SummaryStatistics sampleStats = null; |
69 | ||
70 | /** number of bins */ |
|
71 | 48 | private int binCount = 1000; |
72 | ||
73 | /** is the distribution loaded? */ |
|
74 | 48 | private boolean loaded = false; |
75 | ||
76 | /** upper bounds of subintervals in (0,1) "belonging" to the bins */ |
|
77 | 48 | private double[] upperBounds = null; |
78 | ||
79 | /** RandomData instance to use in repeated calls to getNext() */ |
|
80 | 48 | private RandomData randomData = new RandomDataImpl(); |
81 | ||
82 | /** |
|
83 | * Creates a new EmpiricalDistribution with the default bin count. |
|
84 | */ |
|
85 | 6 | public EmpiricalDistributionImpl() { |
86 | 6 | binStats = new ArrayList(); |
87 | 6 | } |
88 | ||
89 | /** |
|
90 | * Creates a new EmpiricalDistribution with the specified bin count. |
|
91 | * |
|
92 | * @param binCount number of bins |
|
93 | */ |
|
94 | 42 | public EmpiricalDistributionImpl(int binCount) { |
95 | 42 | this.binCount = binCount; |
96 | 42 | binStats = new ArrayList(); |
97 | 42 | } |
98 | ||
99 | /** |
|
100 | * Computes the empirical distribution from the provided |
|
101 | * array of numbers. |
|
102 | * |
|
103 | * @param in the input data array |
|
104 | */ |
|
105 | public void load(double[] in) { |
|
106 | 12 | DataAdapter da = new ArrayDataAdapter(in); |
107 | try { |
|
108 | 12 | da.computeStats(); |
109 | 12 | fillBinStats(in); |
110 | 0 | } catch (Exception e) { |
111 | 0 | throw new RuntimeException(e.getMessage()); |
112 | 12 | } |
113 | 12 | loaded = true; |
114 | ||
115 | 12 | } |
116 | ||
117 | /** |
|
118 | * Computes the empirical distribution using data read from a URL. |
|
119 | * @param url url of the input file |
|
120 | * |
|
121 | * @throws IOException if an IO error occurs |
|
122 | */ |
|
123 | public void load(URL url) throws IOException { |
|
124 | 12 | BufferedReader in = |
125 | new BufferedReader(new InputStreamReader(url.openStream())); |
|
126 | try { |
|
127 | 12 | DataAdapter da = new StreamDataAdapter(in); |
128 | try { |
|
129 | 12 | da.computeStats(); |
130 | 0 | } catch (Exception e) { |
131 | 0 | throw new IOException(e.getMessage()); |
132 | 12 | } |
133 | 12 | in = new BufferedReader(new InputStreamReader(url.openStream())); |
134 | 12 | fillBinStats(in); |
135 | 12 | loaded = true; |
136 | 12 | } finally { |
137 | 0 | if (in != null) { |
138 | try { |
|
139 | 12 | in.close(); |
140 | 0 | } catch (Exception ex) { |
141 | // ignore |
|
142 | 24 | } |
143 | } |
|
144 | 12 | } |
145 | 12 | } |
146 | ||
147 | /** |
|
148 | * Computes the empirical distribution from the input file. |
|
149 | * |
|
150 | * @param file the input file |
|
151 | * @throws IOException if an IO error occurs |
|
152 | */ |
|
153 | public void load(File file) throws IOException { |
|
154 | 0 | BufferedReader in = new BufferedReader(new FileReader(file)); |
155 | try { |
|
156 | 0 | DataAdapter da = new StreamDataAdapter(in); |
157 | try { |
|
158 | 0 | da.computeStats(); |
159 | 0 | } catch (Exception e) { |
160 | 0 | throw new IOException(e.getMessage()); |
161 | 0 | } |
162 | 0 | in = new BufferedReader(new FileReader(file)); |
163 | 0 | fillBinStats(in); |
164 | 0 | loaded = true; |
165 | 0 | } finally { |
166 | 0 | if (in != null) { |
167 | try { |
|
168 | 0 | in.close(); |
169 | 0 | } catch (Exception ex) { |
170 | // ignore |
|
171 | 0 | } |
172 | } |
|
173 | 0 | } |
174 | 0 | } |
175 | ||
176 | /** |
|
177 | * Provides methods for computing <code>sampleStats</code> and |
|
178 | * <code>beanStats</code> abstracting the source of data. |
|
179 | */ |
|
180 | 96 | private abstract class DataAdapter{ |
181 | /** |
|
182 | * Compute bin stats. |
|
183 | * |
|
184 | * @param min minimum value |
|
185 | * @param delta grid size |
|
186 | * @throws Exception if an error occurs computing bin stats |
|
187 | */ |
|
188 | public abstract void computeBinStats(double min, double delta) |
|
189 | throws Exception; |
|
190 | /** |
|
191 | * Compute sample statistics. |
|
192 | * |
|
193 | * @throws Exception if an error occurs computing sample stats |
|
194 | */ |
|
195 | public abstract void computeStats() throws Exception; |
|
196 | } |
|
197 | /** |
|
198 | * Factory of <code>DataAdapter</code> objects. For every supported source |
|
199 | * of data (array of doubles, file, etc.) an instance of the proper object |
|
200 | * is returned. |
|
201 | */ |
|
202 | 48 | private class DataAdapterFactory{ |
203 | /** |
|
204 | * Creates a DataAdapter from a data object |
|
205 | * |
|
206 | * @param in object providing access to the data |
|
207 | * @return DataAdapter instance |
|
208 | */ |
|
209 | public DataAdapter getAdapter(Object in) { |
|
210 | 24 | if (in instanceof BufferedReader) { |
211 | 12 | BufferedReader inputStream = (BufferedReader) in; |
212 | 12 | return new StreamDataAdapter(inputStream); |
213 | 12 | } else if (in instanceof double[]) { |
214 | 12 | double[] inputArray = (double[]) in; |
215 | 12 | return new ArrayDataAdapter(inputArray); |
216 | } else { |
|
217 | 0 | throw new IllegalArgumentException( |
218 | "Input data comes from the" + " unsupported source"); |
|
219 | } |
|
220 | } |
|
221 | } |
|
222 | /** |
|
223 | * <code>DataAdapter</code> for data provided through some input stream |
|
224 | */ |
|
225 | private class StreamDataAdapter extends DataAdapter{ |
|
226 | ||
227 | /** Input stream providng access to the data */ |
|
228 | BufferedReader inputStream; |
|
229 | ||
230 | /** |
|
231 | * Create a StreamDataAdapter from a BufferedReader |
|
232 | * |
|
233 | * @param in BufferedReader input stream |
|
234 | */ |
|
235 | 24 | public StreamDataAdapter(BufferedReader in){ |
236 | 24 | super(); |
237 | 24 | inputStream = in; |
238 | 24 | } |
239 | /** |
|
240 | * Computes binStats |
|
241 | * |
|
242 | * @param min minimum value |
|
243 | * @param delta grid size |
|
244 | * @throws IOException if an IO error occurs |
|
245 | */ |
|
246 | public void computeBinStats(double min, double delta) |
|
247 | throws IOException { |
|
248 | 12 | String str = null; |
249 | 12 | double val = 0.0d; |
250 | 12012 | while ((str = inputStream.readLine()) != null) { |
251 | 12000 | val = Double.parseDouble(str); |
252 | 12000 | SummaryStatistics stats = |
253 | (SummaryStatistics) binStats.get(findBin(min, val, delta)); |
|
254 | 12000 | stats.addValue(val); |
255 | } |
|
256 | ||
257 | 12 | inputStream.close(); |
258 | 12 | inputStream = null; |
259 | 12 | } |
260 | /** |
|
261 | * Computes sampleStats |
|
262 | * |
|
263 | * @throws IOException if an IOError occurs |
|
264 | */ |
|
265 | public void computeStats() throws IOException { |
|
266 | 12 | String str = null; |
267 | 12 | double val = 0.0; |
268 | 12 | sampleStats = SummaryStatistics.newInstance(); |
269 | 12012 | while ((str = inputStream.readLine()) != null) { |
270 | 12000 | val = new Double(str).doubleValue(); |
271 | 12000 | sampleStats.addValue(val); |
272 | } |
|
273 | 12 | inputStream.close(); |
274 | 12 | inputStream = null; |
275 | 12 | } |
276 | } |
|
277 | ||
278 | /** |
|
279 | * <code>DataAdapter</code> for data provided as array of doubles. |
|
280 | */ |
|
281 | private class ArrayDataAdapter extends DataAdapter{ |
|
282 | ||
283 | /** Array of input data values */ |
|
284 | private double[] inputArray; |
|
285 | ||
286 | /** |
|
287 | * Construct an ArrayDataAdapter from a double[] array |
|
288 | * |
|
289 | * @param in double[] array holding the data |
|
290 | */ |
|
291 | 24 | public ArrayDataAdapter(double[] in){ |
292 | 24 | super(); |
293 | 24 | inputArray = in; |
294 | 24 | } |
295 | /** |
|
296 | * Computes sampleStats |
|
297 | * |
|
298 | * @throws IOException if an IO error occurs |
|
299 | */ |
|
300 | public void computeStats() throws IOException { |
|
301 | 12 | sampleStats = SummaryStatistics.newInstance(); |
302 | 10016 | for (int i = 0; i < inputArray.length; i++) { |
303 | 10004 | sampleStats.addValue(inputArray[i]); |
304 | } |
|
305 | 12 | } |
306 | /** |
|
307 | * Computes binStats |
|
308 | * |
|
309 | * @param min minimum value |
|
310 | * @param delta grid size |
|
311 | * @throws IOException if an IO error occurs |
|
312 | */ |
|
313 | public void computeBinStats(double min, double delta) |
|
314 | throws IOException { |
|
315 | 10016 | for (int i = 0; i < inputArray.length; i++) { |
316 | 10004 | SummaryStatistics stats = |
317 | (SummaryStatistics) binStats.get( |
|
318 | findBin(min, inputArray[i], delta)); |
|
319 | 10004 | stats.addValue(inputArray[i]); |
320 | } |
|
321 | 12 | } |
322 | } |
|
323 | ||
324 | /** |
|
325 | * Fills binStats array (second pass through data file). |
|
326 | * |
|
327 | * @param in object providing access to the data |
|
328 | * @throws IOException if an IO error occurs |
|
329 | */ |
|
330 | private void fillBinStats(Object in) throws IOException { |
|
331 | // Load array of bin upper bounds -- evenly spaced from min - max |
|
332 | 24 | double min = sampleStats.getMin(); |
333 | 24 | double max = sampleStats.getMax(); |
334 | 24 | double delta = (max - min)/(new Double(binCount)).doubleValue(); |
335 | 24 | double[] binUpperBounds = new double[binCount]; |
336 | 24 | binUpperBounds[0] = min + delta; |
337 | 9988 | for (int i = 1; i< binCount - 1; i++) { |
338 | 9964 | binUpperBounds[i] = binUpperBounds[i-1] + delta; |
339 | } |
|
340 | 24 | binUpperBounds[binCount -1] = max; |
341 | ||
342 | // Initialize binStats ArrayList |
|
343 | 24 | if (!binStats.isEmpty()) { |
344 | 0 | binStats.clear(); |
345 | } |
|
346 | 10032 | for (int i = 0; i < binCount; i++) { |
347 | 10008 | SummaryStatistics stats = SummaryStatistics.newInstance(); |
348 | 10008 | binStats.add(i,stats); |
349 | } |
|
350 | ||
351 | // Filling data in binStats Array |
|
352 | 24 | DataAdapterFactory aFactory = new DataAdapterFactory(); |
353 | 24 | DataAdapter da = aFactory.getAdapter(in); |
354 | try { |
|
355 | 24 | da.computeBinStats(min, delta); |
356 | 0 | } catch (Exception e) { |
357 | 0 | if(e instanceof RuntimeException){ |
358 | 0 | throw new RuntimeException(e.getMessage()); |
359 | }else{ |
|
360 | 0 | throw new IOException(e.getMessage()); |
361 | } |
|
362 | 24 | } |
363 | ||
364 | // Assign upperBounds based on bin counts |
|
365 | 24 | upperBounds = new double[binCount]; |
366 | 24 | upperBounds[0] = |
367 | ((double)((SummaryStatistics)binStats.get(0)).getN())/ |
|
368 | (double)sampleStats.getN(); |
|
369 | 9988 | for (int i = 1; i < binCount-1; i++) { |
370 | 9964 | upperBounds[i] = upperBounds[i-1] + |
371 | ((double)((SummaryStatistics)binStats.get(i)).getN())/ |
|
372 | (double)sampleStats.getN(); |
|
373 | } |
|
374 | 24 | upperBounds[binCount-1] = 1.0d; |
375 | 24 | } |
376 | ||
377 | /** |
|
378 | * Returns the index of the bin to which the given value belongs |
|
379 | * |
|
380 | * @param min the minimum value |
|
381 | * @param value the value whose bin we are trying to find |
|
382 | * @param delta the grid size |
|
383 | * @return |
|
384 | */ |
|
385 | private int findBin(double min, double value, double delta) { |
|
386 | 22004 | return Math.min( |
387 | Math.max((int) Math.ceil((value- min) / delta) - 1, 0), |
|
388 | binCount - 1); |
|
389 | } |
|
390 | ||
391 | /** |
|
392 | * Generates a random value from this distribution. |
|
393 | * |
|
394 | * @return the random value. |
|
395 | * @throws IllegalStateException if the distribution has not been loaded |
|
396 | */ |
|
397 | public double getNextValue() throws IllegalStateException { |
|
398 | ||
399 | 15986 | if (!loaded) { |
400 | 2 | throw new IllegalStateException("distribution not loaded"); |
401 | } |
|
402 | ||
403 | // Start with a uniformly distributed random number in (0,1) |
|
404 | 15984 | double x = Math.random(); |
405 | ||
406 | // Use this to select the bin and generate a Gaussian within the bin |
|
407 | 3263396 | for (int i = 0; i < binCount; i++) { |
408 | 3263396 | if (x <= upperBounds[i]) { |
409 | 15984 | SummaryStatistics stats = (SummaryStatistics)binStats.get(i); |
410 | 15984 | if (stats.getN() > 0) { |
411 | 15984 | if (stats.getStandardDeviation() > 0) { // more than one obs |
412 | 14478 | return randomData.nextGaussian |
413 | (stats.getMean(),stats.getStandardDeviation()); |
|
414 | } else { |
|
415 | 1506 | return stats.getMean(); // only one obs in bin |
416 | } |
|
417 | } |
|
418 | } |
|
419 | } |
|
420 | 0 | throw new RuntimeException("No bin selected"); |
421 | } |
|
422 | ||
423 | /** |
|
424 | * Returns a {@link StatisticalSummary} describing this distribution. |
|
425 | * <strong>Preconditions:</strong><ul> |
|
426 | * <li>the distribution must be loaded before invoking this method</li></ul> |
|
427 | * |
|
428 | * @return the sample statistics |
|
429 | * @throws IllegalStateException if the distribution has not been loaded |
|
430 | */ |
|
431 | public StatisticalSummary getSampleStats() { |
|
432 | 24 | return sampleStats; |
433 | } |
|
434 | ||
435 | /** |
|
436 | * Returns the number of bins. |
|
437 | * |
|
438 | * @return the number of bins. |
|
439 | */ |
|
440 | public int getBinCount() { |
|
441 | 8 | return binCount; |
442 | } |
|
443 | ||
444 | /** |
|
445 | * Returns an ArrayList of {@link SummaryStatistics} instances containing |
|
446 | * statistics describing the values in each of the bins. The ArrayList is |
|
447 | * indexed on the bin number. |
|
448 | * |
|
449 | * @return List of bin statistics. |
|
450 | */ |
|
451 | public List getBinStats() { |
|
452 | 4000 | return binStats; |
453 | } |
|
454 | ||
455 | /** |
|
456 | * Returns the array of upper bounds for the bins. Bins are: <br/> |
|
457 | * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],..., |
|
458 | * (upperBounds[binCount-1],max] |
|
459 | * |
|
460 | * @return array of bin upper bounds |
|
461 | */ |
|
462 | public double[] getUpperBounds() { |
|
463 | 602 | return upperBounds; |
464 | } |
|
465 | ||
466 | /** |
|
467 | * Property indicating whether or not the distribution has been loaded. |
|
468 | * |
|
469 | * @return true if the distribution has been loaded |
|
470 | */ |
|
471 | public boolean isLoaded() { |
|
472 | 12 | return loaded; |
473 | } |
|
474 | } |