Coverage Report - org.apache.commons.math.random.EmpiricalDistributionImpl

Classes in this File Line Coverage Branch Coverage Complexity
EmpiricalDistributionImpl
79% 
80% 
2.636

 1  
 /*
 2  
  * Copyright 2003-2004 The Apache Software Foundation.
 3  
  *
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  *
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  *
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.math.random;
 18  
 
 19  
 import java.io.Serializable;
 20  
 import java.io.BufferedReader;
 21  
 import java.io.FileReader;
 22  
 import java.io.File;
 23  
 import java.io.IOException;
 24  
 import java.io.InputStreamReader;
 25  
 import java.net.URL;
 26  
 import java.util.ArrayList;
 27  
 import java.util.List;
 28  
 
 29  
 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
 30  
 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
 31  
 
 32  
 /**
 33  
  * Implements <code>EmpiricalDistribution</code> interface.  This implementation
 34  
  * uses what amounts to the
 35  
  * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
 36  
  * Variable Kernel Method</a> with Gaussian smoothing:<p>
 37  
  * <strong>Digesting the input file</strong>
 38  
  * <ol><li>Pass the file once to compute min and max.</li>
 39  
  * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
 40  
  * <li>Pass the data file again, computing bin counts and univariate
 41  
  *     statistics (mean, std dev.) for each of the bins </li>
 42  
  * <li>Divide the interval (0,1) into subintervals associated with the bins,
 43  
  *     with the length of a bin's subinterval proportional to its count.</li></ol>
 44  
  * <strong>Generating random values from the distribution</strong><ol>
 45  
  * <li>Generate a uniformly distributed value in (0,1) </li>
 46  
  * <li>Select the subinterval to which the value belongs.
 47  
  * <li>Generate a random Gaussian value with mean = mean of the associated
 48  
  *     bin and std dev = std dev of associated bin.</li></ol></p><p>
 49  
  *<strong>USAGE NOTES:</strong><ul>
 50  
  *<li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
 51  
  *    is to set the bin count to approximately the length of the input file divided
 52  
  *    by 10. </li>
 53  
  *<li>The input file <i>must</i> be a plain text file containing one valid numeric
 54  
  *    entry per line.</li>
 55  
  * </ul></p>
 56  
  *
 57  
  * @version $Revision$ $Date: 2005-09-03 17:38:15 -0700 (Sat, 03 Sep 2005) $
 58  
  */
 59  44008
 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
 60  
 
 61  
     /** Serializable version identifier */
 62  
     static final long serialVersionUID = -6773236347582113490L;
 63  
 
 64  
     /** List of SummaryStatistics objects characterizing the bins */
 65  48
     private ArrayList binStats = null;
 66  
 
 67  
     /** Sample statistics */
 68  48
     SummaryStatistics sampleStats = null;
 69  
 
 70  
     /** number of bins */
 71  48
     private int binCount = 1000;
 72  
 
 73  
     /** is the distribution loaded? */
 74  48
     private boolean loaded = false;
 75  
 
 76  
     /** upper bounds of subintervals in (0,1) "belonging" to the bins */
 77  48
     private double[] upperBounds = null;
 78  
 
 79  
     /** RandomData instance to use in repeated calls to getNext() */
 80  48
     private RandomData randomData = new RandomDataImpl();
 81  
 
 82  
     /**
 83  
      * Creates a new EmpiricalDistribution with the default bin count.
 84  
      */
 85  6
     public EmpiricalDistributionImpl() {
 86  6
         binStats = new ArrayList();
 87  6
     }
 88  
 
 89  
     /**
 90  
      * Creates a new EmpiricalDistribution  with the specified bin count.
 91  
      * 
 92  
      * @param binCount number of bins
 93  
      */
 94  42
     public EmpiricalDistributionImpl(int binCount) {
 95  42
         this.binCount = binCount;
 96  42
         binStats = new ArrayList();
 97  42
     }
 98  
 
 99  
      /**
 100  
      * Computes the empirical distribution from the provided
 101  
      * array of numbers.
 102  
      * 
 103  
      * @param in the input data array
 104  
      */
 105  
     public void load(double[] in) {
 106  12
         DataAdapter da = new ArrayDataAdapter(in);
 107  
         try {
 108  12
             da.computeStats();
 109  12
             fillBinStats(in);
 110  0
         } catch (Exception e) {
 111  0
             throw new RuntimeException(e.getMessage());
 112  12
         }
 113  12
         loaded = true;
 114  
 
 115  12
     }
 116  
 
 117  
     /**
 118  
      * Computes the empirical distribution using data read from a URL.
 119  
      * @param url  url of the input file
 120  
      * 
 121  
      * @throws IOException if an IO error occurs
 122  
      */
 123  
     public void load(URL url) throws IOException {
 124  12
         BufferedReader in =
 125  
             new BufferedReader(new InputStreamReader(url.openStream()));
 126  
         try {
 127  12
             DataAdapter da = new StreamDataAdapter(in);
 128  
             try {
 129  12
                 da.computeStats();
 130  0
             } catch (Exception e) {
 131  0
                 throw new IOException(e.getMessage());
 132  12
             }
 133  12
             in = new BufferedReader(new InputStreamReader(url.openStream()));
 134  12
             fillBinStats(in);
 135  12
             loaded = true;
 136  12
         } finally {
 137  0
            if (in != null) {
 138  
                try {
 139  12
                    in.close();
 140  0
                } catch (Exception ex) {
 141  
                    // ignore
 142  24
                }
 143  
            }
 144  12
         }
 145  12
     }
 146  
 
 147  
     /**
 148  
      * Computes the empirical distribution from the input file.
 149  
      * 
 150  
      * @param file the input file
 151  
      * @throws IOException if an IO error occurs
 152  
      */
 153  
     public void load(File file) throws IOException {
 154  0
         BufferedReader in = new BufferedReader(new FileReader(file));
 155  
         try {
 156  0
             DataAdapter da = new StreamDataAdapter(in);
 157  
             try {
 158  0
                 da.computeStats();
 159  0
             } catch (Exception e) {
 160  0
                 throw new IOException(e.getMessage());
 161  0
             }
 162  0
             in = new BufferedReader(new FileReader(file));
 163  0
             fillBinStats(in);
 164  0
             loaded = true;
 165  0
         } finally {
 166  0
             if (in != null) {
 167  
                 try {
 168  0
                     in.close();
 169  0
                 } catch (Exception ex) {
 170  
                     // ignore
 171  0
                 }
 172  
             }
 173  0
         }
 174  0
     }
 175  
 
 176  
     /**
 177  
      * Provides methods for computing <code>sampleStats</code> and
 178  
      * <code>beanStats</code> abstracting the source of data.
 179  
      */
 180  96
     private abstract class DataAdapter{
 181  
         /** 
 182  
          * Compute bin stats.
 183  
          * 
 184  
          * @param min minimum value
 185  
          * @param delta  grid size
 186  
          * @throws Exception  if an error occurs computing bin stats
 187  
          */
 188  
         public abstract void computeBinStats(double min, double delta)
 189  
                 throws Exception;
 190  
         /**
 191  
          * Compute sample statistics.
 192  
          * 
 193  
          * @throws Exception if an error occurs computing sample stats
 194  
          */
 195  
         public abstract void computeStats() throws Exception;
 196  
     }
 197  
     /**
 198  
      * Factory of <code>DataAdapter</code> objects. For every supported source
 199  
      * of data (array of doubles, file, etc.) an instance of the proper object
 200  
      * is returned.
 201  
      */
 202  48
     private class DataAdapterFactory{
 203  
         /**
 204  
          * Creates a DataAdapter from a data object
 205  
          * 
 206  
          * @param in object providing access to the data
 207  
          * @return DataAdapter instance
 208  
          */
 209  
         public DataAdapter getAdapter(Object in) {
 210  24
             if (in instanceof BufferedReader) {
 211  12
                 BufferedReader inputStream = (BufferedReader) in;
 212  12
                 return new StreamDataAdapter(inputStream);
 213  12
             } else if (in instanceof double[]) {
 214  12
                 double[] inputArray = (double[]) in;
 215  12
                 return new ArrayDataAdapter(inputArray);
 216  
             } else {
 217  0
                 throw new IllegalArgumentException(
 218  
                     "Input data comes from the" + " unsupported source");
 219  
             }
 220  
         }
 221  
     }
 222  
     /**
 223  
      * <code>DataAdapter</code> for data provided through some input stream
 224  
      */
 225  
     private class StreamDataAdapter extends DataAdapter{
 226  
         
 227  
         /** Input stream providng access to the data */
 228  
         BufferedReader inputStream;
 229  
         
 230  
         /**
 231  
          * Create a StreamDataAdapter from a BufferedReader
 232  
          * 
 233  
          * @param in BufferedReader input stream
 234  
          */
 235  24
         public StreamDataAdapter(BufferedReader in){
 236  24
             super();
 237  24
             inputStream = in;
 238  24
         }
 239  
         /**
 240  
          * Computes binStats
 241  
          * 
 242  
          * @param min  minimum value
 243  
          * @param delta  grid size
 244  
          * @throws IOException if an IO error occurs
 245  
          */
 246  
         public void computeBinStats(double min, double delta)
 247  
                 throws IOException {
 248  12
             String str = null;
 249  12
             double val = 0.0d;
 250  12012
             while ((str = inputStream.readLine()) != null) {
 251  12000
                 val = Double.parseDouble(str);
 252  12000
                 SummaryStatistics stats =
 253  
                     (SummaryStatistics) binStats.get(findBin(min, val, delta));
 254  12000
                 stats.addValue(val);
 255  
             }
 256  
 
 257  12
             inputStream.close();
 258  12
             inputStream = null;
 259  12
         }
 260  
         /**
 261  
          * Computes sampleStats
 262  
          * 
 263  
          * @throws IOException if an IOError occurs
 264  
          */
 265  
         public void computeStats() throws IOException {
 266  12
             String str = null;
 267  12
             double val = 0.0;
 268  12
             sampleStats = SummaryStatistics.newInstance();
 269  12012
             while ((str = inputStream.readLine()) != null) {
 270  12000
                 val = new Double(str).doubleValue();
 271  12000
                 sampleStats.addValue(val);
 272  
             }
 273  12
             inputStream.close();
 274  12
             inputStream = null;
 275  12
         }
 276  
     }
 277  
 
 278  
     /**
 279  
      * <code>DataAdapter</code> for data provided as array of doubles.
 280  
      */
 281  
     private class ArrayDataAdapter extends DataAdapter{
 282  
         
 283  
         /** Array of input  data values */
 284  
         private double[] inputArray;
 285  
         
 286  
         /**
 287  
          * Construct an ArrayDataAdapter from a double[] array
 288  
          * 
 289  
          * @param in double[] array holding the data
 290  
          */
 291  24
         public ArrayDataAdapter(double[] in){
 292  24
             super();
 293  24
             inputArray = in;
 294  24
         }
 295  
         /**
 296  
          * Computes sampleStats
 297  
          * 
 298  
          * @throws IOException if an IO error occurs
 299  
          */
 300  
         public void computeStats() throws IOException {
 301  12
             sampleStats = SummaryStatistics.newInstance();
 302  10016
             for (int i = 0; i < inputArray.length; i++) {
 303  10004
                 sampleStats.addValue(inputArray[i]);
 304  
             }
 305  12
         }
 306  
         /**
 307  
          * Computes binStats
 308  
          * 
 309  
          * @param min  minimum value
 310  
          * @param delta  grid size
 311  
          * @throws IOException  if an IO error occurs
 312  
          */
 313  
         public void computeBinStats(double min, double delta)
 314  
             throws IOException {
 315  10016
             for (int i = 0; i < inputArray.length; i++) {
 316  10004
                 SummaryStatistics stats =
 317  
                     (SummaryStatistics) binStats.get(
 318  
                             findBin(min, inputArray[i], delta));
 319  10004
                 stats.addValue(inputArray[i]);
 320  
             }
 321  12
         }
 322  
     }
 323  
 
 324  
     /**
 325  
      * Fills binStats array (second pass through data file).
 326  
      * 
 327  
      * @param in object providing access to the data
 328  
      * @throws IOException  if an IO error occurs
 329  
      */
 330  
     private void fillBinStats(Object in) throws IOException {
 331  
         // Load array of bin upper bounds -- evenly spaced from min - max
 332  24
         double min = sampleStats.getMin();
 333  24
         double max = sampleStats.getMax();
 334  24
         double delta = (max - min)/(new Double(binCount)).doubleValue();
 335  24
         double[] binUpperBounds = new double[binCount];
 336  24
         binUpperBounds[0] = min + delta;
 337  9988
         for (int i = 1; i< binCount - 1; i++) {
 338  9964
             binUpperBounds[i] = binUpperBounds[i-1] + delta;
 339  
         }
 340  24
         binUpperBounds[binCount -1] = max;
 341  
 
 342  
         // Initialize binStats ArrayList
 343  24
         if (!binStats.isEmpty()) {
 344  0
             binStats.clear();
 345  
         }
 346  10032
         for (int i = 0; i < binCount; i++) {
 347  10008
             SummaryStatistics stats = SummaryStatistics.newInstance();
 348  10008
             binStats.add(i,stats);
 349  
         }
 350  
 
 351  
         // Filling data in binStats Array
 352  24
         DataAdapterFactory aFactory = new DataAdapterFactory();
 353  24
         DataAdapter da = aFactory.getAdapter(in);
 354  
         try {
 355  24
             da.computeBinStats(min, delta);
 356  0
         } catch (Exception e) {
 357  0
             if(e instanceof RuntimeException){
 358  0
                 throw new RuntimeException(e.getMessage());
 359  
             }else{
 360  0
                 throw new IOException(e.getMessage());
 361  
             }
 362  24
         }
 363  
 
 364  
         // Assign upperBounds based on bin counts
 365  24
         upperBounds = new double[binCount];
 366  24
         upperBounds[0] =
 367  
         ((double)((SummaryStatistics)binStats.get(0)).getN())/
 368  
         (double)sampleStats.getN();
 369  9988
         for (int i = 1; i < binCount-1; i++) {
 370  9964
             upperBounds[i] = upperBounds[i-1] +
 371  
             ((double)((SummaryStatistics)binStats.get(i)).getN())/
 372  
             (double)sampleStats.getN();
 373  
         }
 374  24
         upperBounds[binCount-1] = 1.0d;
 375  24
     }
 376  
     
 377  
     /**
 378  
      * Returns the index of the bin to which the given value belongs
 379  
      * 
 380  
      * @param min  the minimum value
 381  
      * @param value  the value whose bin we are trying to find
 382  
      * @param delta  the grid size
 383  
      * @return
 384  
      */
 385  
     private int findBin(double min, double value, double delta) {
 386  22004
         return Math.min(
 387  
                 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 
 388  
                 binCount - 1);
 389  
         }
 390  
 
 391  
     /**
 392  
      * Generates a random value from this distribution.
 393  
      * 
 394  
      * @return the random value.
 395  
      * @throws IllegalStateException if the distribution has not been loaded
 396  
      */
 397  
     public double getNextValue() throws IllegalStateException {
 398  
 
 399  15986
         if (!loaded) {
 400  2
             throw new IllegalStateException("distribution not loaded");
 401  
         }
 402  
 
 403  
         // Start with a uniformly distributed random number in (0,1)
 404  15984
         double x = Math.random();
 405  
 
 406  
         // Use this to select the bin and generate a Gaussian within the bin
 407  3263396
         for (int i = 0; i < binCount; i++) {
 408  3263396
            if (x <= upperBounds[i]) {
 409  15984
                SummaryStatistics stats = (SummaryStatistics)binStats.get(i);
 410  15984
                if (stats.getN() > 0) {
 411  15984
                    if (stats.getStandardDeviation() > 0) {  // more than one obs
 412  14478
                         return randomData.nextGaussian
 413  
                             (stats.getMean(),stats.getStandardDeviation());
 414  
                    } else {
 415  1506
                        return stats.getMean(); // only one obs in bin
 416  
                    }
 417  
                }
 418  
            }
 419  
         }
 420  0
         throw new RuntimeException("No bin selected");
 421  
     }
 422  
 
 423  
     /**
 424  
      * Returns a {@link StatisticalSummary} describing this distribution.
 425  
      * <strong>Preconditions:</strong><ul>
 426  
      * <li>the distribution must be loaded before invoking this method</li></ul>
 427  
      * 
 428  
      * @return the sample statistics
 429  
      * @throws IllegalStateException if the distribution has not been loaded
 430  
      */
 431  
     public StatisticalSummary getSampleStats() {
 432  24
         return sampleStats;
 433  
     }
 434  
 
 435  
     /**
 436  
      * Returns the number of bins.
 437  
      * 
 438  
      * @return the number of bins.
 439  
      */
 440  
     public int getBinCount() {
 441  8
         return binCount;
 442  
     }
 443  
 
 444  
     /**
 445  
      * Returns an ArrayList of {@link SummaryStatistics} instances containing
 446  
      * statistics describing the values in each of the bins.  The ArrayList is
 447  
      * indexed on the bin number.
 448  
      * 
 449  
      * @return List of bin statistics.
 450  
      */
 451  
     public List getBinStats() {
 452  4000
         return binStats;
 453  
     }
 454  
 
 455  
     /**
 456  
      * Returns the array of upper bounds for the bins.  Bins are: <br/>
 457  
      * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
 458  
      *  (upperBounds[binCount-1],max]
 459  
      * 
 460  
      * @return array of bin upper bounds
 461  
      */
 462  
     public double[] getUpperBounds() {
 463  602
         return upperBounds;
 464  
     }
 465  
 
 466  
     /**
 467  
      * Property indicating whether or not the distribution has been loaded.
 468  
      * 
 469  
      * @return true if the distribution has been loaded
 470  
      */
 471  
     public boolean isLoaded() {
 472  12
         return loaded;
 473  
     }
 474  
 }