Statistics.h
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief Calculate statistics given a range of values.
5  *
6  *
7  *
8  * \author O.Krause
9  * \date 2015
10  *
11  *
12  * \par Copyright 1995-2017 Shark Development Team
13  *
14  * <BR><HR>
15  * This file is part of Shark.
16  * <http://shark-ml.org/>
17  *
18  * Shark is free software: you can redistribute it and/or modify
19  * it under the terms of the GNU Lesser General Public License as published
20  * by the Free Software Foundation, either version 3 of the License, or
21  * (at your option) any later version.
22  *
23  * Shark is distributed in the hope that it will be useful,
24  * but WITHOUT ANY WARRANTY; without even the implied warranty of
25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26  * GNU Lesser General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public License
29  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
30  *
31  */
32 #ifndef SHARK_STATISTICS_H
33 #define SHARK_STATISTICS_H
34 
35 
36 //for vector algebra
37 #include <shark/LinAlg/Base.h>
38 
39 //handling of missing values
40 #include <limits>
41 #include <boost/math/special_functions/fpclassify.hpp>
42 
43 //for quantiles
44 #include <boost/range/algorithm/nth_element.hpp>
45 
46 
47 //for the result table
48 #include <string>
49 #include <map>
50 #include <iterator>
51 #include <boost/serialization/string.hpp>
52 #include <boost/serialization/map.hpp>
53 #include <boost/serialization/vector.hpp>
54 
55 namespace shark {
56 namespace statistics{
57 
58 inline double missingValue(){
59  return std::numeric_limits<double>::quiet_NaN();//missing values are a non-signaling NaN
60 }
61 
62 inline bool isMissing(double value){
63  return boost::math::isnan(value);//there is no portable way to distinguish the different types of NaN
64 }
65 
66 ///\brief Base class for all Statistic Objects to be used with Statistics
68 public:
69  virtual std::string name() const=0;
71  virtual RealVector statistics(std::vector<RealVector> const& points)const=0;
72 };
73 
74 ///\brief for a vector of points computes for every dimension the fraction of missing values
76 public:
77  std::string name() const{
78  return "Missing";
79  }
80  RealVector statistics(std::vector<RealVector> const& points)const{
81  std::size_t N = points.size();
82  RealVector missing(points[0].size(),0.0);
83  for(std::size_t i = 0; i != N;++i){
84  for(std::size_t j = 0; j != missing.size(); ++j){
85  if(!isMissing(points[i](j)))continue;
86  missing(j) += 1.0;
87  }
88  }
89  missing /= N;
90  return missing;
91  }
92 };
93 
94 ///\brief For a vector of points computes for every dimension the mean
96 public:
97  std::string name() const{
98  return "Mean";
99  }
100  RealVector statistics(std::vector<RealVector> const& points)const{
101  std::size_t N = points.size();
102  RealVector sum(points[0].size(),0.0);
103  UIntVector numSamples(points[0].size(),0);
104  for(std::size_t i = 0; i != N;++i){
105  for(std::size_t j = 0; j != sum.size(); ++j){
106  if(isMissing(points[i](j)))continue;
107  sum(j) += points[i](j);
108  ++numSamples(j);
109  }
110  }
111  //calculate mean. if the number of non-missing points was 0, return missingValue() for that dimension
112  return safe_div(sum,numSamples,missingValue());
113  }
114 };
115 
116 ///\brief For a vector of points computes for every dimension the variance
118 public:
119  std::string name() const{
120  return "Variance";
121  }
122  RealVector statistics(std::vector<RealVector> const& points)const{
123  std::size_t N = points.size();
124  Mean m;
125  RealVector mean = m.statistics(points);
126  RealVector variance(mean.size(),0.0);
127  UIntVector numSamples(points[0].size(),0);
128  for(std::size_t i = 0; i != N;++i){
129  for(std::size_t j = 0; j != mean.size(); ++j){
130  if(isMissing(points[i](j)))continue;
131  variance(j) += sqr(points[i](j)-mean(j));
132  ++numSamples(j);
133  }
134  }
135  //calculate biased variance. if the number of non-missing points was 0, return missingValue() for that dimension
136  return safe_div(variance,numSamples,missingValue());
137  }
138 };
139 
140 //Quantiles, Median, Lower-Upper
141 ///\brief For a vector of points computes for every dimension the p-quantile
143 public:
144  std::string name() const{
145  return boost::lexical_cast<std::string>(m_quantile)+"-Quantile";
146  }
147  Quantile(double quantile):m_quantile(quantile){}
148  RealVector statistics(std::vector<RealVector> const& points)const{
149  std::size_t N = points.size();
150  RealVector quantiles(points[0].size(),missingValue());
151  for(std::size_t j = 0; j != quantiles.size(); ++j){
152  //get all non-missing values of the j-th dimension
153  std::vector<double> values;
154  for(std::size_t i = 0; i != N;++i){
155  if(isMissing(points[i](j)))continue;
156  values.push_back(points[i](j));
157  }
158  if(values.size() == 0) continue;//no values-> missing value
159 
160  //compute quantile of j-th dimension
161  std::size_t element = std::size_t(values.size()*m_quantile);
162  std::vector<double>::iterator pos= values.begin()+element;
163  boost::nth_element(values,pos);
164  quantiles(j) = *pos;
165  }
166  return quantiles;
167  }
168 private:
169  double m_quantile;
170 };
171 
172 ///\brief For a vector of points computes for every dimension the median
173 class Median:public Quantile{
174 public:
175  std::string name() const{
176  return "Median";
177  }
178  Median():Quantile(0.5){}
179 };
180 
181 ///\brief For a vector of points computes for every dimension the 25%-quantile
182 class LowerQuantile:public Quantile{
183 public:
185 };
186 ///\brief For a vector of points computes for every dimension the 75%-quantile
187 class UpperQuantile:public Quantile{
188 public:
190 };
191 
192 
193 ///\brief Stores results of a running experiment
194 ///
195 /// This is a simple three dimensional table with the dimensions. Experiments
196 /// are thought of having a varied parameter (for example the algorithm names when
197 /// several algorithms are compared) and for each parameter a set of vector valued points
198 /// is stored - one vector for each trial of the experiment for a given parameter.
199 /// It is posible to give every parameter and the whole table a name which adds meta
200 /// information, for example to generate outputs.
201 template<class Parameter>
203 public:
204  typedef typename std::map<Parameter, std::vector<RealVector> >::const_iterator const_iterator;
205 
206  ResultTable(std::size_t numDimensions, std::string const& parameterName="unnamed")
207  :m_dimensionNames(numDimensions,"unnamed"),m_parameterName(parameterName){}
208 
209  std::string const& parameterName()const{
210  return m_parameterName;
211  }
212 
213  void setDimensionName(std::size_t i, std::string const& name){
214  m_dimensionNames[i]=name;
215  }
216 
217  std::string const& dimensionName(std::size_t i)const{
218  return m_dimensionNames[i];
219  }
220 
221  std::size_t numDimensions()const{
222  return m_dimensionNames.size();
223  }
224 
225  void update(Parameter const& parameter, RealVector const& point){
226  SIZE_CHECK(point.size() == numDimensions());
227  m_results[parameter].push_back(point);
228  }
229 
230  void update(Parameter const& parameter, double value){
231  RealVector point(1,value);
232  update(parameter, point);
233  }
234 
235  void update(Parameter const& parameter, double value1, double value2){
236  RealVector point(2);
237  point(0)=value1;
238  point(1)=value2;
239  update(parameter, point);
240  }
241 
242  void update(Parameter const& parameter, double value1, double value2,double value3){
243  RealVector point(3);
244  point(0)=value1;
245  point(1)=value2;
246  point(2)=value3;
247  update(parameter, point);
248  }
249 
250  std::vector<RealVector>const& operator[](Parameter const& param)const{
251  return m_results.find(param)->second;
252  }
253 
254  const_iterator begin()const{
255  return m_results.begin();
256  }
257  const_iterator end()const{
258  return m_results.end();
259  }
260 
261  std::size_t numParams()const{
262  return m_results.size();
263  }
264 
265  Parameter const& parameterValue(std::size_t i)const{
266  const_iterator pos = begin();
267  std::advance(pos,i);
268  return pos->first;
269  }
270 
271 
272  template<class Archive>
273  void serialize(Archive &ar, const unsigned int file_version) {
274  ar & m_dimensionNames;
275  ar & m_parameterName;
276  ar & m_results;
277  (void) file_version;//prevent warning
278  }
279 
280 private:
281  std::vector<std::string> m_dimensionNames;
282  std::string m_parameterName;
283  std::map<Parameter, std::vector<RealVector> > m_results;
284 };
285 
286 ///\brief Generates Statistics over the results of an experiment
287 ///
288 /// Given the results of an experiment stored in a ResultsTable, computes
289 /// several tatistics for each variable.
290 template<class Parameter>
291 struct Statistics {
292 public:
293  typedef typename std::map<Parameter, std::map<std::string,RealVector> >::const_iterator const_iterator;
294  Statistics(ResultTable<Parameter> const* table):m_resultsTable(table){}
295 
296  void addStatistic(std::string const& statisticName, BaseStatisticsObject const& object){
297  typedef typename ResultTable<Parameter>::const_iterator iterator;
298  iterator end = m_resultsTable->end();
299  for(iterator pos=m_resultsTable->begin(); pos != end; ++pos){
300  m_statistics[pos->first][statisticName] = object.statistics(pos->second);
301  }
302  m_statisticNames.push_back(statisticName);
303  }
304 
305  void addStatistic(BaseStatisticsObject const& object){
306  addStatistic(object.name(),object);
307  }
308 
309  std::map<std::string,RealVector> const& operator[](Parameter const& parameter)const{
310  return m_statistics.find(parameter)->second;
311  }
312 
313  const_iterator begin()const{
314  return m_statistics.begin();
315  }
316  const_iterator end()const{
317  return m_statistics.end();
318  }
319 
320  //information about the parameter of the experiments
321  std::string const& parameterName()const{
322  return m_resultsTable->parameterName();
323  }
324 
325  std::size_t numParams()const{
326  return m_resultsTable->numParams();
327  }
328 
329  Parameter const& parameterValue(std::size_t i)const{
330  return m_resultsTable->parameterValue(i);
331  }
332 
333  //information about the names of the dimensions
334  std::size_t numDimensions()const{
335  return m_resultsTable->numDimensions();
336  }
337 
338  std::string const& dimensionName(std::size_t i)const{
339  return m_resultsTable->dimensionName(i);
340  }
341 
342  //information about the statistics
343  std::size_t numStatistics()const{
344  return m_statisticNames.size();
345  }
346  std::string const& statisticName(std::size_t i)const{
347  return m_statisticNames[i];
348  }
349 private:
350  std::vector<std::string> m_statisticNames;
351  ResultTable<Parameter> const* m_resultsTable;
352  std::map<Parameter, std::map<std::string,RealVector> > m_statistics;
353 };
354 
355 template<class Parameter>
357  //first print a legend
358  std::cout<<"# "<<statistics.parameterName();
359  for(std::size_t i = 0; i != statistics.numStatistics(); ++i){
360  for(std::size_t j = 0; j != statistics.numDimensions(); ++j){
361  std::cout<<" "<<statistics.statisticName(i)<<"-"<<statistics.dimensionName(j);
362  }
363  }
364  std::cout<<"\n";
365 
366  //print results parameter by parameter
367  for(std::size_t k = 0; k != statistics.numParams(); ++k){
368  Parameter param=statistics.parameterValue(k);
369  std::map<std::string,RealVector> paramResults=statistics[param];
370  std::cout<<param;
371  for(std::size_t i = 0; i != statistics.numStatistics(); ++i){
372  for(std::size_t j = 0; j != statistics.numDimensions(); ++j){
373  std::cout<<" "<<paramResults[statistics.statisticName(i)](j);
374  }
375  }
376  std::cout<<"\n";
377  }
378 }
379 
380 }}
381 #endif // SHARK_STATISTICS_H